In [25]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score


In [26]:
import re
import pickle
import string
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import nltk
from collections import Counter
from textstat import textstat
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK imports
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLTK downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("vader_lexicon")

# Ignore warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /Users/lila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lila/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [27]:
df = pd.read_csv('YoutubeCommentsDataSet.csv')

In [28]:
from sklearn.model_selection import train_test_split

# split the dataset into train (85%) and validation (15%)
df, validation_df = train_test_split(df, test_size=0.15, stratify=df['Sentiment'], random_state=42)
df, test_df = train_test_split(df, test_size=0.1275, stratify=df['Sentiment'], random_state=42)
validation_df.to_csv("validation_set.csv", index=False)

TF-IDF Embedding

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=3000,        # Tune this depending on dataset size
    ngram_range=(1, 2),       # Unigrams + bigrams capture short phrases
    stop_words='english',     # Optional — experiment with and without
    sublinear_tf=True,        # Log scale term frequency
    norm='l2'                 # Normalize each vector
)

In [30]:
class TfidfEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True
        )

    def fit(self, X, y=None):
        self.vectorizer.fit(X['Comment_preprocessed'])
        return self


    def transform(self, X):
        return self.vectorizer.transform(X['Comment_preprocessed'])


Inżynieria cech

In [31]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet corpus
nltk.download('omw-1.4')  # Optional: Better word sense
nltk.download('punkt')

wnl = WordNetLemmatizer()

def preprocess_text(text):
    # Get all stopwords but preserve negations
    all_stop_words = set()
    for lang in stopwords.fileids():
        all_stop_words.update(stopwords.words(lang))  

    # Stopwords, które nie są usuwane
    negation_words = {"no", "not", "never", "none", "nowhere", "nobody", "nothing", "neither", "nor", 
                    "isn't", "wasn't", "weren't", "doesn't", "don't", "didn't", "hasn't", "haven't", 
                    "hadn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "couldn't", "mustn't"}

    # Remove stopwords except negations
    filtered_stop_words = all_stop_words - negation_words  

    words = word_tokenize(text.lower())  # Tokenize & lowercase
    filtered_words = [wnl.lemmatize(word) for word in words if word.isalnum() and word not in filtered_stop_words]  
    return ' '.join(filtered_words)  # Join words back into a sentence

[nltk_data] Downloading package wordnet to /Users/lila/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lila/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
def preprocess_data(df):
    
    df = df.dropna()

    # dropping consecutive duplicates
    df = df.loc[df['Comment'] != df['Comment'].shift()].reset_index(drop=True)

    df['Comment_preprocessed'] = df['Comment'].apply(preprocess_text)
    df['Comment_preprocessed'] = df['Comment_preprocessed'].apply(lambda x: re.sub(r'\d+', '', x))

    return df

In [33]:
# Funkcja do obliczania Sentiment Score za pomocą NLTK
sia = SentimentIntensityAnalyzer()
def sentiment_score(comment):
    return sia.polarity_scores(comment)['compound']

In [34]:
# Funkcja do liczenia liczby zaimków osobistych
personal_pronouns = {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves",
                     "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
                     "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
                     "theirs", "themselves"}

def count_personal_pronouns(comment):
    words = word_tokenize(comment.lower())
    return sum(1 for word in words if word in personal_pronouns)

In [35]:
# Funkcja do analizy złożoności zdań (np. liczba fraz, słów)
connectors = ["and", "but", "so", "because", "which", "that", "then", "if", "or"]
def sentence_complexity(comment):
    words = word_tokenize(comment)
    phrases = re.split(r'\b(?:' + '|'.join(connectors) + r')\b', comment)

    num_words = len(words)
    num_phrases = len(phrases)

    avg_word_length = np.mean([len(word) for word in words]) if words else 0
    avg_phrase_length = num_words / num_phrases if num_phrases > 0 else 0
    unique_word_ratio = len(set(words)) / num_words if num_words > 0 else 0

    return {
        "Number_of_phrases": num_phrases,
        "Number_of_words": num_words,
        "Avg_phrase_length": avg_phrase_length,
        "Avg_word_length": avg_word_length,
        "Unique_word_ratio": unique_word_ratio
    }


In [36]:
# Funkcja do obliczania readability score
def readability_score(comment):
    return textstat.flesch_reading_ease(comment)

In [37]:
# Funkcja do liczenia negacji
def negation_count(comment):
    negations = {"not", "never", "no", "none", "cannot", "nothing","dont","nah",'wont','cant','doesnt','shouldnt',"shouldn't","doesn't"
                 "don't","won't","wouldn't", "can't","nobody",'neither','nope',"ain't","nowhere",'wouldnt'}
    words = word_tokenize(comment.lower())
    return sum(1 for word in words if word in negations)

In [38]:
# Funkcja do liczenia rzadkich słów
def rare_word_count(comment, all_words):
    words = word_tokenize(comment.lower())
    rare_words = [word for word in words if all_words[word] < 10]  # Możemy dostosować próg (tutaj <10)
    return len(rare_words)

In [39]:
class CustomFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy() 
        
        comments = X['Comment']
        features = pd.DataFrame()

        comments = comments.fillna('').astype(str)

        features['Sentiment_Score_nltk'] = comments.apply(sentiment_score)
        features['Starts_with_i'] = comments.str.startswith("i")
        features['Comment_Length'] = comments.str.len()

        features["Personal_Pronoun_count"] = comments.apply(count_personal_pronouns)
        features["Readability_Score"] = comments.apply(readability_score)
        features["Negation_Count"] = comments.apply(negation_count)

        df_complexity = comments.apply(lambda x: sentence_complexity(x))
        df_complexity = pd.DataFrame(df_complexity.tolist(), index=X.index)
        features = pd.concat([features, df_complexity], axis=1)

        all_words = Counter(' '.join(comments).lower().split())
        features['Rare_Word_Count'] = comments.apply(lambda x: rare_word_count(x, all_words))

        return features.to_numpy()


Pipeline

In [40]:
df = preprocess_data(df)

separate pipelines, to comparee in the final pipeline (if embeddings help, etc...)

In [41]:
text_pipeline = Pipeline([
    ('tfidf', TfidfEmbedding())
])

custom_feat_pipeline = Pipeline([
    ('custom', CustomFeatureExtractor()),
    ('scaler', StandardScaler())
])

combined_features = FeatureUnion([
    ('text', text_pipeline),
    ('custom', custom_feat_pipeline)
])

In [43]:
df.head()

Unnamed: 0,Comment,Sentiment,Comment_preprocessed
0,i remember this but also the tutorial map for ...,positive,remember tutorial map ps map iconic
1,gabriel jesus has seriously fit into this squa...,positive,gabriel jesus fit squad glove goal assist amaz...
2,this movie is going to be one of the best marv...,positive,movie marvel movie
3,just got home from work and listen to your cha...,positive,home work listen channel music relieve stress
4,bruh i had this on for 18 mins and couldnt sle...,negative,bruh min sleep sleep schedule wack


In [44]:
X_raw = df[['Comment', 'Comment_preprocessed']]
y = df['Sentiment']

In [45]:
#X_features = combined_features.fit_transform(X_raw)

In [46]:
#X_features.shape

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)


In [48]:
combined_features.fit(X_train)

In [50]:
X_train_features = combined_features.transform(X_train)
X_test_features = combined_features.transform(X_test)


In [51]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K Nearest Neighbours': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'LightGBM': LGBMClassifier()
}

In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

results = []

for name, model in models.items():
    print(f"\n{name}")
    
    model.fit(X_train_features, y_train)
    y_pred = model.predict(X_test_features)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    results.append({
        'Model': name,
        'Accuracy': acc
    })

# Show all results in a table
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
print("\nSummary:")
print(results_df.to_string(index=False))



Logistic Regression
Accuracy: 0.7529
Classification Report:
              precision    recall  f1-score   support

    negative       0.64      0.42      0.50       355
     neutral       0.58      0.64      0.61       684
    positive       0.85      0.87      0.86      1685

    accuracy                           0.75      2724
   macro avg       0.69      0.64      0.66      2724
weighted avg       0.75      0.75      0.75      2724


K Nearest Neighbours
Accuracy: 0.6964
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.43      0.46       355
     neutral       0.51      0.59      0.55       684
    positive       0.82      0.80      0.81      1685

    accuracy                           0.70      2724
   macro avg       0.61      0.60      0.61      2724
weighted avg       0.70      0.70      0.70      2724


Support Vector Machine
Accuracy: 0.7353
Classification Report:
              precision    recall  f1-score   suppo