In [125]:
import pandas as pd
import numpy as np
import re
import pickle
import string
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import nltk
from collections import Counter
from textstat import textstat
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK imports
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLTK downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("vader_lexicon")

# Ignore warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /Users/lila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lila/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [126]:
df = pd.read_csv('YoutubeCommentsDataSet.csv')

Wyodrębnienie zbioru walidacyjnego

In [127]:
from sklearn.model_selection import train_test_split

# split the dataset into train (85%) and validation (15%)
df, validation_df = train_test_split(df, test_size=0.15, stratify=df['Sentiment'], random_state=42)

validation_df.to_csv("validation_set.csv", index=False)

# Preprocessing

In [128]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet corpus
nltk.download('omw-1.4')  # Optional: Better word sense
nltk.download('punkt')

wnl = WordNetLemmatizer()

def preprocess_text(text):
    # Get all stopwords but preserve negations
    all_stop_words = set()
    for lang in stopwords.fileids():
        all_stop_words.update(stopwords.words(lang))  

    # Stopwords, które nie są usuwane
    negation_words = {"no", "not", "never", "none", "nowhere", "nobody", "nothing", "neither", "nor", 
                    "isn't", "wasn't", "weren't", "doesn't", "don't", "didn't", "hasn't", "haven't", 
                    "hadn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "couldn't", "mustn't"}

    # Remove stopwords except negations
    filtered_stop_words = all_stop_words - negation_words  

    words = word_tokenize(text.lower())  # Tokenize & lowercase
    filtered_words = [wnl.lemmatize(word) for word in words if word.isalnum() and word not in filtered_stop_words]  
    return ' '.join(filtered_words)  # Join words back into a sentence

[nltk_data] Downloading package wordnet to /Users/lila/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lila/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lila/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [129]:
def preprocess_data(df):
    
    df = df.dropna()

    # dropping consecutive duplicates
    df = df.loc[df['Comment'] != df['Comment'].shift()].reset_index(drop=True)

    df['Comment_preprocessed'] = df['Comment'].apply(preprocess_text)
    df['Comment_preprocessed'] = df['Comment_preprocessed'].apply(lambda x: re.sub(r'\d+', '', x))

    return df

# Funkcje do obróbki danych, inżynieria cech

In [130]:
# Funkcja do obliczania Sentiment Score za pomocą NLTK
sia = SentimentIntensityAnalyzer()
def sentiment_score(comment):
    return sia.polarity_scores(comment)['compound']

In [131]:
# Funkcja do liczenia liczby zaimków osobistych
personal_pronouns = {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves",
                     "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
                     "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
                     "theirs", "themselves"}

def count_personal_pronouns(comment):
    words = word_tokenize(comment.lower())
    return sum(1 for word in words if word in personal_pronouns)

In [132]:
# Funkcja do analizy złożoności zdań (np. liczba fraz, słów)
connectors = ["and", "but", "so", "because", "which", "that", "then", "if", "or"]
def sentence_complexity(comment):
    words = word_tokenize(comment)
    phrases = re.split(r'\b(?:' + '|'.join(connectors) + r')\b', comment)

    num_words = len(words)
    num_phrases = len(phrases)

    avg_word_length = np.mean([len(word) for word in words]) if words else 0
    avg_phrase_length = num_words / num_phrases if num_phrases > 0 else 0
    unique_word_ratio = len(set(words)) / num_words if num_words > 0 else 0

    return {
        "Number_of_phrases": num_phrases,
        "Number_of_words": num_words,
        "Avg_phrase_length": avg_phrase_length,
        "Avg_word_length": avg_word_length,
        "Unique_word_ratio": unique_word_ratio
    }


In [133]:
# Funkcja do obliczania readability score
def readability_score(comment):
    return textstat.flesch_reading_ease(comment)

In [134]:
# Funkcja do liczenia negacji
def negation_count(comment):
    negations = {"not", "never", "no", "none", "cannot", "nothing","dont","nah",'wont','cant','doesnt','shouldnt',"shouldn't","doesn't"
                 "don't","won't","wouldn't", "can't","nobody",'neither','nope',"ain't","nowhere",'wouldnt'}
    words = word_tokenize(comment.lower())
    return sum(1 for word in words if word in negations)

In [135]:
# Funkcja do liczenia rzadkich słów
def rare_word_count(comment, all_words):
    words = word_tokenize(comment.lower())
    rare_words = [word for word in words if all_words[word] < 10]  # Możemy dostosować próg (tutaj <10)
    return len(rare_words)

In [136]:
def prepare_features(df):
    # Ensure 'Comment' column is string and handle NaNs
    df['Comment'] = df['Comment'].fillna('').astype(str)

    # Add features
    df['Sentiment_Score_nltk'] = df['Comment'].apply(sentiment_score)
    df['Starts_with_i'] = df['Comment'].str.startswith("i")
    df['Comment_Length'] = df['Comment'].str.len()

    df["Personal_Pronoun_count"] = df["Comment"].apply(count_personal_pronouns)
    df["Readability_Score"] = df["Comment"].apply(readability_score)
    df["Negation_Count"] = df["Comment"].apply(negation_count)

    # Sentence complexity (assuming returns dict or Series)
    df_complexity = df["Comment"].apply(lambda x: sentence_complexity(x))
    df_complexity = pd.DataFrame(df_complexity.tolist(), index=df.index)
    df = pd.concat([df, df_complexity], axis=1)

    # Rare word counting
    all_words = Counter(' '.join(df['Comment']).lower().split())
    df['Rare_Word_Count'] = df['Comment'].apply(lambda x: rare_word_count(x, all_words))

    return df


# Pipeline

In [137]:
df = preprocess_data(df)

In [138]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
    
# Split data before creating X and y
df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)

# Create X and y after splitting
X_train = df_train.drop(columns=['Sentiment'])
y_train = df_train['Sentiment']
X_test = df_test.drop(columns=['Sentiment'])
y_test = df_test['Sentiment']

pipe = Pipeline([
    ('feature_eng', FunctionTransformer(prepare_features, validate=False)),
    ('select_text', FunctionTransformer(lambda x: x['Comment_preprocessed'], validate=False)),
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=0)),
    ('classifier', LogisticRegression())
])

# Fit and evaluate
pipe.fit(X_train, y_train)
accuracy = pipe.score(X_test, y_test)
print(f"Test accuracy: {accuracy:.4f}")


Test accuracy: 0.7053


In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE


models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB()
}


for name, model in models.items():
    print(f"\n=== {name} ===")
    
    pipe = Pipeline([
        ('feature_eng', FunctionTransformer(prepare_features, validate=False)),
        ('select_text', FunctionTransformer(lambda x: x['Comment_preprocessed'], validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('smote', SMOTE(random_state=0)),
        ('classifier', model)
    ])

    # Fit model
    pipe.fit(X_train, y_train)

    # Predict on train and test
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)

    # Evaluate on train data
    print("\n-- Train Data --")
    print(f"Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
    print(classification_report(y_train, y_train_pred))

    # Evaluate on test data
    print("-- Test Data --")
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    print(classification_report(y_test, y_test_pred))



=== Logistic Regression ===

-- Train Data --
Accuracy: 0.8819
              precision    recall  f1-score   support

    negative       0.76      0.96      0.85      1372
     neutral       0.79      0.86      0.82      2746
    positive       0.96      0.88      0.92      6805

    accuracy                           0.88     10923
   macro avg       0.84      0.90      0.86     10923
weighted avg       0.89      0.88      0.88     10923

-- Test Data --
Accuracy: 0.7053
              precision    recall  f1-score   support

    negative       0.45      0.59      0.51       614
     neutral       0.54      0.61      0.57      1184
    positive       0.87      0.77      0.82      2884

    accuracy                           0.71      4682
   macro avg       0.62      0.66      0.63      4682
weighted avg       0.73      0.71      0.72      4682


=== Random Forest ===

-- Train Data --
Accuracy: 0.9972
              precision    recall  f1-score   support

    negative       1.00     