In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load data

# https://www.kaggle.com/datasets/alyahmedts13/reddit-sentiment-analysis-dataset-for-nlp-projects
df_reddit = pd.read_csv("reddit_artist_posts_sentiment.csv")

# https://www.kaggle.com/datasets/vishakhdapat/imdb-movie-reviews
df_imdb = pd.read_csv("IMDB-Dataset.csv")

# https://www.kaggle.com/datasets/advaypatil/youtube-statistics
df_comments = pd.read_csv("comments.csv")

In [3]:
X, y = [], []

# 1. Reddit

In [5]:
df_reddit.head()

Unnamed: 0,text,label
0,pitchfork track review: taylor swift’s “actual...,negative
1,taylor swift has regained the masters of her f...,positive
2,pitchfork review: taylor swift - the life of a...,neutral
3,taylor swift announced engagement,neutral
4,taylor swift - the fate of ophelia (official m...,neutral


In [6]:
# Keep the necessary cols
X += df_reddit["text"].tolist()
y += df_reddit["label"].tolist()

# 2. IMDB

In [8]:
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# Keep the necessary cols
X += df_imdb["review"].tolist()
y += df_imdb["sentiment"].tolist()

# 3. Comments

In [11]:
df_comments.head()

Unnamed: 0.1,Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0


In [12]:
df_comments["Sentiment"] = df_comments["Sentiment"].map({
    0.0: "negative",
    1.0: "neutral",
    2.0: "positive"
})
df_comments.head()

Unnamed: 0.1,Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,neutral
1,1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,negative
2,2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,positive
3,3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,negative
4,4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,positive


In [13]:
# Keep the necessary cols
X += df_comments["Comment"].tolist()
y += df_comments["Sentiment"].tolist()

# 4. Create DF

In [15]:
df = pd.DataFrame({"X": X, "y": y})
df = df.dropna()

In [16]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply before creating X, y
df["X"] = df["X"].apply(preprocess_text)

# 5. Train

In [18]:
# Split dataset in training set and testing set
X_train, X_test, y_train, y_test = train_test_split(df["X"], df["y"], test_size=0.1, random_state=42, stratify=df["y"])

In [19]:
from sklearn.pipeline import FeatureUnion

# Combine word-level and character-level TF-IDF features
# Word n-grams capture semantic context, while character n-grams
# help with handling misspellings and subword patterns
vectorizer = FeatureUnion([
    ("word", TfidfVectorizer(ngram_range=(1,2))),
    ("char", TfidfVectorizer(analyzer="char", ngram_range=(1,2)))
])


In [20]:
model = Pipeline([
    ("vec", vectorizer),
    ("clf", LinearSVC(class_weight="balanced", random_state=42))
])

In [21]:
model.fit(X_train, y_train)



In [22]:
y_pred = model.predict(X_test)

In [23]:
print("Accuacy: ", accuracy_score(y_pred, y_test))
print("F1: ", classification_report(y_test, y_pred))
print("Confusion matrix: ", confusion_matrix(y_test, y_pred))

Accuacy:  0.8725587883618972
F1:                precision    recall  f1-score   support

    negative       0.89      0.85      0.87      3073
     neutral       0.82      0.87      0.84      2437
    positive       0.89      0.89      0.89      4526

    accuracy                           0.87     10036
   macro avg       0.87      0.87      0.87     10036
weighted avg       0.87      0.87      0.87     10036

Confusion matrix:  [[2609  195  269]
 [ 103 2126  208]
 [ 227  277 4022]]


# 6. Save

In [25]:
import joblib

In [26]:
joblib.dump(model, "sentiment_classifier.pkl")

['sentiment_classifier.pkl']