In [1]:
# train_classical.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import joblib
import re
from tqdm import tqdm  # Add this import

In [2]:
# 1. Load your dataset
df = pd.read_csv("youtube-comments-sentiment.csv")
df = df.sample(frac=0.3, random_state=42).reset_index(drop=True)
comments = df["CommentText"].tolist()
labels = df["Sentiment"].map({"Negative":0, "Neutral":1, "Positive":2}).tolist()

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    stops = set(stopwords.words('english'))
    stops.remove('not')  # keep 'not' for sentiment
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stops]
    return ' '.join(tokens)

clean_comments = [preprocess(comment) for comment in tqdm(comments, desc="Cleaning comments")]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\natur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Cleaning comments: 100%|██████████| 309668/309668 [01:30<00:00, 3432.65it/s]


In [4]:
# # 2. Preprocess (same as your BERT preprocessing)
# def preprocess(text):
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
#     return text

# comments_clean = [preprocess(c) for c in comments]

In [5]:
# 3. Vectorize with TF-IDF (better than CountVectorizer)
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X = vectorizer.fit_transform(clean_comments)
y = labels

In [6]:
# from nltk.sentiment import SentimentIntensityAnalyzer
# from scipy.sparse import hstack, csr_matrix
# nltk.download('vader_lexicon')

# sia = SentimentIntensityAnalyzer()

# def add_sentiment_features(text):
#     scores = sia.polarity_scores(text)
#     return [scores['pos'], scores['neg'], scores['neu']]

# # Sentiment features (on original text)
# X_extra = [add_sentiment_features(t) for t in tqdm(comments, desc="Adding sentiment scores")]

# # Combine features
# X_combined = hstack([X, csr_matrix(X_extra)])

In [7]:

# # Load the vectorizer
# vectorizer = joblib.load("classical_models/tfidf_vectorizer.pkl")

# X = vectorizer.transform(comments_clean)
# y = labels


In [8]:
# # 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [9]:
# # 4. Train/Test Split
# X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y)

In [10]:
# Saving the vectorizer
joblib.dump(vectorizer, "classical_models/tfidf_vectorizer.pkl")

joblib.dump(X_train, "classical_models/X_train.pkl")
joblib.dump(X_test, "classical_models/X_test.pkl")
joblib.dump(y_train, "classical_models/y_train.pkl")
joblib.dump(y_test, "classical_models/y_test.pkl")  


['classical_models/y_test.pkl']

In [11]:
# Loading the splitted data
X_train = joblib.load("classical_models/X_train.pkl")
X_test = joblib.load("classical_models/X_test.pkl") 
y_train = joblib.load("classical_models/y_train.pkl")
y_test = joblib.load("classical_models/y_test.pkl")

Naive Bayes

In [53]:
model=GaussianNB()
# model = joblib.load("classical_models/NaiveBayes_model.pkl")

print(f"\nTraining Naive Bayes...")
model.fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)


print(f"  Accuracy: {acc}")
print(f"  Macro-F1: {f1}")
print(f"  Confusion Matrix:\n{cm}\n")

joblib.dump(model, f"classical_models/NaiveBayes_model.pkl")


Training Naive Bayes...
  Accuracy: 0.5282720315174217
  Macro-F1: 0.5207781338847075
  Confusion Matrix:
[[10946  4396  5440]
 [ 5091  7685  7792]
 [ 3476  3021 14087]]



['classical_models/NaiveBayes_model.pkl']

SDGclassifier

In [13]:
model=SGDClassifier(loss='log_loss', max_iter=1000)
# model = joblib.load("classical_models/SGDClassifier_model.pkl")


print(f"\nTraining SDGclassifier...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)


print(f"  Accuracy: {acc}")
print(f"  Macro-F1: {f1}")
print(f"  Confusion Matrix:\n{cm}\n")

joblib.dump(model, f"classical_models/SGDClassifier_model.pkl")



Training SDGclassifier...
  Accuracy: 0.5955210385248814
  Macro-F1: 0.5976096475214091
  Confusion Matrix:
[[12258  6326  2198]
 [ 5212 12357  2999]
 [ 3525  4791 12268]]



['classical_models/SGDClassifier_model.pkl']

Log Regression

In [55]:
model=LogisticRegression(max_iter=1000)
# model = joblib.load("classical_models/LogRegress_model.pkl")


print(f"\nTraining Logistic Regression...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)


print(f"  Accuracy: {acc}")
print(f"  Macro-F1: {f1}")
print(f"  Confusion Matrix:\n{cm}\n")

joblib.dump(model, f"classical_models/LogRegress_model.pkl")




Training Logistic Regression...
  Accuracy: 0.6173022895340201
  Macro-F1: 0.6189651441072018
  Confusion Matrix:
[[12862  5715  2205]
 [ 4917 12598  3053]
 [ 3145  4667 12772]]



['classical_models/LogRegress_model.pkl']

Decision Trees

In [None]:
model=DecisionTreeClassifier( max_depth=15, min_samples_leaf=10)
# model = joblib.load("classical_models/DecisionTrees_model.pkl")


print(f"\nTraining Decision Trees...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)


print(f"  Accuracy: {acc}")
print(f"  Macro-F1: {f1}")
print(f"  Confusion Matrix:\n{cm}\n")

joblib.dump(model, f"classical_models/DecisionTrees_model.pkl")



Training Decision Trees...
  Accuracy: 0.5788904317499274
  Macro-F1: 0.5808441541711924
  Confusion Matrix:
[[11810  7105  1867]
 [ 4707 13214  2647]
 [ 3432  6323 10829]]



['classical_models/DecisionTrees_model.pkl']

Random Forest

In [59]:
model=RandomForestClassifier(n_estimators=50,  # Reduce from 100
    max_depth=15,     # Limit tree depth
    min_samples_leaf=10,
    n_jobs=-1         # Parallelize)
)
# model = joblib.load("classical_models/Randomforest_model.pkl")



print(f"\nTraining Random Forest...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)


print(f"  Accuracy: {acc}")
print(f"  Macro-F1: {f1}")
print(f"  Confusion Matrix:\n{cm}\n")

joblib.dump(model, f"classical_models/Randomforest_model.pkl")



Training Random Forest...
  Accuracy: 0.5796331578777408
  Macro-F1: 0.5816186013185404
  Confusion Matrix:
[[11656  7015  2111]
 [ 4447 12587  3534]
 [ 3415  5513 11656]]



['classical_models/Randomforest_model.pkl']