In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re  # Import regex library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Download stopwords (run once)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
df = pd.read_csv('YoutubeCommentsDataSet.csv')

In [3]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [4]:
df.shape

(18408, 2)

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.dropna(inplace=True)

In [7]:
def preprocess_text(text):
    # Remove punctuation using regex
    text = re.sub(r'[^\w\s]', '', text)  # Keep only alphanumeric and whitespace
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    return ' '.join(stemmed_words)


In [8]:
df['Comment_new'] = df['Comment'].apply(preprocess_text)

In [9]:
df['Comment_new']

Unnamed: 0,Comment_new
0,let forget appl pay 2014 requir brand new ipho...
1,nz 50 retail dont even contactless credit card...
2,forev acknowledg channel help lesson idea expl...
3,whenev go place doesnt take appl pay doesnt ha...
4,appl pay conveni secur easi use use korean jap...
...,...
18403,realli like point engin toolbox think that lot...
18404,ive start explor field realli good remind get ...
18405,excelent video con una pregunta filosófica pro...
18406,hey daniel discov channel coupl day ago im lea...


In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Sentiment_encoded'] = label_encoder.fit_transform(df['Sentiment'])

In [11]:
df['Sentiment_encoded']

Unnamed: 0,Sentiment_encoded
0,1
1,0
2,2
3,0
4,2
...,...
18403,2
18404,2
18405,1
18406,2


In [12]:
X = df['Comment_new']
y = df['Sentiment_encoded']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
vectorizer = CountVectorizer(max_features=5000)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [22]:
rf_model = RandomForestClassifier(
    n_estimators=100,        # Fewer trees
    class_weight='balanced', # Handle class imbalance
    random_state=42
)
rf_model.fit(X_train_counts, y_train)

In [None]:
from sklearn.svm import SVC

model = SVC()
model.fit(X_train_counts, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train_counts, y_train)

In [23]:
train_score = rf_model.score(X_train_counts, y_train)
test_score = rf_model.score(X_test_counts, y_test)
print(f"Training Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")

Training Accuracy: 0.9957
Test Accuracy: 0.7256


In [24]:
def make_predictions(text):
    cleaned_text = preprocess_text(text)
    text_counts = vectorizer.transform([cleaned_text])
    prediction = rf_model.predict(text_counts)
    predicted_label = label_encoder.inverse_transform(prediction)[0]
    print("News category is:", predicted_label)

In [28]:
make_predictions('i like it very much')

News category is: positive


In [29]:
from sklearn.metrics import classification_report, confusion_matrix

# After training the model
y_pred = rf_model.predict(X_test_counts)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.27      0.36       456
           1       0.65      0.49      0.56       906
           2       0.76      0.92      0.83      2213

    accuracy                           0.73      3575
   macro avg       0.65      0.56      0.58      3575
weighted avg       0.70      0.73      0.70      3575

Confusion Matrix:
 [[ 123   76  257]
 [  82  442  382]
 [  26  158 2029]]


In [21]:
import pickle

# Save the CountVectorizer
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the LabelEncoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save the Random Forest model
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)