# Import necessary libraries

In [1]:
import pandas as pd

#used for working with human language data
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [20]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : {'filename', 'file', 'content'}, default='content'
 |      - If `'filename'`, the sequence passed as an argument to fit is
 |        expected to be a list of filenames that need reading to fetc

In [2]:
# Step 1: Load the dataset (already cleaned it)
df = pd.read_csv('/kaggle/input/youtube-comments-dataset/YoutubeCommentsDataSet.csv')
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [5]:
# Drop rows from the DataFrame 'df' where the 'Comment' column has missing (NaN) values
df = df.dropna(subset=['Comment'])

In [7]:
# Replace 'neg' with 0, 'pos' with 1, and 'nue' with 2
df['Sentiment'] = df['Sentiment'].replace({'negative': 0, 'neutral': 1, 'positive': 2})

df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,1
1,here in nz 50 of retailers don’t even have con...,0
2,i will forever acknowledge this channel with t...,2
3,whenever i go to a place that doesn’t take app...,0
4,apple pay is so convenient secure and easy to ...,2


In [8]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Example usage
df['Comment'] = df['Comment'].apply(remove_stopwords)
df.head()

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay 2014 required brand new ...,1
1,nz 50 retailers don’t even contactless credit ...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2


In [13]:
# Step 2: Split the data into features (X) and target (y)
X = df['Comment']  
y = df['Sentiment']

In [14]:
# Step 3: Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Step 4: Convert text data into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000) 
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:
# Step 5: Initialize models
models = {
    "Logistic Regression": LogisticRegression(multi_class='ovr', max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "Naive Bayes": MultinomialNB()
}

In [19]:
# Step 6: Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_tfidf, y_train)
    
    # Predict the sentiments for the test set
    y_pred = model.predict(X_test_tfidf)
    
    # Evaluate the model
    print(f"Accuracy Score for {model_name}: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")


Training Logistic Regression...
Accuracy Score for Logistic Regression: 0.7495235502314185
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.63      0.26      0.37       441
           1       0.63      0.55      0.59       912
           2       0.79      0.92      0.85      2320

    accuracy                           0.75      3673
   macro avg       0.69      0.58      0.60      3673
weighted avg       0.73      0.75      0.73      3673


Training Random Forest...
Accuracy Score for Random Forest: 0.732643615573101
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.59      0.18      0.27       441
           1       0.63      0.56      0.59       912
           2       0.77      0.91      0.83      2320

    accuracy                           0.73      3673
   macro avg       0.66      0.55      0.57      3673
weighted avg       0.71      0.73 