In [68]:
import pandas as pd
import spacy
import random

In [81]:
df = pd.read_csv('twitter_training.csv')

In [82]:
df.shape

(74681, 4)

In [83]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [85]:
df.columns =['Tweet ID', 'Platform', 'Sentiments', 'Comments']
df.head()

Unnamed: 0,Tweet ID,Platform,Sentiments,Comments
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Tweet ID    74681 non-null  int64 
 1   Platform    74681 non-null  object
 2   Sentiments  74681 non-null  object
 3   Comments    73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [87]:
df.isna().sum()

Tweet ID        0
Platform        0
Sentiments      0
Comments      686
dtype: int64

In [88]:
df = df.dropna(how='any',axis=0)
df.isna().sum()

Tweet ID      0
Platform      0
Sentiments    0
Comments      0
dtype: int64

In [89]:
df['Sentiments'].value_counts()

Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: Sentiments, dtype: int64

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73995 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Tweet ID    73995 non-null  int64 
 1   Platform    73995 non-null  object
 2   Sentiments  73995 non-null  object
 3   Comments    73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [91]:
df['sentiment_code'] = df['Sentiments'].map({'Negative': 0, 'Positive': 1, 'Neutral': 2, 'Irrelevant': 3})
df.head()

Unnamed: 0,Tweet ID,Platform,Sentiments,Comments,sentiment_code
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,1
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,1
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,1
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,1
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,1


In [92]:
nlp = spacy.load("en_core_web_lg")
def preprocess(text):
    doc = nlp(text)
    words = []
    for word in doc:
        if word.is_stop or word.is_punct:
            continue
        words.append(word.lemma_)
    return ' '.join(words)

In [93]:
df['preprocessed_comment'] = df['Comments'].apply(preprocess)

### Handling class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)

In [71]:
X = df['preprocessed_comment']
y = df['sentiment_code']
y.value_counts()

0    22358
1    20654
2    18108
3    12875
Name: sentiment_code, dtype: int64

0    1
1    1
2    1
3    1
4    1
Name: sentiment_code, dtype: int64

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_comment'], 
    df['sentiment_code'], 
    test_size = 0.2,
    random_state=1234,
    stratify=df.sentiment_code
)

### Random Forest Classifier with Count Vectorizer

In [69]:
clf = Pipeline([
    ('count_vec', CountVectorizer(ngram_range=(1, 2))),
    ('random_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      4472
           1       0.84      0.94      0.89      4131
           2       0.93      0.88      0.91      3621
           3       0.97      0.82      0.89      2575

    accuracy                           0.90     14799
   macro avg       0.91      0.89      0.90     14799
weighted avg       0.91      0.90      0.90     14799



### Multinomial Naive Bayes with Count Vectorizer

In [46]:
clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      4472
           1       0.87      0.89      0.88      4131
           2       0.92      0.85      0.88      3621
           3       0.97      0.82      0.89      2575

    accuracy                           0.88     14799
   macro avg       0.90      0.88      0.88     14799
weighted avg       0.89      0.88      0.88     14799



### RandomForestClassifier with TF-IDF Vectorizer

In [47]:
clf = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('rf_class', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4472
           1       0.85      0.95      0.90      4131
           2       0.94      0.89      0.91      3621
           3       0.96      0.85      0.91      2575

    accuracy                           0.91     14799
   macro avg       0.92      0.91      0.91     14799
weighted avg       0.92      0.91      0.91     14799



### Multinomial Naive Bayes with TF-IDF Vectorizer

In [48]:
clf = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('multi_nb', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.90      0.76      4472
           1       0.71      0.81      0.76      4131
           2       0.84      0.64      0.73      3621
           3       0.95      0.46      0.62      2575

    accuracy                           0.73     14799
   macro avg       0.79      0.70      0.71     14799
weighted avg       0.77      0.73      0.73     14799

