# Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

In [2]:
# Passing `names` makes pandas treat the first row of each CSV as data, not a header.
columns = ['id', 'company', 'Label', 'Text']
train_df, test_df = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (
        '/content/drive/MyDrive/NLP/twitter_training.csv',
        '/content/drive/MyDrive/NLP/twitter_validation.csv',
    )
)

In [3]:
# Discard every training row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

In [4]:
# Summarise the training frame after the NaN filter: row count, dtypes, non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       73996 non-null  int64 
 1   company  73996 non-null  object
 2   Label    73996 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [5]:
# Apply the same missing-value filter to the validation frame, then summarise what remains.
test_df = test_df.dropna(axis=0, how='any')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   company  1000 non-null   object
 2   Label    1000 non-null   object
 3   Text     1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB


In [6]:
# Class balance: number of training rows for each value of the `Label` column.
train_df['Label'].value_counts()

Label
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

# Preprocessing

In [7]:
# Load the spaCy pipeline once at module level; `preprocess` below reuses it for every text.
nlp = spacy.load("en_core_web_sm")

In [8]:
def preprocess(text: str) -> str:
    """Reduce a raw text to its space-joined content-word lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are discarded; every
    remaining token contributes its lemma.

    Args:
        text: Raw input string.

    Returns:
        The kept lemmas joined by single spaces (an empty string when
        nothing is kept).
    """
    doc = nlp(text)
    # spaCy emits separate tokens for extra whitespace (double spaces, newlines).
    # They are neither stop words nor punctuation, so they must be dropped
    # explicitly or their whitespace "lemmas" end up in the joined output.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_lemmas)

In [9]:
# Clean every training text with `preprocess` and store the results as a new column.
train_df['Preprocessed Text'] = [preprocess(raw_text) for raw_text in train_df['Text']]

In [10]:
# Display the frame to compare the raw `Text` with the new `Preprocessed Text` column.
train_df

Unnamed: 0,id,company,Label,Text,Preprocessed Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [11]:
# Learn the label -> integer mapping, then overwrite the `Label` column with the codes.
# NOTE: this cell replaces its own input, so re-running it would re-fit the encoder
# on the already-encoded column.
label_en = LabelEncoder().fit(train_df['Label'])
train_df['Label'] = label_en.transform(train_df['Label'])

In [12]:
# Display the frame again: `Label` now holds the encoder's integer codes.
train_df

Unnamed: 0,id,company,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,3,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,3,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,3,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,3,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


## Train/test split

In [13]:
# Hold out 20% of the training frame, stratified on the encoded label, with a fixed seed.
# NOTE(review): the displayed rows show several near-identical texts sharing one `id`;
# a purely random split may put such variants on both sides - consider a group-aware split.
features = train_df['Preprocessed Text']
targets = train_df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=64,
    stratify=targets,
)

In [14]:
# Report the size of each side of the split.
for caption, split in (("X_train shape: ", X_train), ("X_test shape: ", X_test)):
    print(caption, split.shape)

X_train shape:  (59196,)
X_test shape:  (14800,)


# ML Model

In [15]:
# TF-IDF features feeding a random forest.
# NOTE: despite the step name 'vectorizer_tri_grams', TfidfVectorizer() is left at its
# defaults (ngram_range=(1, 1)), so no tri-grams are produced. The step names are kept
# unchanged because they are the pipeline's lookup keys.
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    # Seed the forest so repeated fits yield the same model and the same metrics.
    ('random forest', RandomForestClassifier(random_state=64)),
])

In [16]:
# Fit the vectorizer and the classifier on the training side of the split only.
clf.fit(X_train, y_train)

In [17]:
# Predict encoded labels for the held-out split.
y_pred = clf.predict(X_test)

In [18]:
# Overall accuracy on the held-out split.
print(accuracy_score(y_test, y_pred))

0.9110135135135136


In [19]:
# Per-class precision / recall / F1; the row labels are the encoder's integer codes.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2575
           1       0.93      0.93      0.93      4472
           2       0.94      0.89      0.91      3622
           3       0.85      0.95      0.90      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.90      0.91     14800
weighted avg       0.91      0.91      0.91     14800



# Test Model

In [20]:
# Display the validation frame; its `Label` column still holds the original label strings.
test_df

Unnamed: 0,id,company,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [36]:
# Integer code -> label name, in the order LabelEncoder assigns codes (sorted label
# values): 0 = Irrelevant, 1 = Negative, 2 = Neutral, 3 = Positive.
# This list must stay identical to `label_en.classes_`; the previous version had
# 'Negative' and 'Neutral' swapped and 'Neutral' misspelled as 'Natural'.
classes = ['Irrelevant', 'Negative', 'Neutral', 'Positive']

1. Test: validation row 7

In [21]:
# Pick validation row 7 and show its raw text next to its annotated label.
t_text = test_df.loc[7, 'Text']
print(f"{t_text} ===> {test_df.loc[7, 'Label']}")

Rocket League, Sea of Thieves or Rainbow Six: Siege🤔? I love playing all three on stream but which is the best? #stream #twitch #RocketLeague #SeaOfThieves #RainbowSixSiege #follow ===> Positive


In [22]:
# Clean the single text and wrap it in a one-element list before handing it to the pipeline.
t_text_prepro = [preprocess(t_text)]
t_text_prepro

['Rocket League Sea Thieves Rainbow siege 🤔 love play stream good stream twitch RocketLeague seaofthieve rainbowsixsiege follow']

In [23]:
# NOTE: this rebinds `t_text` from the raw string to the array of predicted label codes.
t_text = clf.predict(t_text_prepro)

In [37]:
# Compare the annotated label with the name looked up for the predicted code.
print(
    f"True Label: {test_df['Label'][7]}",
    f'Predict Label: {classes[t_text[0]]}',
    sep="\n",
)

True Label: Positive
Predict Label: Positive


2. Test: validation row 59

In [32]:
# Pick validation row 59 and show its raw text next to its annotated label.
t1_text = test_df.loc[59, 'Text']
print(f"{t1_text} ===> {test_df.loc[59, 'Label']}")

Ok I'm blocking this man's he is on a new level of being ===> Irrelevant


In [33]:
# Clean the single text and wrap it in a one-element list before handing it to the pipeline.
t1_text_prepro = [preprocess(t1_text)]
t1_text_prepro

['ok block man new level']

In [34]:
# NOTE: this rebinds `t1_text` from the raw string to the array of predicted label codes.
t1_text = clf.predict(t1_text_prepro)

In [35]:
# Compare the annotated label with the name looked up for the predicted code.
print(
    f"True Label: {test_df['Label'][59]}",
    f'Predict Label: {classes[t1_text[0]]}',
    sep="\n",
)

True Label: Irrelevant
Predict Label: Irrelevant
