In [6]:
## importing the dependancies
import pandas as pd 
import numpy as np 
import spacy

In [7]:
## reading dataset
data = pd.read_csv('twitter_training.csv')
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [8]:
data.shape

(74681, 4)

In [9]:
data.drop_duplicates()
data.shape

(74681, 4)

In [10]:
## removing unwanted columns
data.drop(columns=['2401'],inplace=True)

In [11]:
data.columns

Index(['Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [12]:
## renaming columns
data.rename(columns={'Borderlands':'Game_name',
                     'Positive':'sentiments',
                     'im getting on borderlands and i will murder you all ,':'tweets'},inplace=True)
data.head()

Unnamed: 0,Game_name,sentiments,tweets
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


In [13]:
## checking for the null values
data.isnull().sum()

Game_name       0
sentiments      0
tweets        686
dtype: int64

In [14]:
## deleting all null values
data=data.dropna()
data.isnull().sum()

Game_name     0
sentiments    0
tweets        0
dtype: int64

In [15]:
data.sentiments.value_counts()

sentiments
Negative      22358
Positive      20654
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [16]:
data.Game_name.nunique()

32

In [20]:
import pickle
from sklearn.preprocessing import OneHotEncoder
Game_encode = OneHotEncoder(sparse_output=False)

game_encode = Game_encode.fit_transform(data[['Game_name']])
game_df = pd.DataFrame(game_encode, columns=Game_encode.get_feature_names_out())
print(game_df.shape)

with open('game_encoder.pkl','wb') as f:
    pickle.dump(Game_encode,f)
game_df.head()

(73995, 32)


Unnamed: 0,Game_name_Amazon,Game_name_ApexLegends,Game_name_AssassinsCreed,Game_name_Battlefield,Game_name_Borderlands,Game_name_CS-GO,Game_name_CallOfDuty,Game_name_CallOfDutyBlackopsColdWar,Game_name_Cyberpunk2077,Game_name_Dota2,...,Game_name_Overwatch,Game_name_PlayStation5(PS5),Game_name_PlayerUnknownsBattlegrounds(PUBG),Game_name_RedDeadRedemption(RDR),Game_name_TomClancysGhostRecon,Game_name_TomClancysRainbowSix,Game_name_Verizon,Game_name_WorldOfCraft,Game_name_Xbox(Xseries),Game_name_johnson&johnson
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
## preprocessing tweets column
nlp = spacy.load('en_core_web_sm')

## function to preprocess
def lemmatization(text):
    doc = nlp(text)
    lemm_ls = [word.lemma_ for word in doc]
    return ' '.join(lemm_ls)

In [15]:
data['lemma'] = data['tweets'].apply(lemmatization)
data.head()

Unnamed: 0,Game_name,sentiments,tweets,lemma
0,Borderlands,Positive,I am coming to the borders and I will kill you...,"I be come to the border and I will kill you all ,"
1,Borderlands,Positive,im getting on borderlands and i will kill you ...,"I m get on borderland and I will kill you all ,"
2,Borderlands,Positive,im coming on borderlands and i will murder you...,I m come on borderland and I will murder you a...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,I m get on borderland 2 and I will murder you ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...,I m get into borderland and I can murder you a...


In [16]:
## remove stopwords
def rm_stop(text):
    doc = nlp(text)
    ns = [word.text for word in doc if not word.is_stop and not word.is_punct]
    return " ".join(ns)

In [17]:
data['final'] = data.lemma.apply(rm_stop)

In [18]:
data.head()

Unnamed: 0,Game_name,sentiments,tweets,lemma,final
0,Borderlands,Positive,I am coming to the borders and I will kill you...,"I be come to the border and I will kill you all ,",come border kill
1,Borderlands,Positive,im getting on borderlands and i will kill you ...,"I m get on borderland and I will kill you all ,",m borderland kill
2,Borderlands,Positive,im coming on borderlands and i will murder you...,I m come on borderland and I will murder you a...,m come borderland murder
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,I m get on borderland 2 and I will murder you ...,m borderland 2 murder
4,Borderlands,Positive,im getting into borderlands and i can murder y...,I m get into borderland and I can murder you a...,m borderland murder


In [19]:
data.drop(columns=['Game_name','tweets'],inplace= True)

In [20]:
data.drop(columns=['lemma'],inplace= True)

In [21]:
data.head()

Unnamed: 0,sentiments,final
0,Positive,come border kill
1,Positive,m borderland kill
2,Positive,m come borderland murder
3,Positive,m borderland 2 murder
4,Positive,m borderland murder


In [22]:
data.shape

(73995, 2)

In [23]:
data.reset_index(drop=True,inplace=True)
game_df.reset_index(drop=True,inplace=True)

In [24]:
senti_df = pd.concat([data,game_df],axis=1)
print(senti_df.shape)
senti_df.head()

(73995, 34)


Unnamed: 0,sentiments,final,Game_name_Amazon,Game_name_ApexLegends,Game_name_AssassinsCreed,Game_name_Battlefield,Game_name_Borderlands,Game_name_CS-GO,Game_name_CallOfDuty,Game_name_CallOfDutyBlackopsColdWar,...,Game_name_Overwatch,Game_name_PlayStation5(PS5),Game_name_PlayerUnknownsBattlegrounds(PUBG),Game_name_RedDeadRedemption(RDR),Game_name_TomClancysGhostRecon,Game_name_TomClancysRainbowSix,Game_name_Verizon,Game_name_WorldOfCraft,Game_name_Xbox(Xseries),Game_name_johnson&johnson
0,Positive,come border kill,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Positive,m borderland kill,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Positive,m come borderland murder,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Positive,m borderland 2 murder,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Positive,m borderland murder,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
## splitting into x and y
x = senti_df.drop(columns='sentiments')
y = senti_df['sentiments']

In [26]:
## TfIdf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [27]:
tfdf_matrix = tfidf.fit_transform(x['final'])

In [42]:
with open('tfidf_vect_model.pkl','wb') as f:
    pickle.dump(tfidf,f)

In [28]:
vect_df = pd.DataFrame(tfdf_matrix.toarray(),columns= tfidf.get_feature_names_out())
vect_df.head()

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,اللعبه,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
vect_df.shape

(73995, 28037)

In [30]:
tfdf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 694023 stored elements and shape (73995, 28037)>

In [31]:
x.drop(columns=['final'],inplace= True)

In [32]:
x = pd.concat([x,vect_df],axis=1)
x.head()

Unnamed: 0,Game_name_Amazon,Game_name_ApexLegends,Game_name_AssassinsCreed,Game_name_Battlefield,Game_name_Borderlands,Game_name_CS-GO,Game_name_CallOfDuty,Game_name_CallOfDutyBlackopsColdWar,Game_name_Cyberpunk2077,Game_name_Dota2,...,اللعبه,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
x = x.fillna(0)

In [34]:
## train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20, random_state=45)

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [36]:
rf_model = RandomForestClassifier(n_estimators=200,
                                  random_state=42)
rf_model.fit(x_train,y_train)

In [40]:
with open('random_forest_model.pkl','wb') as f:
    pickle.dump(rf_model,f)

In [37]:
rf_pred = rf_model.predict(x_test)
print(accuracy_score(y_test,rf_pred))

0.9420906818028245


In [39]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

  Irrelevant       0.97      0.91      0.94      2562
    Negative       0.94      0.96      0.95      4411
     Neutral       0.95      0.94      0.94      3687
    Positive       0.93      0.95      0.94      4139

    accuracy                           0.94     14799
   macro avg       0.94      0.94      0.94     14799
weighted avg       0.94      0.94      0.94     14799

