# Sentiment analysis for Hindi/English code-mixed text.

<hr/>

### CS521 - Project | Spring'23

In [23]:
# All the main imports
import pandas as pd
import numpy as np
import re
import json


# All the sklearn imports
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize


In [24]:
df = pd.read_csv('train_data_clean.csv')

In [25]:
df.columns

Index(['id', 'Sentence', 'Label'], dtype='object')

In [26]:
df['Label'].value_counts()

neutral     5264
positive    4634
negative    4102
Name: Label, dtype: int64

In [27]:
df.isnull().value_counts()

id     Sentence  Label
False  False     False    14000
dtype: int64

In [29]:
df = df[df['Sentence'].isnull() == False]
df = df[df['Label'].isnull() == False]

In [30]:
df.isnull().value_counts()

id     Sentence  Label
False  False     False    14000
dtype: int64

In [34]:
df['Label'].value_counts()

neutral     5264
positive    4634
negative    4102
Name: Label, dtype: int64

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 0 to 13999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        14000 non-null  int64 
 1   Sentence  14000 non-null  object
 2   Label     14000 non-null  object
dtypes: int64(1), object(2)
memory usage: 437.5+ KB


In [36]:
df.isnull().value_counts()

id     Sentence  Label
False  False     False    14000
dtype: int64

In [37]:
df['Sentence'] = df['Sentence'].str.lower()

In [38]:
df['Sentence'].head(50)

0      nen vist bolest vztek smutek zmatek osam lost...
1      nehantics haan yaar neha kab karega woh post ...
2      rahulgandhi television media congress ke liye...
3      amitshah narendramodi all india me nrc lagu k...
4      nehr who typomantri anjanaomkashyap pagal hai...
5      narendramodi jeet ki dher sari subh kamnaye m...
6      fakeionist samjhotaxpress plichapel but topi ...
7      aajtak syedasimwaqar chitraaum syedzafarbjp y...
8      tarekfatah baih tere itjey kya jalti hai paki...
9      desimarthastew hehe i saw that coming and it ...
10     ecisveep can you answer miscalculated votes o...
11     mahaali3320 allah pak os k dil ka darwaza kbi...
12     ravishkumarblog bahut hi samajhdari se cingre...
13     rt mastani4423509 tu safar mera tu hi meri ma...
14     dobar se pm bnne ki aapko dher sari shubhakam...
15     siitae we can face life together and we can s...
16     waah kisi ne khub likha ke shama khud ko jala...
17     its tani yabikikainaaat abe haattt tum fl

In [39]:
# Define stemming and lemmatization functions
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


# Define function to perform stemming on words
def perform_stemming(text):
    tokens = word_tokenize(text)
    stemmed_words = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_words)

# Define function to perform lemmatization on words
def perform_lemmatization(text):
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(token, pos=wordnet.VERB) for token in tokens]
    return ' '.join(lemmatized_words)

stop_words_set_eng = set(stopwords.words('english'))

stop_words_set_hin = set([
    'is', 'ke', 'ka', 'ek', 'hai', 'hain', 'ki', 'ko', 'mein', 'se', 'par', 'bhi', 'ke', 'liye', 'saath',
    'ho', 'kar', 'vale', 'vali', 'kuch', 'jo', 'to', 'hi', 'tak', 'ya', 'hote', 'hota', 'tha', 'the',
    'ab', 'jab', 'kahaa', 'kisi', 'ne', 'unke', 'uske', 'uski', 'usmein', 'uskoe', 'usse', 'iskay',
    'iski', 'ismein', 'iskoe', 'isse', 'tab', 'phir', 'jaise', 'jiske', 'jiskee', 'jismein', 'jiskoe',
    'jisse', 'yah', 'yahee', 'ye', 'vah', 'vahee', 've', 'kai', 'kul', 'door', 'parantu', 'aap', 'tum',
    'tumhara', 'tumhare', 'main', 'mera', 'mere', 'ham', 'hamara', 'hamare', 'apna', 'apne', 'khud',
    'yahan', 'vahan', 'sabka', 'sabke', 'kisi', 'kise', 'sabhi', 'sab', 'koi', 'koi', 'kuch', 'kisi',
    'kisi', 'kisi', 'koi', 'dusra', 'any', 'any', 'aur', 'etc', 'etc', 'etc', 'etc', 'etc', 'etc', 'etc'
])

stop_words_set = stop_words_set_eng | stop_words_set_hin
df['Sentence'] = df['Sentence'].apply(perform_stemming)
df['Sentence'] = df['Sentence'].apply(perform_lemmatization)
df['Sentence'].head(50)

0     nen vist bolest vztek smutek zmatek osam lose ...
1     nehant haan yaar neha kab karega woh post usn ...
2     rahulgandhi televis media congress ke liy nhi ...
3     amitshah narendramodi all india me nrc lagu ka...
4     nehr who typomantri anjanaomkashyap pagal hai ...
5     narendramodi jeet ki dher sari subh kamnay mod...
6     fakeionist samjhotaxpress plichapel but topi w...
7     aajtak syedasimwaqar chitraaum syedzafarbjp ye...
8     tarekfatah baih tere itjey kya jalti hai pakis...
9     desimarthastew hehe i saw that come and it s a...
10    ecisveep can you answer miscalcul vote on each...
11    mahaali3320 allah pak os k dil ka darwaza kbi ...
12    ravishkumarblog bahut hi samajhdari se cingres...
13    rt mastani4423509 tu safar mera tu hi meri man...
14    dobar se pm bnne ki aapko dher sari shubhakamn...
15    siita we can face life togeth and we can spend...
16    waah kisi ne khub likha ke shama khud ko jala ...
17    it tani yabikikainaaat abe haattt tum flop

In [40]:
df.info

<bound method DataFrame.info of           id                                           Sentence     Label
0       4330  nen vist bolest vztek smutek zmatek osam lose ...   neutral
1      41616  nehant haan yaar neha kab karega woh post usn ...   neutral
2       6648  rahulgandhi televis media congress ke liy nhi ...  negative
3       2512  amitshah narendramodi all india me nrc lagu ka...  positive
4        610  nehr who typomantri anjanaomkashyap pagal hai ...   neutral
...      ...                                                ...       ...
13995  31686  anandk2012 railminindia irctcoffici piyushgoya...  negative
13996  34552  so i could n't get no one to cover me for my b...  positive
13997  16924  rt theskindoctor13 najimkhan07 aap logo ki baa...  positive
13998   5556  priyaverma ashutosh83b jay jay shree ram ram r...  positive
13999   3308  kanpuriya mujh to bhayankar ho gaya hai shayad...   neutral

[14000 rows x 3 columns]>

In [41]:
# print(stop_words_set)

In [43]:
# z = []

# for wrd in df['Words']:
#     if wrd in stop_words_set:
#         df.drop(df['Words'] == wrd)


In [46]:

# for i in stop_words_set:
#     df = df[df['Words'] != i]

def remove_stopwords(tokens):
    filtered_tokens = []
    for t in tokens.split(' '):
        if t not in stop_words_set:
            filtered_tokens.append(t)
    return ' '.join(filtered_tokens)

df['Sentence']= df['Sentence'].apply(remove_stopwords)

In [45]:
df.info

<bound method DataFrame.info of           id                                           Sentence     Label
0       4330  nen vist bolest vztek smutek zmatek osam lose ...   neutral
1      41616  nehant haan yaar neha kab karega woh post usn ...   neutral
2       6648  rahulgandhi televis media congress liy nhi h t...  negative
3       2512  amitshah narendramodi india nrc lagu kare w ka...  positive
4        610  nehr typomantri anjanaomkashyap pagal kya real...   neutral
...      ...                                                ...       ...
13995  31686  anandk2012 railminindia irctcoffici piyushgoya...  negative
13996  34552  could n't get one cover birthday im go open da...  positive
13997  16924  rt theskindoctor13 najimkhan07 logo baat nahi ...  positive
13998   5556  priyaverma ashutosh83b jay jay shree ram ram r...  positive
13999   3308  kanpuriya mujh bhayankar gaya shayad mai sahi ...   neutral

[14000 rows x 3 columns]>

In [56]:
def remove_num(tokens):
    filtered_tokens = []
    for t in tokens.split(' '):
        if t.isnumeric():
            continue;
        else:
            filtered_tokens.append(t)
    return ' '.join(filtered_tokens)

def remove_space(tokens):
    return tokens.strip()
df['Sentence']= df['Sentence'].apply(remove_num)
df['Sentence']= df['Sentence'].apply(remove_space)

In [58]:
df.info

<bound method DataFrame.info of           id                                           Sentence     Label
0       4330  nen vist bolest vztek smutek zmatek osam lose ...   neutral
1      41616  nehant haan yaar neha kab karega woh post usn ...   neutral
2       6648  rahulgandhi televis media congress liy nhi h t...  negative
3       2512  amitshah narendramodi india nrc lagu kare w ka...  positive
4        610  nehr typomantri anjanaomkashyap pagal kya real...   neutral
...      ...                                                ...       ...
13995  31686  anandk2012 railminindia irctcoffici piyushgoya...  negative
13996  34552  could n't get one cover birthday im go open da...  positive
13997  16924  rt theskindoctor13 najimkhan07 logo baat nahi ...  positive
13998   5556  priyaverma ashutosh83b jay jay shree ram ram r...  positive
13999   3308  kanpuriya mujh bhayankar gaya shayad mai sahi ...   neutral

[14000 rows x 3 columns]>

In [59]:
df.info
df_test = df['Label']
df_test.head(10)
df_train = df['Sentence']

In [61]:
df_train.head(10)

0    nen vist bolest vztek smutek zmatek osam lose ...
1    nehant haan yaar neha kab karega woh post usn ...
2    rahulgandhi televis media congress liy nhi h t...
3    amitshah narendramodi india nrc lagu kare w ka...
4    nehr typomantri anjanaomkashyap pagal kya real...
5    narendramodi jeet dher sari subh kamnay modi j...
6    fakeionist samjhotaxpress plichapel topi walay...
7    aajtak syedasimwaqar chitraaum syedzafarbjp mo...
8    tarekfatah baih tere itjey kya jalti pakistan ...
9    desimarthastew hehe saw come actual someon shaadi
Name: Sentence, dtype: object

#### Split dataset into 80% training set and 20% combined validation and test set
#### Split combined validation and test set into 50% validation set and 50% test set

In [62]:

X_train, X_val_test, y_train, y_val_test = train_test_split(df_train, df_test, test_size=0.2, random_state=42)


X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)


vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore', stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [63]:
'''
Logistic Regression
Traning and calculating the metrics for the predicted output
'''

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')


print("Logistic Regression:")
print("Accuracy: ", accuracy_lr)
print("Precision: ", precision_lr)
print("Recall: ", recall_lr)
print("F1 Score: ", f1_score_lr)

Logistic Regression:
Accuracy:  0.605
Precision:  0.6030917960900032
Recall:  0.605
F1 Score:  0.603736823128473


In [64]:
'''
Decision Tree 
Traning and calculating the metrics for the predicted output
'''

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)

# Make predictions on test set using decision tree model
y_pred_dt = dt_model.predict(X_test_tfidf)

# Calculate metrics for decision tree model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_score_dt = f1_score(y_test, y_pred_dt, average='weighted')

print("\nDecision Tree:")
print("Accuracy: ", accuracy_dt)
print("Precision: ", precision_dt)
print("Recall: ", recall_dt)
print("F1 Score: ", f1_score_dt)


Decision Tree:
Accuracy:  0.5207142857142857
Precision:  0.5221932125129237
Recall:  0.5207142857142857
F1 Score:  0.5206642607367593
