In [127]:
import pandas as pd
import nltk
import numpy as np
import string
from scipy.sparse import csr_matrix
from nltk.tokenize import  word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm.notebook import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
df_negative = pd.read_csv('./data/processedNegative.csv').T.reset_index()
df_negative['class'] = 0

In [3]:
df_neutral = pd.read_csv('./data/processedNeutral.csv').T.reset_index()
df_neutral['class'] = 1

In [4]:
df_possitive = pd.read_csv('./data/processedPositive.csv').T.reset_index()
df_possitive['class'] = 2

In [5]:
df = df_negative.append(df_possitive).append(df_neutral)
df.reset_index(drop=True,inplace=True)
df.columns = ['text', 'class']

In [38]:
df

Unnamed: 0,text,class
0,How unhappy some dogs like it though,0
1,talking to my over driver about where I'm goin...,0
2,Does anybody know if the Rand's likely to fall...,0
3,I miss going to gigs in Liverpool unhappy,0
4,There isnt a new Riverdale tonight ? unhappy,0
...,...,...
3868,IDFC official Vikram Limaye,1
3869,former captain Diana Edulji are others in pan...,1
3870,Supreme Court names former CAG as head of 4-me...,1
3871,Court summons CM suspended BJP MP as accused i...,1


# Preprocesing

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

In [30]:
text = ' '.join(df.text.map(str.lower).map(str.split).map(lambda x: [i for i in x if i not in stop_words]).map(' '.join).values)

In [31]:
text = text.translate(str.maketrans('', '', string.punctuation))

In [136]:
df.text = df.text\
.map(str.lower) \
.map(lambda x : x.translate(str.maketrans('', '', string.punctuation))) \
.map(str.split) \
.map(lambda x: [i for i in x if i not in stop_words]) \
.map(' '.join)

In [137]:
X, y = df.text, df['class']

Unnamed: 0,text,class
0,unhappy dogs like though,0
1,talking driver im goinghe said hed love go new...,0
2,anybody know rands likely fall dollar got mone...,0
3,miss going gigs liverpool unhappy,0
4,isnt new riverdale tonight unhappy,0
...,...,...
3868,idfc official vikram limaye,1
3869,former captain diana edulji others panel run,1
3870,supreme court names former cag head 4member pa...,1
3871,court summons cm suspended bjp mp accused crim...,1


## JUST TOKENIZATIONS

In [65]:
token_df = pd.Series(df.text).str.get_dummies(sep=' ')

In [66]:
token_df

Unnamed: 0,0,000,000019,014736,04,041017,0570,0700am,0845am,09,...,yummy,yura,yuri,zabardast,zac,zcc,zero,zoo,zoos,zplus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer

In [70]:
count_vectorizer = CountVectorizer()

In [77]:
count_df = pd.DataFrame(data=count_vectorizer.fit_transform(df.text).toarray())

In [79]:
count_df.columns = count_vectorizer.get_feature_names()

In [89]:
count_df.sum(axis=1)

0        4
1       13
2       14
3        5
4        5
        ..
3868     4
3869     7
3870    11
3871    10
3872    13
Length: 3873, dtype: int64

## TFIDFVectorizer

In [104]:
tfidf_vectorizer =  TfidfVectorizer()

In [108]:
tfidf_df = pd.DataFrame(tfidf_vectorizer.fit_transform(df.text).toarray())
tfidf_df.columns = tfidf_vectorizer.get_feature_names()

In [113]:
tfidf_df.loc[:,tfidf_df.iloc[-1] > 0]

Unnamed: 0,1985,agmut,amulya,appointed,cadre,commissioner,delhi,ips,new,officer,patnaik,police
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.220644,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.343744,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3869,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3870,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3871,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Classification

In [116]:
bayes_class = MultinomialNB()

### Token

In [133]:
X, y = token_df, df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

bayes_class.fit(X_train, y_train)

accuracy_score(y_test, bayes_class.predict(X_test))

0.8683870967741936

### Count

In [134]:
X, y = count_df, df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

bayes_class.fit(X_train, y_train)

accuracy_score(y_test, bayes_class.predict(X_test))

0.8683870967741936

### TFIDF

In [135]:
X, y = tfidf_df, df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

bayes_class.fit(X_train, y_train)

accuracy_score(y_test, bayes_class.predict(X_test))

0.8606451612903225