# Sentiment Analysis

In [4]:
# importing libraries 
import pandas as pd 
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import contractions
from unidecode import unidecode
import warnings 
warnings.filterwarnings("ignore")
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from  sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [3]:
# loading the data 
data = pd.read_csv(r"C:\Users\pjosh\Documents\NLP\22_JAN_2022\Train.csv")
data.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [5]:
# creating functions for preprocessing
def remove_newlines(data):
    formatted_text = data.replace("\\n",' ').replace("\t"," ")
    return formatted_text

def contraction_map(data):
    fixed_text = contractions.fix(data)
    return fixed_text

def handle_accented(data):
    fixed_text = unidecode(data)
    return fixed_text

stopword_list = stopwords.words("english")
stopword_list.remove("no")
stopword_list.remove("not")
stopword_list.remove("nor")
def cleaning_text(data):
    tokens = word_tokenize(data)
    clean_tokens = [ i.lower() for i in tokens if (i.lower() not in stopword_list) and (i not in punctuation) ]
    clean_tokens = [ i for i in  clean_tokens if (len(i)>1) and i.isalpha()]
    return clean_tokens

def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    final_tokens = []
    for i in data:
        lemmatized_word = lemmatizer.lemmatize(i)
        final_tokens.append(lemmatized_word)
    return " ".join(final_tokens)

In [7]:
# data leakage 
x_train,x_test,y_train,y_test = train_test_split(data.text,data.label,test_size=0.25,stratify=data.label,random_state=42)

In [8]:
clean_train = x_train.apply(remove_newlines)
clean_test = x_test.apply(remove_newlines)

clean_train = clean_train.apply(contraction_map)
clean_test = clean_test.apply(contraction_map)

clean_train = clean_train.apply(handle_accented)
clean_test = clean_test.apply(handle_accented)

clean_train = clean_train.apply(cleaning_text)
clean_test = clean_test.apply(cleaning_text)

clean_train = clean_train.apply(lemmatization)
clean_test = clean_test.apply(lemmatization)

# Count Vectorizer

In [9]:
count = CountVectorizer(max_features=1000,max_df=0.95,min_df=1)
count_val_train = count.fit_transform(clean_train,y_train)
count_val_test = count.transform(clean_test)


In [23]:
count_val_train

<30000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1657115 stored elements in Compressed Sparse Row format>

In [17]:
data = pd.DataFrame(count_val_train.A,columns=count.get_feature_names())

In [13]:
count.get_feature_names()

['ability',
 'able',
 'absolutely',
 'accent',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'adaptation',
 'add',
 'admit',
 'adult',
 'adventure',
 'age',
 'ago',
 'agree',
 'air',
 'alien',
 'alive',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'amusing',
 'animal',
 'animation',
 'annoying',
 'another',
 'answer',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apparently',
 'appear',
 'appearance',
 'appears',
 'appreciate',
 'army',
 'around',
 'art',
 'aside',
 'ask',
 'aspect',
 'atmosphere',
 'attack',
 'attempt',
 'attention',
 'audience',
 'average',
 'avoid',
 'award',
 'away',
 'awesome',
 'awful',
 'baby',
 'back',
 'background',
 'bad',
 'badly',
 'band',
 'based',
 'basic',
 'basically',
 'battle',
 'beautiful',
 'beauty',
 'became',
 'become',
 'becomes',
 'bed',
 'begin',
 'beginning',
 'behind',
 'belief',
 'believable',
 'belie

#  Model Building

In [24]:
count_mnb = MultinomialNB()
count_mnb.fit(count_val_train.A,y_train)

In [25]:
predicted_y = count_mnb.predict(count_val_test.A)

In [27]:
count_acc = accuracy_score(y_test,predicted_y)*100
count_acc

83.31

# Tfidf Vectorizer

In [28]:
tfidf = TfidfVectorizer(max_features=1000,max_df=0.95,min_df=1)
tfidf_val_train = tfidf.fit_transform(clean_train,y_train)
tfidf_val_test = tfidf.transform(clean_test)

In [29]:
pd.DataFrame(tfidf_val_train.A,columns=tfidf.get_feature_names())

Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
3,0.0,0.117108,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.075478,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.066141,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.078977,0.052978,0.081266,0.0
29996,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.194612,0.0,0.0,0.000000,0.000000,0.000000,0.0
29997,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
29998,0.0,0.071574,0.0,0.0,0.0,0.000000,0.0,0.089954,0.0,0.000000,...,0.0,0.0,0.0,0.047505,0.0,0.0,0.000000,0.000000,0.000000,0.0


In [30]:
tfidf_mnb = MultinomialNB()
tfidf_mnb.fit(tfidf_val_train.A,y_train)
predicted_y = tfidf_mnb.predict(tfidf_val_test.A)
tfidf_acc = accuracy_score(y_test,predicted_y)*100
tfidf_acc

83.82

In [31]:
count_acc

83.31