In [10]:
#importing the libraries
import numpy as np
import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem import WordNetLemmatizer 

In [11]:
#positive sentiment
# The file holds one review per line. Read it line by line instead of using
# pd.read_csv(sep='\n'): pandas 1.3+ rejects '\n' as a separator with a
# ValueError, so the original call fails on current installs.
with open("pos.txt", encoding="latin") as pos_file:
    pos_lines = [line.rstrip("\n") for line in pos_file]
# Blank lines are skipped (read_csv did this by default) and the single column
# keeps the integer label 0 that header=None produced.
pos_rev = pd.DataFrame({0: [line for line in pos_lines if line.strip()]})

In [12]:
# Label every positive review with mood = 1.
pos_rev = pos_rev.assign(mood=1)

In [13]:
# Give the unnamed text column (label 0) the name 'review'.
pos_rev = pos_rev.rename(columns={0: 'review'})

In [14]:
# Preview the labelled positive reviews (columns: review, mood).
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [15]:
# negative sentiment
# The file holds one review per line. Read it line by line instead of using
# pd.read_csv(sep='\n'): pandas 1.3+ rejects '\n' as a separator with a
# ValueError, so the original call fails on current installs.
with open("negative.txt", encoding="latin") as neg_file:
    neg_lines = [line.rstrip("\n") for line in neg_file]
# Blank lines are skipped, as read_csv did by default.
neg_rev = pd.DataFrame({'review': [line for line in neg_lines if line.strip()]})
# Label every negative review with mood = 0.
neg_rev['mood'] = 0

In [16]:
# Preview the labelled negative reviews (columns: review, mood).
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


# Pipeline
1. Lowercase
2. Tokenization
3. Remove stop words
4. Remove punctuation
5. Lemmatization / stemming
6. Bag of words or TF-IDF
7. Train/test split
8. Naive Bayes, SVM
9. Evaluate
10. Save the model
11. Testing


In [17]:
# Clean the positive reviews in place: lowercase, strip @-tokens, drop
# punctuation tokens, drop stop words and lemmatize.
lemma = WordNetLemmatizer()
# Build the stop-word set once. The original called stopwords.words('english')
# for every token, which reloads the word list and scans it linearly each time.
english_stopwords = set(stopwords.words('english'))

# 1. Lowercase.
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.lower())
# 2. Remove every '@' together with the non-whitespace run that follows it.
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
# 3. Tokenize and drop tokens that occur inside string.punctuation.
# NOTE(review): this is a substring test, so multi-character tokens such as
# `` are kept (they are visible in the cleaned output) - confirm that is intended.
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
# 4. Tokenize again, drop English stop words and lemmatize each token with pos='v'.
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in english_stopwords]))


#code to remove the punc using regex
# Raw string: the backslash is a literal character, not the invalid "\," escape.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Character class that matches any single character listed in punc.
punc_pattern = re.compile('[' + re.escape(punc) + ']')


def remove_punc(text):
    """Return ``text`` with every character listed in ``punc`` removed.

    The earlier loop iterated over the DataFrame's column labels and wrote to
    an undefined ``test_str``, so it never stripped anything.
    """
    return punc_pattern.sub("", text)


In [18]:
# Clean the negative reviews in place: lowercase, strip @-tokens, drop
# punctuation tokens, drop stop words and lemmatize.
lemma = WordNetLemmatizer()
# Build the stop-word set once. The original called stopwords.words('english')
# for every token, which reloads the word list and scans it linearly each time.
english_stopwords = set(stopwords.words('english'))

# 1. Lowercase.
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : x.lower())
# 2. Remove every '@' together with the non-whitespace run that follows it.
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
# 3. Tokenize and drop tokens that occur inside string.punctuation.
# NOTE(review): this is a substring test, so multi-character tokens such as
# `` would be kept - confirm that is intended.
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
# 4. Tokenize again, drop English stop words and lemmatize each token with pos='v'.
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in english_stopwords]))


In [19]:
# Inspect the positive reviews after preprocessing.
pos_rev

Unnamed: 0,review,mood
0,rock destine 21st century 's new `` conan `` '...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,sometimes like go movies fun wasabi good place...,1
4,emerge something rare issue movie 's honest ke...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family 's joyous life act yiddi...,1
5328,stand shadow motown best kind documentary one ...,1
5329,'s nice see piscopo years chaykin headly price...,1


In [21]:
# Inspect the negative reviews after preprocessing.
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,'s laddish juvenile teenage boys could possibl...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find move,0
5327,many definitions 'time waster movie must surel...,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like made-for-home-video quickie,0


In [22]:
# Combined dataset: positive rows first, then negative rows.
# reset_index() renumbers the rows from 0 and keeps each row's former
# per-frame position in an extra 'index' column.
com_rev = pd.concat(objs=[pos_rev, neg_rev], axis=0).reset_index(drop=False)
com_rev


Unnamed: 0,index,review,mood
0,0,rock destine 21st century 's new `` conan `` '...,1
1,1,gorgeously elaborate continuation `` lord ring...,1
2,2,effective too-tepid biopic,1
3,3,sometimes like go movies fun wasabi good place...,1
4,4,emerge something rare issue movie 's honest ke...,1
...,...,...,...
10657,5326,terrible movie people nevertheless find move,0
10658,5327,many definitions 'time waster movie must surel...,0
10659,5328,stand crocodile hunter hurry badly cobble look...,0
10660,5329,thing look like made-for-home-video quickie,0


In [24]:
# Hold out 20% of the combined reviews for testing; the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    com_rev['review'].values,
    com_rev['mood'].values,
    test_size=0.2,
    random_state=101,
)
# Re-wrap each split as a two-column frame (review text, mood label).
train_data = pd.DataFrame(dict(review=X_train, mood=y_train))
test_data = pd.DataFrame(dict(review=X_test, mood=y_test))


In [25]:
# Inspect the training split (columns: review, mood).
train_data

Unnamed: 0,review,mood
0,put washington honest work man john q archibal...,0
1,poignant familiar story young person suspend t...,1
2,timely director could ever dream quietly lyric...,1
3,film virtually choke self-consciousness,0
4,film take inside rhythms subject experience watch,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fan critics damn already like sor...,0
8526,perhaps heaviest joyless movie ever make giant...,0
8527,film rival live fine little amuse-bouche keep ...,1
