### Consumer Experience Project

In [349]:
#Importing Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [229]:
#Importing dataset
df = pd.read_csv("IMDB Dataset.csv")
df = df.iloc[0:1000]

In [230]:
#checking the 10 rows from the dataset
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [231]:
df.shape

(1000, 2)

#### Text PreProcessing

In [261]:
#Lowercasing Text
df['review'] = df['review'].str.lower()

In [262]:
#Removing stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [263]:
#Storing English stopwords
stop_words = stopwords.words('english')

In [264]:
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [265]:
df['review'][0]

'one reviewers mentioned watching 1 oz episode hooked right exactly happened mebr br first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordbr br called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awaybr br would say main appeal show due fact goes shows dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle class inmates turned

In [286]:
#remove duplicates
df.drop_duplicates(inplace=True)

In [287]:
df.shape

(1000, 2)

In [268]:
#Remove Punctuations Marks
df['review'] = df['review'].replace('<.*?>"",', " ")

In [269]:
df['review'][1]

'wonderful little production br br filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece br br actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life br br realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwells murals decorating every surface terribly well done'

In [270]:
#removing punctuation marks with a function
def remove_punctuation(input_text):
    translator = str.maketrans('','',string.punctuation)
    return input_text.translate(translator)

In [271]:
df['review'] = df['review'].apply(remove_punctuation)

In [272]:
df['review'][1]

'wonderful little production br br filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece br br actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life br br realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwells murals decorating every surface terribly well done'

In [273]:
#Removing html tags with a function and using regex
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

In [274]:
def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [275]:
df['review'] = df['review'].apply(cleanhtml)

In [276]:
df['review'][1]

'wonderful little production br br filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece br br actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life br br realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwells murals decorating every surface terribly well done'

In [288]:
pattern = re.compile('<.*?>')

In [289]:
def clean_html(text):
    clean_text = re.sub(pattern,"",text)
    return clean_text

In [291]:
df['review'] = df['review'].apply(clean_html)

In [293]:
#Splitting dataframe into X and Y frame
X = df.iloc[:,0:1]
y = df['sentiment']

In [294]:
#Applying Label Encoder on y set
encoder = LabelEncoder()

In [295]:
y = encoder.fit_transform(y)

In [146]:
#splitting the dataset in Train and Test Sets
#import Train Test Split from Sklearn Preprocessing

In [296]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [297]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(800, 1) (200, 1) (800,) (200,)


In [298]:
#Applying BagofWords/Count_Vectorizer on X_train and X_test
cv = CountVectorizer()

In [299]:
X_train_cv = cv.fit_transform(X_train['review']).toarray()
X_test_cv = cv.transform(X_test['review']).toarray()

In [152]:
#Applying Classifier Models and checking accuracy for all models for BagOfWords

In [338]:
def all_models(X_train,X_test,y_train,y_test):

    GNB = GaussianNB()
    y_pred_GNB = GNB.fit(X_train,y_train).predict(X_test)
    accuracy_GNB = metrics.accuracy_score(y_pred_GNB,y_test)
    print(f"Accuracy Score for Naive Bayes is : {accuracy_GNB}")

    CLF = DecisionTreeClassifier()
    y_pred_CLF = CLF.fit(X_train,y_train).predict(X_test)
    accuracy_CLF = metrics.accuracy_score(y_pred_CLF,y_test)
    print(f"Accuracy Score for DecisionTreeClassifier is : {accuracy_CLF}")
    
    RFC = RandomForestClassifier()
    y_pred_RFC = RFC.fit(X_train,y_train).predict(X_test)
    accuracy_RFC = metrics.accuracy_score(y_pred_RFC,y_test)
    print(f"Accuracy Score for RandomForestClassifier is : {accuracy_RFC}")
    
    SupV = SVC()
    y_pred_SupV = SupV.fit(X_train,y_train).predict(X_test)
    accuracy_SupV = metrics.accuracy_score(y_pred_SupV,y_test)
    print(f"Accuracy Score for SupportVectorClassifier is : {accuracy_SupV}")
    

In [339]:
all_models(X_train_cv,X_test_cv,y_train,y_test)

Accuracy Score for Naive Bayes is : 0.54
Accuracy Score for DecisionTreeClassifier is : 0.66
Accuracy Score for RandomForestClassifier is : 0.835
Accuracy Score for SupportVectorClassifier is : 0.78


In [340]:
#Applying BiGrams
Bi_grams = CountVectorizer(ngram_range=(1,2))

In [341]:
X_train_bigrams = Bi_grams.fit_transform(X_train['review']).toarray()
X_test_bigrams = Bi_grams.transform(X_test['review']).toarray()

In [342]:
#applying all models to Bigrams text vectorization
all_models(X_train_bigrams,X_test_bigrams,y_train,y_test)

Accuracy Score for Naive Bayes is : 0.59
Accuracy Score for DecisionTreeClassifier is : 0.7
Accuracy Score for RandomForestClassifier is : 0.795
Accuracy Score for SupportVectorClassifier is : 0.74


In [343]:
#Applying TriGrams
Tri_grams = CountVectorizer(ngram_range=(1,3))

In [345]:
X_train_trigrams = Tri_grams.fit_transform(X_train['review']).toarray()
X_test_trigrams = Tri_grams.transform(X_test['review']).toarray()

In [346]:
#applying all models to trigrams text vectorization
all_models(X_train_trigrams,X_test_trigrams,y_train,y_test)

Accuracy Score for Naive Bayes is : 0.585
Accuracy Score for DecisionTreeClassifier is : 0.67
Accuracy Score for RandomForestClassifier is : 0.75
Accuracy Score for SupportVectorClassifier is : 0.715


In [350]:
#Applying TfIDf
tfidf = TfidfVectorizer()

In [353]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [354]:
#applying all models to tfidf text vectorization
all_models(X_train_tfidf,X_test_tfidf,y_train,y_test)

Accuracy Score for Naive Bayes is : 0.52
Accuracy Score for DecisionTreeClassifier is : 0.71
Accuracy Score for RandomForestClassifier is : 0.79
Accuracy Score for SupportVectorClassifier is : 0.805


# 