# Introduction: Movies which made U :)) or :((

## Performing Sentimental Analysis on Movie Reviews
## Goal is to figure out which Model gives the Best Accuracy and Get the Report of it.

## Dataset Details:
### * Dataset consists of 50,000 IMDB movie reviews
### * Consists of Three Columns ie ID, Sentiment & Reviews
### * Sentiment columns is binary, meaning the IMDB rating < 5 results in a sentiment score of 0, and rating >=7      have a sentiment score of 1.
### * No individual movie has more than 30 reviews.

## Lets Get Started:


### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load the Dataset & check for the info of it

In [2]:
senti = pd.read_csv('./movie_review.csv',sep='\t')

In [3]:
# 1 means positive review
# 2 means negative reivew

senti.head()

Unnamed: 0,id,review,sentiment
0,5814_8,With all this stuff going down at the moment w...,1
1,2381_9,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,It must be assumed that those who praised this...,0
4,9495_8,Superbly trashy and wondrously unpretentious 8...,1


In [4]:
senti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22500 entries, 0 to 22499
Data columns (total 3 columns):
id           22500 non-null object
review       22500 non-null object
sentiment    22500 non-null int64
dtypes: int64(1), object(2)
memory usage: 527.4+ KB


In [5]:
senti.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sentiment,22500.0,0.501244,0.50001,0.0,0.0,1.0,1.0,1.0


# Pre-processing Step:

In [6]:
# stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re
def preprocessor(text):
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text) 
    
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)  
    
    text = re.sub('([\W]+)(d|p)*', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')    
    return text

preprocessor(':( Today is not fine :()')


' today is not fine :( :('

### Tokenization & Stemming:

In [8]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    stem_tokens=[]
    tokens = text.split(' ')
    for token in tokens:
        stem_tokens.append(porter.stem(token))
    return stem_tokens

### Train & split the dataset

In [9]:
from sklearn.model_selection import train_test_split
X = senti['review']
y = senti['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

### Build the Model

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# If we don't pass these params, defaultly they are None
# If we don't pass stop_words, and preprocessor, our data is not good enough
# If we don't pass tokenizer, TfidfVectorizer will use the default one, but here we have tokenizer combined with stemming, better than default
tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

clf = Pipeline([('vect', tfidf), 
                ('clf', DecisionTreeClassifier(random_state=0))])
clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x000002880430D620>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'mysel...
                                 tokenizer=<function tokenizer_porter at 0x0000028800E87598>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=No

### To get Report of the DecisionTree Classifier - Acc: 71%

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

prediction = clf.predict(X_test)
acc = accuracy_score(y_test, prediction)
matrix = confusion_matrix(y_test, prediction)
report = classification_report(y_test, prediction)
print(matrix)
print(report)
print(acc)

[[2397 1011]
 [ 958 2384]]
              precision    recall  f1-score   support

           0       0.71      0.70      0.71      3408
           1       0.70      0.71      0.71      3342

    accuracy                           0.71      6750
   macro avg       0.71      0.71      0.71      6750
weighted avg       0.71      0.71      0.71      6750

0.7082962962962963


### Check for other Model:

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# If we don't pass these params, defaultly they are None
# If we don't pass stop_words, and preprocessor, our data is not good enough
# If we don't pass tokenizer, TfidfVectorizer will use the default one, but here we have tokenizer combined with stemming, better than default
tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

clf = Pipeline([('vect', tfidf), 
                ('clf', RandomForestClassifier(random_state=0))])
clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x000002880430D620>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'mysel...
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                     

### Accuracy for Random Forest Classifier - 76.5%

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

prediction = clf.predict(X_test)
acc = accuracy_score(y_test, prediction)
matrix = confusion_matrix(y_test, prediction)
report = classification_report(y_test, prediction)
print(matrix)
print(report)
print(acc)

[[2797  611]
 [1032 2310]]
              precision    recall  f1-score   support

           0       0.73      0.82      0.77      3408
           1       0.79      0.69      0.74      3342

    accuracy                           0.76      6750
   macro avg       0.76      0.76      0.76      6750
weighted avg       0.76      0.76      0.76      6750

0.7565925925925926


## Linear Support Vector Machine
### Linear Support Vector Machine is widely regarded as one of the best text classification algorithms.


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# If we don't pass these params, defaultly they are None
# If we don't pass stop_words, and preprocessor, our data is not good enough
# If we don't pass tokenizer, TfidfVectorizer will use the default one, but here we have tokenizer combined with stemming, better than default
tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

clf = Pipeline([('vect', tfidf), 
                ('clf', SGDClassifier(random_state=0))])
clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x000002880430D620>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'mysel...
                ('clf',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                 

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

prediction = clf.predict(X_test)
acc = accuracy_score(y_test, prediction)
matrix = confusion_matrix(y_test, prediction)
report = classification_report(y_test, prediction)
print(matrix)
print(report)
print(acc)

[[2998  410]
 [ 316 3026]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      3408
           1       0.88      0.91      0.89      3342

    accuracy                           0.89      6750
   macro avg       0.89      0.89      0.89      6750
weighted avg       0.89      0.89      0.89      6750

0.8924444444444445


### Linear Support Vector Machines - SGD Classifier - Best Accuracy with 89%

### For KNN Classifier

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

  
tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

clf = Pipeline([('vect', tfidf), ('clf', KNeighborsClassifier())])
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x000002880430D620>,
                                 smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'mysel...
                                             'it', "it's", 'its', 'itself', ...],
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenizer_porter at 0x0000028800E87598>,
               

In [17]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

prediction = clf.predict(X_test)
acc = accuracy_score(y_test, prediction)
matrix = confusion_matrix(y_test, prediction)
report = classification_report(y_test, prediction)
print(matrix)
print(report)
print(acc)

[[2514  894]
 [ 545 2797]]
              precision    recall  f1-score   support

           0       0.82      0.74      0.78      3408
           1       0.76      0.84      0.80      3342

    accuracy                           0.79      6750
   macro avg       0.79      0.79      0.79      6750
weighted avg       0.79      0.79      0.79      6750

0.7868148148148149


### Accuracy for the KNN Classifier - 79%

### Conclusion: The Best Accuracy among all the Models tested were the Linear Support Vector Machine ie SGD Classifier with 89%