##IMDB Movies Reviews - Sentiment Analysis

## **Sentiment analysis**

Sentiment analysis is the interpretation and classification of emotions within text data using text analysis techniques.
- Positive
- Neutral
- Negative



Instalation libraries

In [2]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-vh5o8nbk
  Running command git clone --filter=blob:none --quiet https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-vh5o8nbk
  Resolved https://github.com/laxmimerit/preprocess_kgptalkie.git to commit 96bf02872d9756f29d6cddb8aafaedcd2a39bbb4
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-py3-none-any.whl size=7602 sha256=b188f2094a0d63e7674734a3f44fedf1c3fb8624a336e7afadd0d63bf2ce861c
  Stored in directory: /tmp/pip-ephem-wheel-cache-avt55nah/wheels/5c/94/34/99d5ff65e88b8d9a6c5e8d8652f2311d87790a61a1b7466e21
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptal

Libraries

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix,classification_report


import preprocess_kgptalkie as ps

#Read DataSet

In [5]:
df=pd.read_csv("/content/imdb_reviews.txt", sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [6]:
df.columns=["Reviews","Sentiment"]
df.head()

Unnamed: 0,Reviews,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


###Cleaning reviews

In [11]:
df['Reviews'] = df['Reviews'].apply(lambda x: ps.cont_exp(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.remove_special_chars(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.remove_html_tags(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.remove_urls(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.remove_accented_chars(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.remove_emails(x))
df['Reviews'] = df['Reviews'].apply(lambda x: ps.make_base(x))

df['Reviews'] = df['Reviews'].apply(lambda x: str(x).lower())

df.head()

Unnamed: 0,Reviews,Sentiment
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lose the flat character ...,0
2,attempt artiness with black white and clever c...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when gerardo i...,1


###Data Preparation

In [12]:
X=df["Reviews"]
y=df['Sentiment']

In [13]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=21, stratify=y)

In [14]:
X_train.shape, X_test.shape

((598,), (150,))

###RANDOM FOREST CLASSIFIER MODEL

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [19]:
pipe=Pipeline([
    ('tfidf', TfidfVectorizer()),
       ('clf',RandomForestClassifier() )
])




In [40]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__n_estimators': (50, 100),
    'clf__min_samples_split': (2,3)
}

In [41]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = 5, error_score='raise')

In [42]:
clf.fit(X_train,y_train)

In [43]:
clf.best_estimator_

In [44]:
clf.best_params_

{'clf__min_samples_split': 2,
 'clf__n_estimators': 100,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [45]:
clf.best_score_

0.7474929971988795

In [46]:
y_pred = clf.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.60      0.64        73
           1       0.66      0.73      0.69        77

    accuracy                           0.67       150
   macro avg       0.67      0.67      0.66       150
weighted avg       0.67      0.67      0.67       150



### LOGISTIC REGRESSION MODEL

In [48]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver = 'liblinear')),

])

In [49]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__penalty': ('l2', 'l1'),
    'clf__C': (1,2)
}

In [50]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = None)

In [51]:
clf.fit(X_train, y_train)

In [52]:
clf.best_estimator_

In [53]:
clf.best_params_

{'clf__C': 1,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [54]:
clf.best_score_

0.7575350140056022

In [55]:
y_pred = clf.predict(X_test)

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.77      0.81        73
           1       0.80      0.88      0.84        77

    accuracy                           0.83       150
   macro avg       0.83      0.83      0.83       150
weighted avg       0.83      0.83      0.83       150



### SVM MODEL

In [57]:
from sklearn.svm import LinearSVC

In [58]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

In [59]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__C': (1,2,2.5,3)
}

In [60]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = 5)

In [61]:
clf.fit(X_train, y_train)

In [62]:
clf.best_estimator_

In [63]:
clf.best_params_

{'clf__C': 3,
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 1.0,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [64]:
clf.best_score_

0.7675350140056022

In [65]:
y_pred = clf.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        73
           1       0.83      0.81      0.82        77

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150



###Model testig

In [67]:
x=['This is a very "right on case" movie that delivers everything almost right in your face.  ']

In [73]:
clf.predict(x)
print (clf.predict(x))
if clf.predict(x)==[1]:
  print('Positive classification')
else:
  print('Negative classification')

[1]
Positive classification


In [74]:
import pickle as pkl

In [75]:
pkl.dump(clf, open('model.pkl', 'wb'))