In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import preprocess_seroetr as ps

In [2]:
df=pd.read_csv("imdb_reviews.txt",sep="\t",header=None)
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
df.columns=["reviews","Sentiment"]
df.head()

Unnamed: 0,reviews,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
x="You can improve your machine-learning skills"
ps.remove_special_chars(x)
ps.remove_accented_chars(x)
ps.get_cont_to_exp(x)
ps.remove_emails(x)
ps.remove_html_tags(x)
ps.remove_urls(x)
ps.make_base(x)
ps.spelling_correction(x)

TextBlob("You can improve your machine-learning skill")

In [5]:
df["reviews"]=df["reviews"].apply(lambda x: ps.remove_special_chars(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.remove_accented_chars(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.get_cont_to_exp(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.remove_emails(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.remove_html_tags(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.remove_urls(x))
df["reviews"]=df["reviews"].apply(lambda x: ps.make_base(x))

In [6]:
df.head()

Unnamed: 0,reviews,Sentiment
0,a very very very slowmove aimless movie about ...,0
1,not sure who was more lose the flat character ...,0
2,attempt artiness with black white and clever c...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when Gerardo i...,1


In [7]:
df.shape

(748, 2)

#### Data preparation for training

In [8]:
X=df["reviews"]
y=df["Sentiment"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.shape, X_test.shape

((598,), (150,))

#### ML Logistic Logistic regression model building and training

In [11]:
pipe=Pipeline([
    ("tfidf",TfidfVectorizer()),
    ("clf",LogisticRegression(solver="liblinear"))
])

In [12]:
hyperparameters={
    "tfidf__max_df":[0.3, 0.5, 1.0],
    "tfidf__ngram_range":[(1,1),(1,2)],
    "tfidf__use_idf":[True,False],
    "tfidf__analyzer":["word","char","char_wb"],
    "clf__penalty":["l2","l1"],
    "clf__C":[1,2,5]
}

In [13]:
clf=GridSearchCV(pipe,hyperparameters,cv=5,n_jobs=-1)

In [14]:
%%time
print( clf.fit(X_train,y_train) )

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(solver='liblinear'))]),
             n_jobs=-1,
             param_grid={'clf__C': [1, 2, 5], 'clf__penalty': ['l2', 'l1'],
                         'tfidf__analyzer': ['word', 'char', 'char_wb'],
                         'tfidf__max_df': [0.3, 0.5, 1.0],
                         'tfidf__ngram_range': [(1, 1), (1, 2)],
                         'tfidf__use_idf': [True, False]})
CPU times: total: 2.02 s
Wall time: 31.7 s


In [16]:
print( clf.best_estimator_ )

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.3)),
                ('clf', LogisticRegression(C=5, solver='liblinear'))])


In [18]:
print( clf.best_params_ )

{'clf__C': 5, 'clf__penalty': 'l2', 'tfidf__analyzer': 'word', 'tfidf__max_df': 0.3, 'tfidf__ngram_range': (1, 1), 'tfidf__use_idf': True}


In [19]:
print(clf.best_score_)

0.7691596638655461


In [20]:
y_pred=clf.predict(X_test)
print( classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79        76
           1       0.77      0.82      0.80        74

    accuracy                           0.79       150
   macro avg       0.79      0.79      0.79       150
weighted avg       0.79      0.79      0.79       150



#### ML Logistic SVM model building and training

In [21]:
from sklearn.svm import LinearSVC

In [37]:
pipe=Pipeline([
    ("tfidf",TfidfVectorizer()),
    ("clf",LinearSVC())
])

In [38]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__C': (1,2,2.5,3)
}

In [39]:
clf=GridSearchCV(pipe,hyperparameters,cv=5,n_jobs=-1)

In [40]:
%%time
print( clf.fit(X_train,y_train) )

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', LinearSVC())]),
             n_jobs=-1,
             param_grid={'clf__C': (1, 2, 2.5, 3),
                         'tfidf__analyzer': ('word', 'char', 'char_wb'),
                         'tfidf__max_df': (0.5, 1.0),
                         'tfidf__ngram_range': ((1, 1), (1, 2)),
                         'tfidf__use_idf': (True, False)})
CPU times: total: 719 ms
Wall time: 13.4 s


In [41]:
print( clf.best_estimator_ )

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                ('clf', LinearSVC(C=1))])


In [42]:
print( clf.best_params_ )

{'clf__C': 1, 'tfidf__analyzer': 'word', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1), 'tfidf__use_idf': True}


In [43]:
print(clf.best_score_)

0.7608403361344538


In [44]:
y_pred=clf.predict(X_test)
print( classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78        76
           1       0.77      0.81      0.79        74

    accuracy                           0.79       150
   macro avg       0.79      0.79      0.79       150
weighted avg       0.79      0.79      0.79       150



#### Test, Save, and Load ML model

In [47]:
x=["I would like to watch this movie","It's really bad sceenes","I want my money back","I can watch it again"]

In [48]:
clf.predict(x)

array([1, 0, 0, 1], dtype=int64)

In [49]:
import pickle as pkl

In [50]:
pkl.dump(clf,open("imdb_model.pkl","wb"))

#### Testing model

In [51]:
import pickle as pkl
clf_test=pkl.load(open("imdb_model.pkl","rb"))

In [54]:
import random
x=["I would like to watch this movie","It's really bad sceenes","I want my money back","I can watch it again"]
random.shuffle(x)
print(x)

["It's really bad sceenes", 'I can watch it again', 'I would like to watch this movie', 'I want my money back']


In [55]:
clf_test.predict(x)

array([0, 1, 1, 0], dtype=int64)