In [1]:
import pandas as pd 
import numpy as np
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score

In [2]:
df = pd.read_csv('IMDB Dataset.csv',nrows=10000)

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def preprocessing_reviews(review):
    review = review.lower()
    review = re.sub(r"http\S+|www\S+|http\S+" , "" , review, flags=re.MULTILINE)
    review = review.translate(str.maketrans("","",string.punctuation))
    review =  re.sub(r'\@\w+|\#', "", review)
    
    review_token = word_tokenize(review)
    filtered_word = [word for word in review_token if word not in stop_words]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_word]
    
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(lemma_words)

In [5]:
df = shuffle(df)
y = df['sentiment']
x = df.review.apply(preprocessing_reviews)

In [6]:
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [7]:
train_x,val_x,train_y,val_y = train_test_split(x_vectorized,y)

In [8]:
regression = LogisticRegression(multi_class='multinomial',solver='newton-cg')
model = regression.fit(train_x,train_y)

In [9]:
params = {'C':[0.001,0.01,0.1,1,10,100,1000]}
gs_clf = GridSearchCV(model,params,n_jobs = 1, cv=5)
gs_clf = gs_clf.fit(train_x,train_y)
model = gs_clf.best_estimator_

In [10]:
y_pred = model.predict(val_x)

_f1 = f1_score(val_y, y_pred, average='micro')
_confusion = confusion_matrix(val_y , y_pred)
_precision = precision_score(val_y,y_pred,average='micro')
_recall = recall_score(val_y, y_pred , average='micro')
_statistics = { 'f1_score': _f1,
                'confusion_matrix': _confusion,
                'precision': _precision,
                'recall': _recall
    
               }

In [11]:
print(_statistics)

{'f1_score': 0.8652, 'confusion_matrix': array([[1076,  180],
       [ 157, 1087]], dtype=int64), 'precision': 0.8652, 'recall': 0.8652}


In [12]:
test_feature = vectorizer.transform(['Movie is good'])
model.predict(test_feature)

array(['positive'], dtype=object)

In [13]:
test_feature = vectorizer.transform(['Movie is bad and i do not like this movie at all'])
model.predict(test_feature)

array(['negative'], dtype=object)

In [14]:
import pickle

In [16]:
pick1 = { 'vectorizer': vectorizer,
          'model': model
        }
pickle.dump(pick1,open('models'+".p","wb"))