In [1]:
import numpy as np
import pandas as pd
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin 
from sklearn.model_selection import train_test_split

In [5]:
dataset = pd.read_csv('dataset/IMDB Dataset.csv')

In [6]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
print(f'Rows: {dataset.shape[1]}\nColumns: {dataset.shape[0]}')

Rows: 2
Columns: 50000


In [9]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


In [10]:
#text operation
nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [11]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [16]:
#transformation and vectorization

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [17]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1)) 
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

In [18]:
#splitting into testing and training
X = dataset['review']
y = dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle
classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)   
LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,LRpred)}')
print(f'Accuracy: {accuracy_score(y_test,LRpred)*100}%')
pickle.dump(LRmodel, open('saved_model/LinearRegression_model.sav', 'wb'))


Confusion Matrix:
[[4510  541]
 [ 442 4507]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      5051
    positive       0.89      0.91      0.90      4949

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 90.16999999999999%


In [34]:
# Another random review
pre = LRmodel.predict(["Production has an incredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."])
print(f'Prediction: {pre[0]}')

Prediction: positive


In [30]:
pre = LRmodel.predict(["It was a bad movie."])
print(f'Prediction: {pre[0]}')

Prediction: negative
