# Loading the Dataset

Import the Data

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

In [2]:
# Import the dataset.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError); `on_bad_lines='skip'` is the replacement and
# likewise drops malformed rows instead of raising.
dataset = pd.read_csv('IMDB Dataset.csv', engine='python', on_bad_lines='skip')
dataset.shape

(50000, 2)

Transforming Documents into Feature Vectors

In [3]:
# Bag-of-words demo: turn three toy sentences into raw term-count vectors.
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    'The dog is running',
    'I like chocolates beacause it is sweet',
    'The dog is running and chocolate is sweet, and two and two is four',
]
count = CountVectorizer()
bag = count.fit_transform(docs)

# Show the learned term -> column-index mapping, then the dense count matrix.
print(count.vocabulary_)
print(bag.toarray())

{'the': 11, 'dog': 4, 'is': 6, 'running': 9, 'like': 8, 'chocolates': 3, 'beacause': 1, 'it': 7, 'sweet': 10, 'and': 0, 'chocolate': 2, 'two': 12, 'four': 5}
[[0 0 0 0 1 0 1 0 0 1 0 1 0]
 [0 1 0 1 0 0 1 1 1 0 1 0 0]
 [3 0 1 0 1 1 3 0 0 1 1 1 2]]


Word Relevancy using TF-IDF

In [4]:
# Word relevancy: re-weight the raw counts in `bag` with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer

np.set_printoptions(precision=2)  # print arrays with two decimal places
tfidf = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.   0.   0.   0.53 0.   0.41 0.   0.   0.53 0.   0.53 0.  ]
 [0.   0.45 0.   0.45 0.   0.   0.27 0.45 0.45 0.   0.34 0.   0.  ]
 [0.66 0.   0.22 0.   0.17 0.22 0.39 0.   0.   0.17 0.17 0.17 0.44]]


# Data Cleaning


In [5]:
# Data Cleaning - remove HTML-style tags, collapse punctuation to spaces,
# and move all emoticons to the end of the text.
import re
def preprocessor(text):
  """Clean a raw review string before tokenization.

  Steps, in order:
    1. Delete every `<...>` tag.
    2. Collect emoticons from the tag-free text: eyes `:`, `;` or `=`,
       an optional `-` nose, and a mouth `)`, `(`, `D` or `P`.
    3. Lowercase the text and replace each run of non-word characters
       with a single space.
    4. Append the collected emoticons (with any `-` nose removed),
       concatenated with no separator.

  Args:
    text: the raw document as a str.

  Returns:
    The cleaned str.
  """
  # The patterns are raw strings: '\)', '\(' and '\W' are invalid escape
  # sequences in ordinary literals and raise a SyntaxWarning on Python 3.12+.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  # NOTE(review): emoticons are joined and appended without any separator, so
  # several emoticons form one token (':):(') and they attach to the last word
  # when the cleaned text does not end in a space. Confirm this is intended.
  text = re.sub(r'[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
  return text
preprocessor("This is a :) test :-( !")

'this is a test :):('

Tokenization of Documents

In [6]:
# Tokenization of Document
# One PorterStemmer instance is created at module level; `tokenizer_stemmer`
# calls `porter.stem` on every token.
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer(text):
  """Split `text` on runs of whitespace and return the tokens as a list."""
  tokens = text.split()
  return tokens
tokenizer('Running like running thus they run')

['Running', 'like', 'running', 'thus', 'they', 'run']

In [7]:
# Tokenization of Document and Stemming
def tokenizer_stemmer(text):
  """Split `text` on whitespace and return the stem of each token.

  Stemming is done with the module-level `porter` stemmer.
  """
  stems = []
  for word in text.split():
    stems.append(porter.stem(word))
  return stems
tokenizer_stemmer('Running like running thus they run')

['run', 'like', 'run', 'thu', 'they', 'run']

Transform Text Data into TF-IDF Vectors

In [8]:
# Transform Text Data into TF-IDF Vectors
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                         # A custom `preprocessor` replaces scikit-learn's
                         # built-in preprocessing stage, so `lowercase` is
                         # not applied here; `preprocessor` lowercases the
                         # text itself.
                         lowercase=True,
                         preprocessor=preprocessor,  # applied preprocessor in Data Cleaning
                         tokenizer=tokenizer_stemmer,
                         # The default token_pattern is unused once a custom
                         # tokenizer is given; passing None states that
                         # explicitly and avoids the unused-parameter warning.
                         token_pattern=None,
                         use_idf=True,
                         norm='l2',
                         smooth_idf=True)
# Labels and TF-IDF feature matrix for the whole dataset.
# NOTE(review): the vectorizer is fitted on every review before the
# train/test split performed later, so the IDF weights also see the test
# documents. Consider fitting on the training portion only.
y = dataset.sentiment.values
X = tfidf.fit_transform(dataset.review)

Document Classification using Logistic Regression

In [9]:
# Train a cross-validated logistic-regression classifier on the dataset.
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV

# shuffle=False keeps the rows in file order: the first half becomes the
# training split and the second half the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
    shuffle=False,
)

clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
)
# `fit` returns the estimator; assigning the result keeps the cell from
# echoing the estimator repr as output.
clf = clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.9min finished


Model Evaluation

In [10]:
# Model Evaluation: score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.89476

Confusion Matrix

In [13]:
# Predict the labels for X_test; y_pred is compared against y_test in the
# confusion-matrix cell.
y_pred = clf.predict(X_test)

In [14]:
# To evaluate the performance of the classification model, build the
# confusion matrix of true vs. predicted test labels and print it, then
# display the overall accuracy as the cell's result.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[11046  1428]
 [ 1203 11323]]


0.89476