# Importing Libraries

In [2]:
import nltk
import numpy
import pandas
import re
import pickle
from nltk.corpus import stopwords
from sklearn.datasets import load_files

# Importing Dataset

In [58]:
# Data - http://www.cs.cornell.edu/people/pabo/movie-review-data/

In [12]:
# Loop through all the files in the directory and create classses based on the subdirectory
dataset = load_files('data/')

In [60]:
print(dataset)

In [13]:
# will be a list of inputs
x = dataset.data

# will be a numpy array of target
y = dataset.target

In [14]:
# persisting the dataset --> both input and output seperately
with open('input.pkl', 'wb') as f:
    pickle.dump(x, f)
    
with open('output.pkl', 'wb') as f:
    pickle.dump(y, f)

In [15]:
"""
Using a pickle file, we can load the data within seconds. 
When we have a huge file, will take a lot of time, so when we load the huge file for the first time, we can save it as a pickle file

""";

In [16]:
# Unplicking the dataset (example)
with open('input.pkl', 'rb') as f:
    x = pickle.load(f)
    
with open('output.pkl', 'rb') as f:
    pickle.load(f)

# Preprocessing

In [18]:
corpus = []

In [19]:
# Looping through the texts and applying preprocessing
for i in range(0, len(x)):
    review = re.sub(r'\W', ' ', str(x[i]))
    # lowering the text
    review = review.lower()
    # removing the single letter 
    review = re.sub(r'\s+[a-z]\s+', ' ', review)
    # removing the single letter in the start of the sentence
    review = re.sub(r'^[a-z]\s+', ' ', review)
    # removing the multiple space to a single space
    review = re.sub(r'\s+', ' ', review)
    #appending to corpous
    corpus.append(review)

In [21]:
print(len(corpus))

2000


In [23]:
"""
Note:
try 1 : Lets apply Bow first followed by tfidf on Bow.
try 2 : Lets apply tfidf directly. 

Both process yields the same ouput. Just for learning purpose.
""";

# Bag of words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features= 2000, min_df= 3, max_df= 0.6, stop_words= stopwords.words('english'))
x= vectorizer.fit_transform(corpus).toarray()

In [30]:
x.shape

(2000, 2000)

# Transform BOW to TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer
tftransformer = TfidfTransformer()
x = tftransformer.fit_transform(x).toarray()

In [32]:
x.shape

(2000, 2000)

# TFIDF Vectorizer

In [None]:
# Than going from a bag of words to Tfidf, we can go ahead and directly implemnt tfidf vectorizer

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features= 2000, min_df= 3, max_df= 0.6, stop_words= stopwords.words('english'))
x= vectorizer.fit_transform(corpus).toarray()

# Creating a train and test set

In [68]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 0)

In [69]:
x_train.shape

(1600, 2000)

In [70]:
x_test.shape

(400, 2000)

# Logistic Model

In [71]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [72]:
history = model.fit(x_train, y_train)

In [73]:
history

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Testing model

In [74]:
pred = model.predict(x_test)

In [75]:
from sklearn.metrics import confusion_matrix

In [76]:
cm = confusion_matrix(y_test , pred)

In [77]:
cm

array([[168,  40],
       [ 21, 171]], dtype=int64)

In [78]:
accuracy =( cm[0][0] + cm[1][1]) / 4
print(accuracy)

84.75


# Saving the model

In [80]:
# pickling the model
with open('classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

In [81]:
# pickling the vectors of the model
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Using our Model

In [4]:
# unplickling the model and vectorizer
import pickle
with open('classifier.pkl', 'rb') as f:
    model = pickle.load(f)
    
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [7]:
sample_sentence = ['Hey buddy, you are a nice person ']

In [8]:
data = vectorizer.transform(sample_sentence).toarray()
print(model.predict(data))

[1]
