### Getting the Data

In [1]:
# Importing the libraries

import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Shashank
[nltk_data]     Prakash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Importing Datasets(Cornell Movie Review Data)
# Data Source : "http://www.cs.cornell.edu/people/pabo/movie-review-data/"; Dataset : "polarity dataset v2.0"

reviews = load_files('review_polarity/txt_sentoken/')
X,y = reviews.data, reviews.target #documents and corresponding classes in two diff. lists

In [7]:
X[0] #the first review

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is 

In [10]:
y[0] #the class for first review, 0 is for 'neg' folder and 1 is for 'pos' folder

0

In [11]:
# Storing as Pickle Files

with open('X.pickle','wb') as f:
    pickle.dump(X,f) #'wb' is for write as byte files, as pickles are byte files
with open('y.pickle','wb') as f:
    pickle.dump(y,f)

In [35]:
# Unpickling the dataset

with open('X.pickle','rb') as f:
    X = pickle.load(f)
with open('y.pickle','rb') as f:
    y = pickle.load(f)

### Preprocessing

In [36]:
# Creating the Corpus(list of documents)

corpus = []
for i in range(len(X)):
    review = re.sub(r'\W',' ',str(X[i]))
    review = review.lower()
    review = re.sub(r'\s+[a-z]\s+',' ',review)
    review = re.sub(r'^[a-z]\s+',' ',review)
    review = re.sub(r'\s+',' ',review)
    corpus.append(review)
corpus[0]

' arnold schwarzenegger has been an icon for action enthusiasts since the late 80 but lately his films have been very sloppy and the one liners are getting worse nit hard seeing arnold as mr freeze in batman and robin especially when he says tons of ice jokes but hey he got 15 million what it matter to him nonce again arnold has signed to do another expensive blockbuster that can compare with the likes of the terminator series true lies and even eraser nin this so called dark thriller the devil gabriel byrne has come upon earth to impregnate woman robin tunney which happens every 1000 years and basically destroy the world but apparently god has chosen one man and that one man is jericho cane arnold himself nwith the help of trusty sidekick kevin pollack they will stop at nothing to let the devil take over the world nparts of this are actually so absurd that they would fit right in with dogma nyes the film is that weak but it better than the other blockbuster right now sleepy hollow but

### Transforming data into BOW Model

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
vectorizer = CountVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray() #BOW Model - rows are docs. and cols. are words
X.shape

(2000, 2000)

### Transforming BOW Model to Tf-Idf Model

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer

In [38]:
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

### Logistic Regression Model

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [41]:
text_train,text_test,sent_train,sent_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [43]:
classifier = LogisticRegression()
classifier.fit(text_train,sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
sent_pred = classifier.predict(text_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sent_test,sent_pred)
cm

array([[168,  40],
       [ 21, 171]], dtype=int64)

In [51]:
accuracy = (cm[0,0]+cm[1,1])/cm.sum()
accuracy

0.8475

In [52]:
# Pickling the classifier

with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f)

In [60]:
#we also need the vectorizer to make a similar tfidf matrix and here we have a chain of bow model and then final tfidf transformer
#so, lets make a tfidf vectorizer and then pickle it.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray() #BOW Model - rows are docs. and cols. are words

# Pickling the classifier

with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)

In [61]:
X #so, its same as the one we get using "data to BOW to TfIdf" steps

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

In [62]:
# Unpickling the classifier and vectorizer
with open('classifier.pickle','rb') as f:
    clf = pickle.load(f)
with open('tfidfmodel.pickle','rb') as f:
    tfidf = pickle.load(f)

In [63]:
sample = ["You are a nice person man, have a good life"]
sample = tfidf.transform(sample).toarray()
print(clf.predict(sample)) #the sample was peredicted as positive statement.

[1]
