In [21]:
import numpy as np
import re
import nltk
import pickle
import os
from zipfile import ZipFile 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import load_files

In [2]:
file_name = "review_polarity.zip"

# opening the zip file in READ mode 
with ZipFile(file_name, 'r') as zip: 
    # printing all the contents of the zip file 
    zip.printdir() 
  
    # extracting all the files 
    print('Extracting all the files now...') 
    zip.extractall() 
    print('Done!') 

File Name                                             Modified             Size
txt_sentoken/                                  2019-06-22 00:12:24            0
txt_sentoken/neg/                              2019-06-22 00:12:24            0
txt_sentoken/neg/cv000_29416.txt               2004-02-16 09:10:34         4043
txt_sentoken/neg/cv001_19502.txt               2004-02-16 09:10:38         1370
txt_sentoken/neg/cv002_17424.txt               2004-02-16 09:10:46         2848
txt_sentoken/neg/cv003_12683.txt               2004-02-16 09:10:50         2929
txt_sentoken/neg/cv004_12641.txt               2004-02-16 09:10:56         4418
txt_sentoken/neg/cv005_29357.txt               2004-02-16 09:11:02         3911
txt_sentoken/neg/cv006_17022.txt               2004-02-16 09:11:12         3365
txt_sentoken/neg/cv007_4992.txt                2004-02-16 09:11:22         3554
txt_sentoken/neg/cv008_29326.txt               2004-02-16 09:11:28         4545
txt_sentoken/neg/cv009_29417.txt        

txt_sentoken/neg/cv977_4776.txt                2004-02-16 10:53:44         2668
txt_sentoken/neg/cv978_22192.txt               2004-02-16 10:53:52         3947
txt_sentoken/neg/cv979_2029.txt                2004-02-16 10:53:58         3476
txt_sentoken/neg/cv980_11851.txt               2004-02-16 10:54:04         3921
txt_sentoken/neg/cv981_16679.txt               2004-02-16 10:54:10         4881
txt_sentoken/neg/cv982_22209.txt               2004-02-16 10:54:14         2468
txt_sentoken/neg/cv983_24219.txt               2004-02-16 10:54:20         3000
txt_sentoken/neg/cv984_14006.txt               2004-02-16 10:54:24         2918
txt_sentoken/neg/cv985_5964.txt                2004-02-16 10:54:30         3949
txt_sentoken/neg/cv986_15092.txt               2004-02-16 10:54:34         3352
txt_sentoken/neg/cv987_7394.txt                2004-02-16 10:54:44         4009
txt_sentoken/neg/cv988_20168.txt               2004-02-16 10:54:50         4343
txt_sentoken/neg/cv989_17297.txt        

txt_sentoken/pos/cv475_21692.txt               2004-02-16 08:09:10         5406
txt_sentoken/pos/cv476_16856.txt               2004-02-16 08:09:12         4283
txt_sentoken/pos/cv477_22479.txt               2004-02-16 08:09:26         4040
txt_sentoken/pos/cv478_14309.txt               2004-02-16 08:09:32         3159
txt_sentoken/pos/cv479_5649.txt                2004-02-16 08:09:36         3594
txt_sentoken/pos/cv480_19817.txt               2004-02-16 08:09:40         2485
txt_sentoken/pos/cv481_7436.txt                2004-02-16 08:09:46         2959
txt_sentoken/pos/cv482_10580.txt               2004-02-16 08:09:54         4158
txt_sentoken/pos/cv483_16378.txt               2004-02-16 08:10:00         2621
txt_sentoken/pos/cv484_25054.txt               2004-02-16 08:10:02         1977
txt_sentoken/pos/cv485_26649.txt               2004-02-16 08:10:06         3178
txt_sentoken/pos/cv486_9799.txt                2004-02-16 08:10:08         4046
txt_sentoken/pos/cv487_10446.txt        

txt_sentoken/pos/cv975_10981.txt               2004-02-16 09:07:32         5662
txt_sentoken/pos/cv976_10267.txt               2004-02-16 09:07:32         1364
txt_sentoken/pos/cv977_4938.txt                2004-02-16 09:07:38         2532
txt_sentoken/pos/cv978_20929.txt               2004-02-16 09:07:46         4670
txt_sentoken/pos/cv979_18921.txt               2004-02-16 09:07:54         5010
txt_sentoken/pos/cv980_10953.txt               2004-02-16 09:07:58         2756
txt_sentoken/pos/cv981_14989.txt               2004-02-16 09:08:06         4519
txt_sentoken/pos/cv982_21103.txt               2004-02-16 09:08:12         3456
txt_sentoken/pos/cv983_22928.txt               2004-02-16 09:08:20         4836
txt_sentoken/pos/cv984_12767.txt               2004-02-16 09:08:28         3450
txt_sentoken/pos/cv985_6359.txt                2004-02-16 09:08:40         3826
txt_sentoken/pos/cv986_13527.txt               2004-02-16 09:08:52         6527
txt_sentoken/pos/cv987_6965.txt         

In [3]:
#Importing The dataset
reviews = load_files('txt_sentoken/')
X, Y = reviews.data, reviews.target

In [4]:
len(X)

2000

# Cleaning The Dataset

In [5]:
# it will have the cleaned data
# Creating the corpus
corpus = []

for i in range(0, 2000):
    
    review = re.sub(r'\W', ' ', str(X[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)
    review = re.sub(r'\s+[a-z]\s+', ' ',review)
    review = re.sub(r'^b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    corpus.append(review)    

# Making the TF-IDF Model

In [6]:
vectorizer = TfidfVectorizer(max_features = 2000, 
                             min_df = 3, 
                             max_df = 0.6, 
                             stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

In [7]:
# Splitting the dataset into the Training set and Test set
text_train, text_test, sent_train, sent_test = train_test_split(X, 
                                                                Y, 
                                                                test_size = 0.20, 
                                                                random_state = 0)

In [11]:
#Build our Classifier
classifier = LogisticRegression(random_state = 0)
classifier.fit(text_train, sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
#Predicting The Results
y_pred = classifier.predict(text_test)
y_pred

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,

In [16]:
#Comparing the results
cm = confusion_matrix(sent_test, y_pred)
cm

array([[168,  40],
       [ 21, 171]], dtype=int64)

In [17]:
acc = accuracy_score(sent_test, y_pred)
print("Hence our models accuracy is: {}%".format(acc*100))

Hence our models accuracy is: 84.75%


# Building a text report showing the main classification metrics

In [19]:
cr = classification_report(sent_test, y_pred)
print(cr)

             precision    recall  f1-score   support

          0       0.89      0.81      0.85       208
          1       0.81      0.89      0.85       192

avg / total       0.85      0.85      0.85       400



# Saving Our Model and the Classifier

In [23]:
# Saving our classifier
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f)
    
# Saving the Tf-Idf model
with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)