In [1]:
import pandas as pd
pd.set_option('max_colwidth',200)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier , LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
#Amazon Data
amazon = pd.read_csv("amazon_cells_labelled.txt" , delimiter='\t' , header=None)
amazon.columns = ['Sentence','Class']

#Yelp Data
yelp = pd.read_csv("yelp_labelled.txt" , delimiter='\t' , header=None)
yelp.columns = ['Sentence','Class']

#Imdb Data
imdb = pd.read_csv("imdb_labelled.txt" , delimiter='\t' , header=None)
imdb.columns = ['Sentence','Class']

In [3]:
data = pd.concat([amazon, yelp, imdb]).reset_index(drop = True)
data.head(20)

Unnamed: 0,Sentence,Class
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up right to get decent volume.,0
6,"If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.",0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [4]:
data['Class'].value_counts()

1    1386
0    1362
Name: Class, dtype: int64

In [5]:
data.to_csv("combined_review.xlsx")

In [6]:
#Text Preprocessing

#lower string
data['Sentence'] = data['Sentence'].str.lower()

#remove email adress
data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)

#remove IP address
data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex = True)

#remove punctaitions and special chracters
data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','' , regex = True)

#remove numbers
data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)

#remove stop words
for index, row in data.iterrows():
    word_tokens = word_tokenize(row['Sentence'])
    filtered_sentence = [w for w in word_tokens if w not in stopwords.words('english')]
    data.at[index , 'Sentence'] = " ".join(filtered_sentence[0:])

In [7]:
data.head(7)

Unnamed: 0,Sentence,Class
0,way plug us unless go converter,0
1,good case excellent value,1
2,great jawbone,1
3,tied charger conversations lasting minutesmajor problems,0
4,mic great,1
5,jiggle plug get line right get decent volume,0
6,several dozen several hundred contacts imagine fun sending one one,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(data['Sentence'] , data['Class'] , test_size = 0.10, random_state=0)

In [17]:
len(X_train),len(X_test)

(2473, 275)

In [21]:
X_train.head()

1303                                     food really boring
2454    dont afraid subtitles worth little aversion therapy
745                   fantastic buy get whatever next phone
565                                    disapointing results
14                          design odd ear clip comfortable
Name: Sentence, dtype: object

In [18]:
vectorizer = TfidfVectorizer(analyzer = 'word' , ngram_range = (1,2)) 

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
vectorizer.get_feature_names()

['aailiyah',
 'aailiyah pretty',
 'abandoned',
 'abandoned factory',
 'abhor',
 'ability',
 'ability actually',
 'ability dwight',
 'ability meld',
 'ability phone',
 'ability pull',
 'able',
 'able roam',
 'able use',
 'able voice',
 'abound',
 'abovepretty',
 'abovepretty useless',
 'abroad',
 'abroad interacting',
 'absolute',
 'absolute must',
 'absolutel',
 'absolutel junk',
 'absolutely',
 'absolutely abysmal',
 'absolutely amazing',
 'absolutely appalling',
 'absolutely back',
 'absolutely clue',
 'absolutely delicious',
 'absolutely flavor',
 'absolutely great',
 'absolutely hilarious',
 'absolutely horrible',
 'absolutely loved',
 'absolutely problem',
 'absolutely ray',
 'absolutely recommend',
 'absolutely stars',
 'absolutely suspense',
 'absolutely warmth',
 'absolutley',
 'absolutley fantastic',
 'abstruse',
 'abstruse culture',
 'abysmal',
 'abysmal everything',
 'ac',
 'ac charger',
 'academy',
 'academy award',
 'accents',
 'accents absolutely',
 'accept',
 'accept cas

In [19]:
vectorizer.vocabulary_

{'food': 5916,
 'really': 12690,
 'boring': 1662,
 'food really': 5965,
 'really boring': 12694,
 'dont': 4137,
 'afraid': 276,
 'subtitles': 15101,
 'worth': 17705,
 'little': 8964,
 'aversion': 887,
 'therapy': 15631,
 'dont afraid': 4139,
 'afraid subtitles': 278,
 'subtitles worth': 15102,
 'worth little': 17711,
 'little aversion': 8965,
 'aversion therapy': 888,
 'fantastic': 5340,
 'buy': 1942,
 'get': 6384,
 'whatever': 17316,
 'next': 10476,
 'phone': 11485,
 'fantastic buy': 5342,
 'buy get': 1948,
 'get whatever': 6428,
 'whatever next': 17321,
 'next phone': 10480,
 'disapointing': 3962,
 'results': 13097,
 'disapointing results': 3963,
 'design': 3740,
 'odd': 10721,
 'ear': 4360,
 'clip': 2714,
 'comfortable': 2827,
 'design odd': 3747,
 'odd ear': 10722,
 'ear clip': 4364,
 'clip comfortable': 2715,
 'highly': 7355,
 'doubt': 4184,
 'anyone': 605,
 'could': 3191,
 'ever': 4907,
 'like': 8788,
 'trash': 16172,
 'highly doubt': 7356,
 'doubt anyone': 4185,
 'anyone could':

In [None]:
# sorted(vectorizer.vocabulary_.items(), key = lambda x : x[1])

In [20]:
print(X_train_tfidf.todense())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
df=pd.DataFrame(X_train_tfidf.todense(),columns=vectorizer.get_feature_names())
df

Unnamed: 0,aailiyah,aailiyah pretty,abandoned,abandoned factory,abhor,ability,ability actually,ability dwight,ability meld,ability phone,...,yun fat,za,za im,zero,zero stars,zero taste,zillion,zillion times,zombie,zombie movies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df['Target']=y_train
df.head()

Unnamed: 0,aailiyah,aailiyah pretty,abandoned,abandoned factory,abhor,ability,ability actually,ability dwight,ability meld,ability phone,...,za,za im,zero,zero stars,zero taste,zillion,zillion times,zombie,zombie movies,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
classifier = SGDClassifier(alpha=1e-05 , max_iter=50 , penalty = 'elasticnet')

logre = LogisticRegression(solver='lbfgs')

nb = MultinomialNB()

rf = RandomForestClassifier()

knn = KNeighborsClassifier()

In [25]:
classifier.fit(X_train_tfidf, y_train)

logre.fit(X_train_tfidf, y_train)

nb.fit(X_train_tfidf, y_train)

rf.fit(X_train_tfidf, y_train)

knn.fit(X_train_tfidf, y_train)

KNeighborsClassifier()

In [26]:
predictions = classifier.predict(X_test_tfidf)

log_pred = logre.predict(X_test_tfidf)

nb_pred = nb.predict(X_test_tfidf)

rf_pred = rf.predict(X_test_tfidf)

knn_pred = knn.predict(X_test_tfidf)

In [27]:
#Model Evaluation

print( classifier, '\n' , classification_report(y_test, predictions) , '\n\n')
print( logre, '\n' , classification_report(y_test, log_pred) , '\n\n')
print( nb, '\n' , classification_report(y_test, nb_pred) , '\n\n')
print( rf, '\n' , classification_report(y_test, rf_pred) , '\n\n')
print( knn, '\n' , classification_report(y_test, knn_pred) , '\n\n')

SGDClassifier(alpha=1e-05, max_iter=50, penalty='elasticnet') 
               precision    recall  f1-score   support

           0       0.82      0.82      0.82       139
           1       0.82      0.82      0.82       136

    accuracy                           0.82       275
   macro avg       0.82      0.82      0.82       275
weighted avg       0.82      0.82      0.82       275
 


LogisticRegression() 
               precision    recall  f1-score   support

           0       0.81      0.82      0.82       139
           1       0.81      0.81      0.81       136

    accuracy                           0.81       275
   macro avg       0.81      0.81      0.81       275
weighted avg       0.81      0.81      0.81       275
 


MultinomialNB() 
               precision    recall  f1-score   support

           0       0.84      0.81      0.82       139
           1       0.81      0.84      0.82       136

    accuracy                           0.82       275
   macro avg     

## For Future Use save the vocabulory and model

### Saving Vocabulory

In [29]:
model_voca=open('Model_vocabulary.txt','w')
model_voca.write(str(vectorizer.vocabulary_))
model_voca.close()

### Saving Model

In [28]:
import pickle
model_file=open("Naive_bayes_model.pkl",'wb')
pickle.dump(nb,model_file)
model_file.close()


## Prediction on Data in near future

In [33]:
voca=eval(open('Model_vocabulary.txt').read())
type(voca),voca

(dict,
 {'food': 5916,
  'really': 12690,
  'boring': 1662,
  'food really': 5965,
  'really boring': 12694,
  'dont': 4137,
  'afraid': 276,
  'subtitles': 15101,
  'worth': 17705,
  'little': 8964,
  'aversion': 887,
  'therapy': 15631,
  'dont afraid': 4139,
  'afraid subtitles': 278,
  'subtitles worth': 15102,
  'worth little': 17711,
  'little aversion': 8965,
  'aversion therapy': 888,
  'fantastic': 5340,
  'buy': 1942,
  'get': 6384,
  'whatever': 17316,
  'next': 10476,
  'phone': 11485,
  'fantastic buy': 5342,
  'buy get': 1948,
  'get whatever': 6428,
  'whatever next': 17321,
  'next phone': 10480,
  'disapointing': 3962,
  'results': 13097,
  'disapointing results': 3963,
  'design': 3740,
  'odd': 10721,
  'ear': 4360,
  'clip': 2714,
  'comfortable': 2827,
  'design odd': 3747,
  'odd ear': 10722,
  'ear clip': 4364,
  'clip comfortable': 2715,
  'highly': 7355,
  'doubt': 4184,
  'anyone': 605,
  'could': 3191,
  'ever': 4907,
  'like': 8788,
  'trash': 16172,
  'high

## Loading Vocabulary in TF-IDF Vectorizer

In [34]:
count_vect_x = TfidfVectorizer(stop_words='english',vocabulary=voca)

In [35]:
count_vect_x.get_feature_names()

['aailiyah',
 'aailiyah pretty',
 'abandoned',
 'abandoned factory',
 'abhor',
 'ability',
 'ability actually',
 'ability dwight',
 'ability meld',
 'ability phone',
 'ability pull',
 'able',
 'able roam',
 'able use',
 'able voice',
 'abound',
 'abovepretty',
 'abovepretty useless',
 'abroad',
 'abroad interacting',
 'absolute',
 'absolute must',
 'absolutel',
 'absolutel junk',
 'absolutely',
 'absolutely abysmal',
 'absolutely amazing',
 'absolutely appalling',
 'absolutely back',
 'absolutely clue',
 'absolutely delicious',
 'absolutely flavor',
 'absolutely great',
 'absolutely hilarious',
 'absolutely horrible',
 'absolutely loved',
 'absolutely problem',
 'absolutely ray',
 'absolutely recommend',
 'absolutely stars',
 'absolutely suspense',
 'absolutely warmth',
 'absolutley',
 'absolutley fantastic',
 'abstruse',
 'abstruse culture',
 'abysmal',
 'abysmal everything',
 'ac',
 'ac charger',
 'academy',
 'academy award',
 'accents',
 'accents absolutely',
 'accept',
 'accept cas

In [39]:
X_test_counts_x = count_vect_x.fit_transform(X_test)

In [40]:
df1=pd.DataFrame(X_test_counts_x.todense(),columns=count_vect_x.get_feature_names())
df1

Unnamed: 0,aailiyah,aailiyah pretty,abandoned,abandoned factory,abhor,ability,ability actually,ability dwight,ability meld,ability phone,...,yun fat,za,za im,zero,zero stars,zero taste,zillion,zillion times,zombie,zombie movies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Loading the Machine Learning Model

In [41]:
import pickle
loaded_model = pickle.load(open("Naive_bayes_model.pkl", 'rb'))


In [42]:
predicted = loaded_model.predict(X_test_counts_x)

In [43]:
predicted

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [44]:
df1['predicted']=predicted
df1.head()

Unnamed: 0,aailiyah,aailiyah pretty,abandoned,abandoned factory,abhor,ability,ability actually,ability dwight,ability meld,ability phone,...,za,za im,zero,zero stars,zero taste,zillion,zillion times,zombie,zombie movies,predicted
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
df1.to_excel('prediction_result.xlsx')