# Naive Bayes project

In [67]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load the Play Store reviews dataset directly from the tutorial repository
# (requires network access).
df_raw = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv',
)

In [3]:
# Overview of the raw frame: column names, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [4]:
# Peek at five randomly drawn reviews.
# NOTE(review): no random_state is passed, so the rows shown change on every run.
df_raw.sample(5)

Unnamed: 0,package_name,review,polarity
878,com.rovio.angrybirds,"good time waster. good fun, but getting old a...",1
859,com.rovio.angrybirds,way to ruin it. this game was good at one poi...,0
646,com.uc.browser.en,great! keep it up! i've been using uc mini fo...,1
251,com.android.chrome,unbelievably bad support by google for their...,0
135,com.king.candycrushsaga,simple fun game this is the type of game that...,1


In [5]:
# Class balance of the target: number of reviews per polarity label.
df_raw['polarity'].value_counts()

0    584
1    307
Name: polarity, dtype: int64

**1. Transform dataframe**

In [6]:
# Work on an independent (deep) copy so the raw frame stays untouched.
df_transf = df_raw.copy(deep=True)

In [7]:
# Remove the package_name column; only the review text and its label are kept.
df_transf = df_transf.drop(columns=['package_name'])

In [8]:

# Remove whitespace at the beginning and at the end of every review.
df_transf = df_transf.assign(review=df_transf['review'].str.strip())

In [9]:
# Normalise the review text to lower case.
df_transf = df_transf.assign(review=lambda frame: frame['review'].str.lower())

In [10]:
# Snapshot of the cleaned frame; the modelling cells below read from `df`.
df = df_transf.copy()

**2. Split dataframe into train and test sets**

In [13]:
# Features are the review texts, the target is the polarity label.
X, y = df['review'], df['polarity']

# Stratify on the label so both splits keep the class proportions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=25,
    stratify=y,
)


In [14]:
# Label counts in the training split (the split above was stratified on y).
y_train.value_counts()

0    438
1    230
Name: polarity, dtype: int64

In [15]:
# Label counts in the test split (the split above was stratified on y).
y_test.value_counts()

0    146
1     77
Name: polarity, dtype: int64

**3. Pipeline with two pre-processing steps and one modeling step**

In [22]:
# Count vectorizer: learns the vocabulary from the training reviews and
# encodes them as a document-term count matrix.
vect = CountVectorizer()
text_vec = vect.fit_transform(raw_documents=X_train)

In [24]:
# Vocabulary terms learned by the count vectorizer from the training reviews.
vect.get_feature_names_out()

array(['000', '10', '100', ..., 'žŕľ', 'ˇŕ', 'ˇŕľ'], dtype=object)

In [43]:
# text_vec.toarray()[0]

In [44]:
# TF-IDF vectorizer fitted on the training reviews only.
vect_tfidf = TfidfVectorizer()
text_vec_tfidf = vect_tfidf.fit_transform(raw_documents=X_train)

In [32]:

#np.set_printoptions(threshold=sys.maxsize)
#text_vec_tfidf.toarray()[0]


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [38]:
# Pre-processing-only pipeline: token counts followed by TF-IDF re-weighting.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
)
text_norm = text_clf.fit_transform(X_train)

In [41]:
# text_norm.toarray()[0]

In [45]:
# Model 1: multinomial Naive Bayes trained on the raw count matrix.
clf_1 = MultinomialNB().fit(text_vec, y_train)
clf_1

In [46]:
# Model 2: multinomial Naive Bayes trained on the TfidfVectorizer matrix.
clf_2 = MultinomialNB().fit(text_vec_tfidf, y_train)
clf_2

In [47]:
# Model 3: multinomial Naive Bayes trained on the output of the
# CountVectorizer + TfidfTransformer pipeline.
clf_3 = MultinomialNB().fit(text_norm, y_train)
clf_3

In [54]:
# Score the held-out reviews with each model, re-using the transformer that
# was fitted on the training split (transform only, never re-fit on test data).
pred_1 = clf_1.predict(vect.transform(X_test))        # raw counts
pred_2 = clf_2.predict(vect_tfidf.transform(X_test))  # TfidfVectorizer
pred_3 = clf_3.predict(text_clf.transform(X_test))    # counts -> TfidfTransformer

# One report per model, in the same order as above.
for test_predictions in (pred_1, pred_2, pred_3):
    print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88       146
           1       0.87      0.61      0.72        77

    accuracy                           0.83       223
   macro avg       0.85      0.78      0.80       223
weighted avg       0.84      0.83      0.83       223

              precision    recall  f1-score   support

           0       0.69      0.99      0.81       146
           1       0.92      0.14      0.25        77

    accuracy                           0.70       223
   macro avg       0.80      0.57      0.53       223
weighted avg       0.77      0.70      0.62       223

              precision    recall  f1-score   support

           0       0.69      0.99      0.81       146
           1       0.92      0.14      0.25        77

    accuracy                           0.70       223
   macro avg       0.80      0.57      0.53       223
weighted avg       0.77      0.70      0.62       223



In [55]:
# Everything together in a single pipeline: counts -> TF-IDF -> Naive Bayes.
text_clf_2 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
text_clf_2.fit(X_train, y_train)

**4. Check results**

In [56]:
# Support-weighted precision, recall and F-score of the full pipeline on the test split.
y_pred = text_clf_2.predict(X_test)
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='weighted')

(0.7664353672100255, 0.6995515695067265, 0.6171890028747934, None)

In [57]:
# Compare train vs. test accuracy of the untuned pipeline to gauge over/under-fitting.
nb_train_accuracy = metrics.accuracy_score(y_train, text_clf_2.predict(X_train))
nb_test_accuracy = metrics.accuracy_score(y_test, text_clf_2.predict(X_test))

print('Naive Bayes Train Accuracy = ', nb_train_accuracy)
print('Naive Bayes Test Accuracy = ', nb_test_accuracy)

Naive Bayes Train Accuracy =  0.7889221556886228
Naive Bayes Test Accuracy =  0.6995515695067265


**5. Randomized search to select hyperparameters**

In [58]:
# Randomized hyper-parameter search over the counts -> TF-IDF -> Naive Bayes pipeline.
# The grid has 2 x 2 x 2 = 8 combinations, of which n_iter_search are sampled.
n_iter_search = 5
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': [True, False],        # with / without IDF re-weighting
    'clf__alpha': [1e-2, 1e-3],             # Naive Bayes smoothing strength
}
# Fix the seed (same value as the train/test split): without random_state the
# sampled candidates, and therefore the selected model, change between runs.
gs_clf = RandomizedSearchCV(
    text_clf_2,
    parameters,
    n_iter=n_iter_search,
    random_state=25,
)
gs_clf.fit(X_train, y_train)

In [59]:
# Best parameter combination found by the randomized search on the TF-IDF pipeline.
gs_clf.best_params_

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01}

In [60]:
# Train vs. test accuracy of the tuned TF-IDF pipeline (gs_clf).
tfidf_search_train_accuracy = metrics.accuracy_score(y_train, gs_clf.predict(X_train))
tfidf_search_test_accuracy = metrics.accuracy_score(y_test, gs_clf.predict(X_test))

print('Naive Bayes Train Accuracy (grid random search) = ', tfidf_search_train_accuracy)
print('Naive Bayes Test Accuracy (grid random search) = ', tfidf_search_test_accuracy)

Naive Bayes Train Accuracy (grid random search) =  0.9985029940119761
Naive Bayes Test Accuracy (grid random search) =  0.8609865470852018


In [61]:
# Alternative pipeline on raw token counts (no TF-IDF step), fitted once with defaults.
text_clf_count_vect = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf_count_vect.fit(X_train, y_train)

# Hyper-parameter search for the count-based pipeline.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'clf__alpha': [1e-2, 1e-3],             # Naive Bayes smoothing strength
}
# This grid only has 2 x 2 = 4 combinations. Requesting more iterations than
# that makes scikit-learn emit a warning and run 4 anyway, so cap n_iter at
# the grid size.
n_iter_search = min(5, int(np.prod([len(values) for values in parameters.values()])))
# random_state keeps the search repeatable (same seed as the train/test split).
gs_count_vect = RandomizedSearchCV(
    text_clf_count_vect,
    parameters,
    n_iter=n_iter_search,
    random_state=25,
)
gs_count_vect.fit(X_train, y_train)



In [62]:
# Best parameter combination found by the search on the count-based pipeline.
gs_count_vect.best_params_

{'vect__ngram_range': (1, 1), 'clf__alpha': 0.01}

In [63]:
# Train vs. test accuracy of the tuned count-based pipeline (gs_count_vect).
count_search_train_accuracy = metrics.accuracy_score(y_train, gs_count_vect.predict(X_train))
count_search_test_accuracy = metrics.accuracy_score(y_test, gs_count_vect.predict(X_test))

print('Naive Bayes Train Accuracy (grid random search) = ', count_search_train_accuracy)
print('Naive Bayes Test Accuracy (grid random search) = ', count_search_test_accuracy)

Naive Bayes Train Accuracy (grid random search) =  0.9910179640718563
Naive Bayes Test Accuracy (grid random search) =  0.8251121076233184


In [64]:
# Per-class report for the tuned TF-IDF pipeline on the test split.
y_pred_mejor = gs_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_mejor))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90       146
           1       0.91      0.66      0.77        77

    accuracy                           0.86       223
   macro avg       0.88      0.81      0.83       223
weighted avg       0.87      0.86      0.85       223



In [66]:
# Keep the winning pipeline from the TF-IDF search; this is the object persisted below.
best_model = gs_clf.best_estimator_
best_model

In [71]:
# Save best model
# A context manager guarantees the file is flushed and closed even if pickling
# fails; the bare open(...) used before left the handle open.
# NOTE(review): assumes the ../models directory already exists.
with open('../models/best_model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [73]:
# Read the persisted model back from disk, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('../models/best_model.pickle', 'rb') as model_file:
    modelo = pickle.load(model_file)

# Use the reloaded model to predict on X_test.
modelo.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0])