In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer
from pickle import dump

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [2]:
# Source of the Play Store reviews CSV used throughout this notebook
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [4]:
# Normalise the review text: trim surrounding whitespace, then lower-case it
cleaned_reviews = df["review"].str.strip().str.lower()
df["review"] = cleaned_reviews

In [5]:
# Drop the app identifier: only `review` and `polarity` are used further down.
# `errors="ignore"` keeps this cell idempotent - re-running it once the column
# is already gone no longer raises KeyError.
df = df.drop(columns="package_name", errors="ignore")

In [6]:
# Preview the first five rows after cleaning and dropping `package_name`
df.head(n=5)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [7]:
# Target is the `polarity` column; the only predictor is the review text
y = df["polarity"]
X = df["review"]

# 80% of the rows go to training, the remainder to testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=42,
)

In [8]:
# Peek at the first five training reviews (still raw strings at this point)
X_train.head(n=5)

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [9]:
# Bag-of-words encoding with English stop words removed.
# The vectorizer is fitted on the training reviews only and then reused to
# encode the test reviews; both results are converted to dense arrays.
vec_model = CountVectorizer(stop_words="english")

train_counts = vec_model.fit_transform(X_train)
X_train = train_counts.toarray()

test_counts = vec_model.transform(X_test)
X_test = test_counts.toarray()

In [10]:
# Dimensions (rows, columns) of the dense training matrix produced by the vectorizer
X_train.shape

(712, 3310)

In [11]:
# Baseline classifier: BernoulliNB with its default hyperparameters,
# trained on the vectorised training reviews.
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

In [12]:
# Predictions of the baseline BernoulliNB on the held-out reviews
y_pred = model.predict(X=X_test)

In [13]:
# Accuracy of the baseline BernoulliNB on the held-out split
bnb_test_accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy score of the test set is: {bnb_test_accuracy:.3f}")

The accuracy score of the test set is: 0.771


In [14]:
# F1 score of the baseline BernoulliNB on the held-out split
bnb_test_f1 = f1_score(y_test, y_pred)
print(f"The testing f1 score for the test set is: {bnb_test_f1:.3f}")

The testing f1 score for the test set is: 0.506


In [15]:
# Candidate smoothing strengths for the BernoulliNB grid search
parameters = dict(alpha=[0.01, 0.1, 1, 10])

In [16]:
# 5-fold cross-validated search over the `parameters` grid for BernoulliNB,
# run on the training data only.
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and the parameters that produced it
best_BNB = grid_search.best_estimator_
best_params = grid_search.best_params_

In [17]:
# Display the BernoulliNB estimator selected by the grid search
best_BNB

In [18]:
# Predictions of the grid-searched BernoulliNB on the held-out reviews
# (this rebinds `y_pred`, replacing the baseline model's predictions)
y_pred = best_BNB.predict(X=X_test)

In [19]:
print(f"The accuracy score of the train set is: {accuracy_score(y_test, y_pred):.3f}")

The accuracy score of the train set is: 0.838


In [20]:
# F1 score of the grid-searched BernoulliNB on the held-out split
best_bnb_test_f1 = f1_score(y_test, y_pred)
print(f"The testing f1 score for the testing set is: {best_bnb_test_f1:.3f}")

The testing f1 score for the testing set is: 0.713


In [21]:
#Testing GaussianNB

GaussianNB = GaussianNB()
GaussianNB.fit(X_train, y_train)

In [22]:
y_pred_GaussianNB = GaussianNB.predict(X_test)

In [23]:
print(f"The accuracy score of the train set is: {round(accuracy_score(y_test, y_pred_GaussianNB),3)}%")

The accuracy score of the train set is: 0.804%


In [24]:
print(f"The training f1 score for the test set is: {f1_score(y_test, y_pred_GaussianNB):.3f}")

The training f1 score for the test set is: 0.653


In [25]:
# Third candidate: MultinomialNB with default hyperparameters,
# trained on the same vectorised training reviews.
MNB = MultinomialNB()
MNB.fit(X=X_train, y=y_train)

In [26]:
# Predictions of the default MultinomialNB on the held-out reviews
# (rebinds `y_pred` again)
y_pred = MNB.predict(X=X_test)

In [27]:
print(f"The accuracy score of the train set is: {round(accuracy_score(y_test, y_pred),3)}")

The accuracy score of the train set is: 0.816


In [28]:
# F1 score of the default MultinomialNB on the held-out split
mnb_test_f1 = f1_score(y_test, y_pred)
print(f"The testing f1 score for the testing set is: {mnb_test_f1:.3f}")

The testing f1 score for the testing set is: 0.660


In [29]:
# Search space for MultinomialNB: smoothing strength and whether to learn class priors
params = dict(
    alpha=[0.01, 0.1, 1, 10, 100, 200, 500, 1000],
    fit_prior=[True, False],
)

In [30]:
# Scorer object wrapping accuracy_score, passed to the MultinomialNB grid search
scorer = make_scorer(score_func=accuracy_score)

In [31]:
# Grid search for MultinomialNB, the strongest of the three default-parameter
# models on the test split: 5-fold CV over `params`, scored with `scorer`.
grid_search = GridSearchCV(estimator=MNB, param_grid=params, scoring=scorer, cv=5)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and the parameters that produced it
bst_MNB = grid_search.best_estimator_
best_parameters = grid_search.best_params_

In [32]:
# Display the MultinomialNB estimator selected by the grid search
bst_MNB

In [33]:
# Predictions of the grid-searched MultinomialNB on the held-out reviews
y_pred = bst_MNB.predict(X=X_test)

In [34]:
# Accuracy of the grid-searched MultinomialNB on the held-out split, rounded to 3 decimals
bst_mnb_test_accuracy = round(accuracy_score(y_test, y_pred), 3)
print(f"The accuracy score of the test set is: {bst_mnb_test_accuracy}")

The accuracy score of the test set is: 0.821


In [35]:
# F1 score of the grid-searched MultinomialNB on the held-out split
bst_mnb_test_f1 = f1_score(y_test, y_pred)
print(f"The testing f1 score of the test set is: {bst_mnb_test_f1:.3f}")

The testing f1 score of the test set is: 0.673


In [36]:
# Persist the grid-searched MultinomialNB with pickle. The context manager
# closes the file handle as soon as the dump finishes (or fails); the bare
# open() call used before never closed it explicitly.
# NOTE(review): the file name appears to encode alpha=1.9176382, which is not
# one of the alpha values in the `params` grid - confirm the name is intended.
# NOTE(review): the fitted CountVectorizer (`vec_model`) is not saved here, so
# this file alone cannot score raw review text.
with open("naive_bayes_alpha_1-9176382_fit_prior_False_42.sav", "wb") as model_file:
    dump(bst_MNB, model_file)