# Naive Bayes: Sentiment Analysis

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

### Step 1: Loading the dataset

In [4]:
data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


### Step 2: Study of variables and their content

In [5]:
data_c=data.drop(['package_name'], axis=1)

#### 1. Removing spaces and converting the text to lowercase:

In [6]:
data_c["review"] = data_c["review"].str.strip().str.lower()
data_c.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [7]:
data_c.shape

(891, 2)

#### 2. Divide the dataset into train and test: 

In [28]:
X=data_c["review"]
Y=data_c["polarity"]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

#### 3. Transform the text into a word count matrix

In [30]:
cv=CountVectorizer(stop_words = "english")
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

### Step 3: Build a naive bayes model


#### 1. MultinomilNB

In [31]:
model_mnb = MultinomialNB()
model_mnb.fit(X_train, y_train)
y_pred_mnb = model_mnb.predict(X_test)
accuracy_score(y_test, y_pred_mnb)

0.8268156424581006

#### 2. GaussianNB

In [32]:
model_gnb = GaussianNB()
model_gnb.fit(X_train, y_train)
y_pred_gnb = model_gnb.predict(X_test)
accuracy_score(y_test, y_pred_gnb)

0.6815642458100558

#### 3. BernoulliNB

In [33]:
model_bnb = BernoulliNB()
model_bnb.fit(X_train, y_train)
y_pred_bnb = model_bnb.predict(X_test)
accuracy_score(y_test, y_pred_bnb)

0.7597765363128491

In [14]:
print("We can confirm that the best model for this kind of problem is MultinomialNB")

We can confirm that the best model for this kind of problem is MultinomialNB


### Step 4: Optimize the previous model

In [34]:
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False]
}

# We initialize the random search
random_search = RandomizedSearchCV(model_mnb, hyperparams, n_iter = 50, scoring = "accuracy", cv = 5, random_state = 42)
random_search

In [35]:
random_search.fit(X_train, y_train)

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': np.float64(2.821256281407035)}


After identifying the best hyperparameters, we re-trained the model.

In [36]:
model = MultinomialNB(alpha = 1.917638190954774, fit_prior = False)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8324022346368715

We have improved the model!

### Step 5: Save the model

In [18]:
from pickle import dump

dump(model, open("model_naive_bayes_alpha_1-9176_fit_prior_False.sav", "wb"))

### Step 6: Explore other alternatives

In [38]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)

In [39]:
predictions = rf.predict(X_test)

In [40]:
accuracy_score(y_test, predictions)

0.8547486033519553

In [22]:
print("In this case the random forest was a little bit better than the optimized ")

In this case the random forest was a little bit better than the optimized 
