# Naive Bayes: Sentiment Analysis

In [120]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [121]:
from collections import Counter

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, f1_score

### Step 1: Loading the dataset

In [123]:
data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


### Step 2: Study of variables and their content

In [124]:
data_c=data.copy()

In [125]:
data_dropped=data_c.drop(['package_name'], axis=1)

#### 1. Removing spaces and converting the text to lowercase:

In [126]:
data_dropped["review"] = data_dropped["review"].str.strip().str.lower()
data_dropped.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


#### 2. Divide the dataset into train and test: 

In [127]:
total_text=data_dropped["review"].str.cat(sep = "")
total_text_list=total_text.split()

In [128]:
counter_words = Counter(total_text_list)

In [129]:
counter_words.most_common(5)

[('the', 1273), ('to', 1159), ('i', 1048), ('and', 853), ('it', 746)]

In [130]:
X = data_dropped["review"]
Y = data_dropped["polarity"]

cv = CountVectorizer(ngram_range=(1, 2))
X_cv = cv.fit_transform(X)

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X_cv, Y, test_size=0.20)

In [132]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

In [133]:
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)
y_pred_gnb = gnb.predict(X_test.toarray())

In [134]:
confusion_matrix(y_test, y_pred_mnb)

array([[99, 17],
       [17, 46]])

In [135]:
f1_score(y_test, y_pred_mnb)

0.7301587301587301

In [136]:
confusion_matrix(y_test, y_pred_gnb)

array([[107,   9],
       [ 32,  31]])

In [137]:
f1_score(y_test, y_pred_gnb)

0.6019417475728155