# Naive Bayes Classification

In [60]:
import os
import requests
import zipfile
from io import BytesIO
import pandas as pd

# Specify directory, if it doesn't exist, make it
data_dir = f'{os.getcwd()}/data'

if not os.path.exists(data_dir):
    os.mkdir(data_dir)

#Url to data zip file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'

response = requests.get(url)


In [61]:
#Check if successful
response

<Response [200]>

In [62]:
with zipfile.ZipFile(file=BytesIO(response.content),mode = 'r') as compressed_file:
    compressed_file.extractall(data_dir)

In [63]:
df_list = []

for csv_file in ['imdb_labelled.txt', 'yelp_labelled.txt', 'amazon_cells_labelled.txt']:
    csv_file_with_path = f'{data_dir}/sentiment labelled sentences/{csv_file}'

    temp_df = pd.read_csv(csv_file_with_path,sep='\t', header=0, names = ['text','sentiment'])

    df_list.append(temp_df)

df = pd.concat(df_list)

In [64]:
df

Unnamed: 0,text,sentiment
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0
...,...,...
994,The screen does get smudged easily because it ...,0
995,What a piece of junk.. I lose more calls on th...,0
996,Item Does Not Match Picture.,0
997,The only thing that disappoint me is the infra...,0


In [65]:
df.sentiment.value_counts().to_frame()

Unnamed: 0,sentiment
1,1385
0,1360


## We have read in the data and found that the sentiments are, more or less, split evenly. Now we can prepare the data for training and classifying usine a naive Bayes classifier

We split our data as usual. Our target variable is sentiment

In [66]:
from sklearn.model_selection import train_test_split
df_train,df_test = train_test_split(df, test_size = .4, random_state=42)

#Target Variable
y_train= df_train['sentiment']
y_test = df_test['sentiment']

We vectorize the predictor texts, getting rid of words that appear less than 3 times (min_df=3), and strip accents and turn them into ascii characters

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='ascii') 

x_train = vec.fit_transform(df_train['text']) 
x_test = vec.transform(df_test['text'])

Train and fit the model, and use a prediction

In [68]:
from sklearn.naive_bayes import MultinomialNB

clf5=MultinomialNB(fit_prior=True)
clf5.fit(x_train,y_train)
y_test_pred = clf5.predict(x_test)

Testing for accuracy

In [69]:
from sklearn.metrics import precision_recall_fscore_support
p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)

In [74]:
pd.DataFrame([p, r, f, s]).T.round(2).rename({0:'Precision',
                                              1:'Recall',
                                              2:"F",
                                              3:"Support"},axis=1)

Unnamed: 0,Precision,Recall,F,Support
0,0.81,0.78,0.79,565.0
1,0.77,0.8,0.79,533.0


Future work would be incorporating text classification into a credit card merchant algorithm for CC data