# Naive Bayes

## Importing Dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

## Reading dataset

In [3]:
df = pd.read_csv('nb_dataset.csv', names = ['msg', 'class'])
df['class'] = df['class'].replace({'pos': 1 , 'neg': 0})
df

Unnamed: 0,msg,class
0,I love this sandwich,1
1,This is an amazing place,1
2,I feel very good about these beers,1
3,This is my best work,1
4,What an awesome view,1
5,I do not like this restaurant,0
6,I am tired of this stuff,0
7,I can't deal with this,0
8,He is my sworn enemy,0
9,My boss is horrible,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   msg     18 non-null     object
 1   class   18 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 416.0+ bytes


## Pre-processing

In [5]:
# lowercase the sentences

df['msg'] = df['msg'].str.lower()
df

Unnamed: 0,msg,class
0,i love this sandwich,1
1,this is an amazing place,1
2,i feel very good about these beers,1
3,this is my best work,1
4,what an awesome view,1
5,i do not like this restaurant,0
6,i am tired of this stuff,0
7,i can't deal with this,0
8,he is my sworn enemy,0
9,my boss is horrible,0


In [6]:
# remove punctuations

df['msg'] = df['msg'].str.replace(r'[^\w\d\s]',' ')
df

Unnamed: 0,msg,class
0,i love this sandwich,1
1,this is an amazing place,1
2,i feel very good about these beers,1
3,this is my best work,1
4,what an awesome view,1
5,i do not like this restaurant,0
6,i am tired of this stuff,0
7,i can t deal with this,0
8,he is my sworn enemy,0
9,my boss is horrible,0


In [7]:
df['msg'] = df['msg'].apply(lambda x: " ".join(term for term in x.split() if term not in stops))
df

Unnamed: 0,msg,class
0,love sandwich,1
1,amazing place,1
2,feel good beers,1
3,best work,1
4,awesome view,1
5,like restaurant,0
6,tired stuff,0
7,deal,0
8,sworn enemy,0
9,boss horrible,0


In [8]:
# Initializing Vectorizer

vectorize = TfidfVectorizer()

## Spliting the dataset

In [9]:
x = vectorize.fit_transform(df['msg']) # fitting inputs 
y = df['class']

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 10)

In [11]:
xtrain.shape

(13, 34)

In [12]:
xtest.shape

(5, 34)

## Model fitting and Evaluation

In [13]:
# Initialization

model = MultinomialNB()

In [14]:
# fitting the model

model.fit(xtrain, ytrain)

In [15]:
y_pred = model.predict(xtest)
y_pred

array([1, 1, 1, 0, 0], dtype=int64)

In [16]:
model.score(xtest, y_pred)

1.0