In [2]:
import numpy as np
import pandas as pd
import sklearn

## Train Data

In [15]:
train_data = pd.read_csv("example_train.csv")
train_data

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Document  5 non-null      object
 1   Class     5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


## convert Class column into numerical

In [50]:
train_data['Class'] = train_data.Class.map({"education": 1, "cinema": 0})
train_data['Class'] = train_data['Class'].astype("int")
train_data

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

## split data to X and y

In [17]:
train_array = train_data.values
train_array

array([['Upgrad is a great educational institution.', 1],
       ['Educational greatness depends on ethics', 1],
       ['A story of great ethics and educational greatness', 1],
       ['Sholey is a great cinema', 0],
       ['good movie depends on good story', 0]], dtype=object)

In [25]:
X_train = train_array[:,0]
X_train

array(['Upgrad is a great educational institution.',
       'Educational greatness depends on ethics',
       'A story of great ethics and educational greatness',
       'Sholey is a great cinema', 'good movie depends on good story'],
      dtype=object)

In [26]:
y_train = train_array[:,1]
y_train = y_train.astype("int")
y_train

array([1, 1, 1, 0, 0])

## Creating bag of words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
vec = CountVectorizer()

In [30]:
vec.fit(X_train)
vec.vocabulary_

{'upgrad': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [33]:
## remove stop words
vec = CountVectorizer(stop_words='english')
vec.fit(X_train)
vec.vocabulary_

{'upgrad': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [35]:
## Getting feature names
print(vec.get_feature_names_out())
print(len(vec.get_feature_names_out()))

['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'upgrad']
12


In [36]:
X_transformed = vec.transform(X_train)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [37]:
## Converting X_transformed to array
X_transformed.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [38]:
## Converting matrix to dataframe
pd.DataFrame(X_transformed.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


## Test Data


In [60]:
test_data = pd.read_csv("example_test.csv");
test_data

Unnamed: 0,Document,Class
0,very good educational institution,education


In [58]:
## Convert Class level to numeric variable - assigning 0 and 1
test_data['Class'] = test_data['Class'].map({"education": 1, "cinema": 0}).fillna(-1, downcast='infer')
test_data['Class'] = test_data['Class'].map({"education": 1, "cinema": 0}).fillna(-1, downcast='infer')
test_data

got it


In [63]:
### convert to np array
test_data_array = test_data.values
test_data_array

array([['very good educational institution', 1]], dtype=object)

In [64]:
## split X and y
X_test = test_data_array[:,0]
y_test = test_data_array[:,1]
print(X_test)
print(y_test)

['very good educational institution']
[1]


In [65]:
## transform test data

X_test_transformed = vec.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [66]:
## convert to non-sparse array
X_test = X_test_transformed.toarray()
X_test

array([[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

## Build Naive Bayes Model

In [70]:
from sklearn.naive_bayes import MultinomialNB

In [72]:
## instance of naive bayes
mnb = MultinomialNB()

## fitting model
mnb.fit(X_transformed, y_train)

## predicting
mnb.predict_proba(X_test)

array([[0.32808399, 0.67191601]])

## Building Bernoulli Naive Bayes

In [73]:
from sklearn.naive_bayes import BernoulliNB


In [74]:
bnb = BernoulliNB()

In [75]:
bnb.fit(X_transformed, y_train)

In [76]:
prob_bnb = bnb.predict_proba(X_test)
prob_bnb

array([[0.2326374, 0.7673626]])