### Sample program for Naive Bayes Classifier (small test sample)   
単純ベイズ分類器のサンプルプログラム(小サンプルでの実行例)  

#### Import libraries  

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#### Make DataFrame  

In [2]:
# Input CSV holding the labelled training texts; a bare file name, so it is
# resolved relative to the notebook's working directory.
csv_in = "fruits.csv"

In [3]:
# Load the labelled texts. The first row of the file is treated as a header
# (header=0) and is then replaced with the column names used by later cells.
df = pd.read_csv(csv_in, header=0)
df.columns = ['category', 'text']
print(df.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
display(df.head())

(3, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  3 non-null      object
 1   text      3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes
None


Unnamed: 0,category,text
0,orange,orange orange Oranges. AdstockRF Chocolate bar...
1,grape,Fantasy seedless grapes. Fantasy seedless grap...
2,apple,Apples (Malus). Apples (Malus). Grant Heilman/...


#### Obtain X_train (data) and y_train (true category)  

In [4]:
# Split the frame into the model inputs (raw text) and the target labels.
X_train, y_train = df['text'], df['category']
# Show both Series, one after the other.
print(X_train, y_train, sep='\n')

0    orange orange Oranges. AdstockRF Chocolate bar...
1    Fantasy seedless grapes. Fantasy seedless grap...
2    Apples (Malus). Apples (Malus). Grant Heilman/...
Name: text, dtype: object
0    orange
1     grape
2     apple
Name: category, dtype: object


#### Specify X_test (text to classify)  

In [5]:
# One single-word document to classify. It is wrapped in a list because the
# vectorizer is later given X_test as a collection of documents.
X_test = ["wine"]
print(X_test)

['wine']


#### Collect words  

In [6]:
# Learn the vocabulary from the training texts.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement. It returns the
# vocabulary terms in feature (column) order.
vocab = vectorizer.get_feature_names_out()
print('Vocabulary size:', len(vocab))

Vocabulary size: 1020


#### Make BoW (word frequency vectors)     

In [7]:
# Turn the texts into word-count vectors using the already fitted vocabulary.
# The test text is only transformed, never used for fitting.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Print a label line followed by each matrix's repr.
print('X_train_bow:', repr(X_train_bow), sep='\n')
print('X_test_bow:', repr(X_test_bow), sep='\n')

X_train_bow:
<3x1020 sparse matrix of type '<class 'numpy.int64'>'
	with 1338 stored elements in Compressed Sparse Row format>
X_test_bow:
<1x1020 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>


#### Display BoW  

In [8]:
# Convert the training matrix to a dense array for display: one row per
# training document (labelled by its category), one column per vocabulary word.
Xbow = pd.DataFrame(
    data=X_train_bow.toarray(),
    columns=vocab,
    index=y_train,
)
display(Xbow)

Unnamed: 0_level_0,000,12,15,16,17,17th,18,1800s,18th,1920,...,world,would,year,years,yeasts,yield,young,zdf,zones,épernay
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
orange,0,2,0,1,0,0,0,0,0,1,...,2,0,0,2,0,0,0,0,0,0
grape,1,0,1,0,1,1,0,1,1,0,...,1,0,3,2,1,0,1,1,0,2
apple,1,0,0,0,0,0,1,0,0,0,...,3,1,3,4,0,1,2,2,1,0


#### Training of naive Bayes classifier  

In [9]:
# Multinomial naive Bayes with Laplace (add-one) smoothing.
# fit_prior=False makes the classifier use a uniform class prior for however
# many categories the data contains, instead of hard-coding [1/3, 1/3, 1/3],
# which only works when there are exactly three classes.
model = MultinomialNB(alpha=1.0, fit_prior=False)
model.fit(X_train_bow, y_train)
# classes_ gives the label order used for the columns of predict_proba().
print(model.classes_)
# Accuracy on the training data itself (not a measure of generalisation).
train_score = model.score(X_train_bow, y_train)
print('Train accuracy:', train_score)

['apple' 'grape' 'orange']
Train accuracy: 1.0


#### Prediction using naive Bayes classifier after training  

In [10]:
# Class probabilities for the test document(s).
proba = model.predict_proba(X_test_bow)
print(proba)

# Same numbers as a table, with each column labelled by its class name.
results = pd.DataFrame(data=proba, columns=model.classes_)
print('Prediction:')
display(results)

# Predicted label for each test document.
print(model.predict(X_test_bow))

[[0.120131   0.81399296 0.06587603]]
Prediction:


Unnamed: 0,apple,grape,orange
0,0.120131,0.813993,0.065876


['grape']
