In [382]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [383]:
df = pd.read_json('train.json', encoding='utf-8')
df = df.sample(frac=1).reset_index(drop=True)
df = df[['content', 'disease']]
df.head()
df['disease'].value_counts()

none       302
measles     27
dengue      26
typhoid     24
Name: disease, dtype: int64

In [384]:
df.loc[df['disease']=='none', 'disease',]=0
df.loc[df['disease']=='dengue', 'disease',]=1
df.loc[df['disease']=='measles', 'disease',]=2
df.loc[df['disease']=='typhoid', 'disease',]=3
df.head()

Unnamed: 0,content,disease
0,"MANILA, Philippines — Former Igbaras, Iloilo m...",0
1,"MANILA, Philippines—The Philippine Health Insu...",2
2,May pasok na po sa karamihan ng opisina at esk...,0
3,TOKYO — Japan will provide close to P60 billio...,0
4,"MANILA, Philippines – Senator Richard Gordon o...",0


In [385]:
df_x = df['content']
df_y = df['disease']

In [386]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.5, random_state=4)

In [387]:
x_train.head()

232    MANILA, Philippines - The health department on...
297    ILOILO CITY, Philippines - Mayor Jerry Treñas ...
90     MANILA, Philippines — Following a spike of sur...
76     TACLOBAN CITY, Philippines - Dengue has killed...
108    MANILA - The Philippine National Police (PNP) ...
Name: content, dtype: object

In [388]:
cv = CountVectorizer(min_df=1, stop_words='english', vocabulary={
    'dengue':0, 
    'measles':1, 
    'typhoid':2,
})
x_traincv = cv.fit_transform(x_train)
a = x_traincv.toarray()
cv.inverse_transform(a[0])
x_testcv = cv.transform(x_test)

In [389]:
mnb = MultinomialNB()

In [390]:
y_train = y_train.astype('int')
y_train

232    3
297    3
90     0
76     3
108    0
229    0
327    0
162    0
35     0
257    0
178    0
81     0
163    1
57     0
121    0
321    0
26     2
140    3
341    0
104    0
4      0
266    0
77     0
281    0
135    0
62     0
347    0
271    2
278    3
38     0
      ..
49     0
136    3
30     2
377    0
194    0
56     1
201    2
149    3
21     0
183    0
0      0
52     0
126    0
294    0
44     0
164    3
375    0
313    1
311    0
94     0
109    0
359    0
252    0
58     0
306    0
87     0
360    1
197    0
174    0
122    0
Name: disease, Length: 189, dtype: int64

In [391]:
mnb.fit(x_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [392]:
predictions = mnb.predict(x_testcv)

In [393]:
a = np.array(y_test)
a

array([0, 0, 0, 3, 0, 2, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1,
       2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 3, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 3, 0, 0, 0, 3], dtype=object)

In [394]:
count = 0

for i in range(len(predictions)):
    if predictions[i] == a[i]:
        count += 1
    else:
        print("Prediction", predictions[i], "\nActual:", a[i], "\n")
        
count

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 3 



154

In [395]:
len(predictions)

190

In [396]:
count / len(predictions)

0.8105263157894737