In [127]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [128]:
# Load the labelled dataset from the JSON file next to the notebook.
df = pd.read_json('train.json', encoding='utf-8')
# Shuffle the rows with a fixed seed: without `random_state` the row order
# (and therefore the later train/test split and every downstream number)
# would change on each Restart & Run All.
df = df.sample(frac=1, random_state=4).reset_index(drop=True)
# Keep only the text column and the label column used below.
df = df[['content', 'disease']]
# Class distribution of the labels (displayed as the cell output).
df['disease'].value_counts()

none       302
dengue      69
typhoid     38
measles     38
Name: disease, dtype: int64

In [129]:
# Replace each disease name in the 'disease' column with its integer class id.
# Rows are rewritten in place, one label at a time, in the listed order.
label_codes = {
    'none': 0,
    'dengue': 1,
    'measles': 2,
    'typhoid': 3,
}
for label_name, label_code in label_codes.items():
    matches_label = df['disease'] == label_name
    df.loc[matches_label, 'disease'] = label_code
df.head()

Unnamed: 0,content,disease
0,MANILA - Sugatan ang driver ng service vehicle...,0
1,"MANILA, Philippines — De La Salle Philippines ...",0
2,Measles (“tigdas”) has killed at least 44 peop...,2
3,"MANILA, Philippines — In the face of intense c...",0
4,"CAMP VICENTE LIM, Laguna , Philippines — What ...",0


In [130]:
# Model input is the 'content' column; the target is the encoded 'disease' column.
df_x, df_y = df['content'], df['disease']

In [131]:
# Hold out half of the rows for testing; the seed keeps the split repeatable
# for a given row order.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.50,
    random_state=4,
)

In [132]:
# Preview the first five training documents (index labels come from the shuffled frame).
x_train.head(n=5)

366    New satellite images show China is still devel...
271    STA. MARIA, Ilocos Sur - Plano ng National His...
185    DAVAO CITY — Health officials in the city on M...
365    MANILA, Philippines — A fire of still unknown ...
132    MANILA - Tropical Depression Ramil has crossed...
Name: content, dtype: object

In [133]:
# Count only the three disease keywords in each document. Because the
# vocabulary is fixed, document-frequency pruning (`min_df`) does not apply,
# so it is not passed.
cv = CountVectorizer(
    stop_words='english',
    vocabulary={
        'dengue': 0,
        'measles': 1,
        'typhoid': 2,
    },
)
# Sparse document-term matrices with one column per keyword above.
x_traincv = cv.fit_transform(x_train)
x_testcv = cv.transform(x_test)
# Sanity check: which keywords occur in the first training document.
# A 2-D sparse slice is passed (instead of densifying the whole matrix and
# handing over a 1-D row), and the call is the last expression so its result
# is actually displayed.
cv.inverse_transform(x_traincv[:1])

In [134]:
# Multinomial naive Bayes classifier with its default settings written out explicitly.
mnb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [135]:
# Cast the training labels to an integer dtype before fitting, then display them.
y_train = y_train.astype(int)
y_train

366    0
271    0
185    3
365    0
132    0
261    0
177    3
298    0
356    1
229    1
346    0
19     0
304    2
117    1
340    0
48     0
406    0
399    0
422    0
147    0
65     3
210    1
417    0
347    0
116    0
325    2
235    0
353    0
295    0
428    0
      ..
194    0
56     0
201    0
149    0
21     2
183    0
0      0
387    0
52     0
126    0
294    2
44     2
164    0
441    1
313    0
311    0
94     0
109    1
359    0
252    0
58     2
393    0
306    0
87     1
360    0
385    0
197    0
439    0
174    3
122    0
Name: disease, Length: 223, dtype: int64

In [136]:
# Train the classifier on the keyword counts and the integer class labels.
mnb.fit(X=x_traincv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [137]:
# Predict a class id for every held-out document.
predictions = mnb.predict(X=x_testcv)

In [138]:
# Positional array of the true test labels; the comparison loop below indexes it by position.
a = np.array(y_test)
# Class distribution of the held-out labels.
pd.Series(a, name='a').value_counts()

0    147
1     36
2     22
3     19
Name: a, dtype: int64

In [126]:
# count[k] = number of test documents of class k that were predicted correctly.
# Every misclassified document is printed as a predicted/actual pair.
count = [0] * 4

for i, predicted in enumerate(predictions):
    actual = a[i]
    if predicted != actual:
        print("Prediction", predicted, "\nActual:", actual, "\n")
    elif predicted in (0, 1, 2, 3):
        # Only the four known class ids are tallied.
        count[int(predicted)] += 1

count

Prediction 1 
Actual: 0 

Prediction 1 
Actual: 0 

Prediction 1 
Actual: 0 

Prediction 3 
Actual: 0 

Prediction 1 
Actual: 0 

Prediction 0 
Actual: 3 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 2 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 

Prediction 1 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 3 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 1 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 

Prediction 1 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 

Prediction 0 
Actual: 1 

Prediction 2 
Actual: 0 



[132, 17, 24, 16]

In [47]:
# Number of predictions produced (one per test document).
int(predictions.shape[0])

191