In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_json('train.json', encoding='utf-8')
df = df.sample(frac=1).reset_index(drop=True)
df = df[['content', 'disease']]
df.head()

Unnamed: 0,content,disease
0,"STOCKHOLM, Sweden — The World Health Organizat...",measles
1,"MANILA, Philippines — Instead of targeting ima...",none
2,Mayroong legal na proseso bago maipalipat ng k...,none
3,Nagliyab ang isang kotse habang bumibiyahe sa ...,none
4,"MANILA, Philippines — Former Igbaras, Iloilo m...",none


In [3]:
df.loc[df['disease']=='none', 'disease',]=0
df.loc[df['disease']=='dengue', 'disease',]=1
df.loc[df['disease']=='measles', 'disease',]=2
df.head()

Unnamed: 0,content,disease
0,"STOCKHOLM, Sweden — The World Health Organizat...",2
1,"MANILA, Philippines — Instead of targeting ima...",0
2,Mayroong legal na proseso bago maipalipat ng k...,0
3,Nagliyab ang isang kotse habang bumibiyahe sa ...,0
4,"MANILA, Philippines — Former Igbaras, Iloilo m...",0


In [4]:
df_x = df['content']
df_y = df['disease']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [6]:
x_train.head()

84     MANILA, Philippines – As measles outbreaks in ...
293                                                     
68     MANILA, Philippines — The arrest in Quezon Cit...
70     MANILA, Philippines — Personal motives are beh...
218     The Department of Health (DOH) will halt the ...
Name: content, dtype: object

In [7]:
cv = CountVectorizer(min_df=1, stop_words='english', vocabulary={'dengue':0, 'measles':1, 'tigdas':2})

In [8]:
x_traincv = cv.fit_transform(x_train)

In [9]:
a = x_traincv.toarray()

In [10]:
cv.inverse_transform(a[0])

[array(['measles'],
       dtype='<U7')]

In [11]:
x_testcv = cv.transform(x_test)

In [12]:
x_testcv.toarray()

array([[ 0, 17,  0],
       [ 9,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0, 10,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  6,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [25,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0, 12,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  0,  0],
       [ 0,  

In [13]:
mnb = MultinomialNB()

In [14]:
y_train = y_train.astype('int')
y_train

84     2
293    0
68     0
70     0
218    1
39     0
103    1
173    0
172    0
93     0
16     2
92     0
178    0
42     0
117    0
71     0
163    0
69     0
134    0
129    1
239    0
184    0
80     0
229    0
88     0
123    0
289    0
182    0
211    0
208    0
      ..
3      0
299    0
161    0
243    0
168    0
49     0
136    0
30     0
310    0
194    0
56     0
201    0
149    0
21     2
183    0
0      2
52     0
126    0
294    0
44     0
164    0
94     0
109    0
252    0
58     0
306    0
87     0
197    0
174    1
122    0
Name: disease, Length: 249, dtype: int32

In [15]:
mnb.fit(x_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
predictions = mnb.predict(x_testcv)

In [17]:
a = np.array(y_test)
a

array([2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object)

In [18]:
count = 0

for i in range(len(predictions)):
    if predictions[i] == a[i]:
        count += 1
    else:
        print("Prediction", predictions[i], "\nActual:", a[i], "\n")
        
count

63

In [19]:
len(predictions)

63

In [20]:
count / len(predictions)

1.0