In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score

In [4]:
df = pd.read_json('train.json', encoding='utf-8')
df = df.sample(frac=1).reset_index(drop=True)
df = df[['content', 'disease']]
df.head()

Unnamed: 0,content,disease
0,"MANILA, Philippines — Actor Cogie Domingo was ...",none
1,"MANILA, Philippines — The Commission on Electi...",none
2,Health officials are investigating an outbreak...,typhoid
3,"MANILA, Philippines – The Department of Health...",none
4,Arestado ang isang criminology student at isan...,none


In [5]:
df['disease'].value_counts()

none       302
dengue      29
measles     27
typhoid     24
Name: disease, dtype: int64

In [6]:
df.loc[df['disease']=='none', 'disease',]=0
df.loc[df['disease']=='dengue', 'disease',]=1
df.loc[df['disease']=='measles', 'disease',]=2
df.loc[df['disease']=='typhoid', 'disease',]=3
df.head()

Unnamed: 0,content,disease
0,"MANILA, Philippines — Actor Cogie Domingo was ...",0
1,"MANILA, Philippines — The Commission on Electi...",0
2,Health officials are investigating an outbreak...,3
3,"MANILA, Philippines – The Department of Health...",0
4,Arestado ang isang criminology student at isan...,0


In [7]:
df_x = df['content']
df_y = df['disease']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3, random_state=4, stratify=df_y)

In [9]:
x_train.head()

31     Measles could lead to blindness among children...
103    MANILA, Philippines – Can the Philippines elim...
26     MANILA, Philippines – Two weeks after the Depa...
317    Maulan buong araw ng Miyerkoles, Nobyembre 1, ...
192    MANILA – Acting Presidential Spokesperson Harr...
Name: content, dtype: object

In [10]:
cv = CountVectorizer(min_df=1, stop_words='english')
x_traincv = cv.fit_transform(x_train)
a = x_traincv.toarray()
cv.inverse_transform(a[0])
x_testcv = cv.transform(x_test)

In [11]:
mnb = MultinomialNB()

In [12]:
y_train = y_train.astype('int')
len(y_train)

267

In [13]:
mnb.fit(x_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
predictions = mnb.predict(x_testcv)
predictions

array([0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 2, 0, 0, 2, 0, 1, 0, 0, 2, 2,
       0, 0, 0, 1, 0, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 2, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 0])

In [15]:
actual = np.array(y_test)
actual

array([0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 1, 0, 1, 2, 0,
       0, 0, 0, 1, 0, 3, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 0], dtype=object)

In [16]:
accuracy = mnb.score(x_testcv, y_test.tolist())
accuracy

0.9304347826086956

In [17]:
confusion_matrix(actual.tolist(), predictions.tolist(), labels=[0, 1, 2, 3])

array([[87,  1,  3,  0],
       [ 4,  5,  0,  0],
       [ 0,  0,  8,  0],
       [ 0,  0,  0,  7]])

In [18]:
cv_results = cross_validate(mnb, x_testcv, y_test.tolist(), return_train_score=False)
cv_results['test_score']

array([0.775     , 0.76315789, 0.81081081])

In [19]:
f_score = f1_score(actual.tolist(), predictions.tolist(), labels=[0, 1, 2, 3], average=None)
f_score

array([0.95604396, 0.66666667, 0.84210526, 1.        ])

In [20]:
joblib.dump(mnb, 'mnb.pkl')
joblib.dump(cv, 'cv.pkl')

['cv.pkl']

## TF-IDF

In [21]:
tf = TfidfVectorizer(min_df=1, stop_words='english')
x_traintf = tf.fit_transform(x_train)
a2 = x_traintf.toarray()
tf.inverse_transform(a2[0])
x_testtf = tf.transform(x_test)

In [22]:
mnb2 = MultinomialNB()

In [23]:
mnb2.fit(x_traintf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
predictions2 = mnb2.predict(x_testtf)
predictions2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

In [25]:
actual2 = np.array(y_test)
actual2

array([0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 1, 0, 1, 2, 0,
       0, 0, 0, 1, 0, 3, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 0], dtype=object)

In [26]:
accuracy2 = mnb2.score(x_testtf, y_test.tolist())
accuracy2

0.7913043478260869

In [27]:
confusion_matrix(actual2.tolist(), predictions2.tolist(), labels=[0, 1, 2, 3])

array([[91,  0,  0,  0],
       [ 9,  0,  0,  0],
       [ 8,  0,  0,  0],
       [ 7,  0,  0,  0]])

In [28]:
tf_results = cross_validate(mnb2, x_testtf, y_test.tolist(), return_train_score=False)
tf_results['test_score']

array([0.775     , 0.78947368, 0.81081081])

In [29]:
f_score2 = f1_score(actual2.tolist(), predictions2.tolist(), labels=[0, 1, 2, 3], average=None)
f_score2

  'precision', 'predicted', average, warn_for)


array([0.88349515, 0.        , 0.        , 0.        ])