In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, KFold
import numpy as np
from scipy.stats import sem
import string
import warnings
import pickle
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Load the article data set from the JSON file in the working directory.
df = pd.read_json('articles.json')

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,category,text,title
0,Tekno,"Liputan6.com, Jakarta - Pertumbuhan startup te...",Kiat Sukses Berbisnis Teknologi
1,News,By Eri Komar Sinaga Mantan Kepala Biro Adminis...,KPK Periksa Politikus Demokrat Terkait Korupsi...
2,News,JAKARTA - Komisi Pemilihan Umum (KPU) DKI Jaka...,"Pendaftaran Ditutup, KPU Pastikan Pilgub DKI D..."
3,Tekno,ArenaLTE.com - Perkuat industri gaming yang se...,"Perkuat Industri Gaming, AMD Akuisisi Pengemba..."
4,News,"VIVA.co.id - Hingga Senin malam, 26 September ...","Bertambah, Korban Tewas Banjir Garut Jadi 34 O..."


In [3]:
# Count missing values per column.
df.isnull().sum(axis=0)

category    0
text        0
title       0
dtype: int64

In [3]:
# Hold out 20% of the articles for testing. The split is seeded so that the
# partition, and every score computed from it, is reproducible across kernel
# restarts; 0 matches the random_state given to KFold in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    test_size=0.2,
    random_state=0,
)

In [53]:
# Pipeline 1: raw token counts fed to multinomial Naive Bayes (alpha=0.01).
clf_1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

In [6]:
# Pipeline 2: hashed token features fed to multinomial Naive Bayes.
# MultinomialNB requires non-negative feature values. The former
# `non_negative=True` flag was deprecated in scikit-learn 0.19 and removed in
# 0.21 (passing it raises TypeError there); `alternate_sign=False` is the
# supported way to keep the hashed features non-negative.
clf_2 = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=False)),
    ('clf', MultinomialNB()),
])

In [7]:
# Pipeline 3: TF-IDF weighted features fed to multinomial Naive Bayes.
clf_3 = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [8]:
def evaluate_cross_validation(clf,X,y,K):
    """Cross-validate ``clf`` on shuffled K folds and print the scores.

    Parameters
    ----------
    clf : estimator handed to ``cross_val_score``.
    X, y : features and targets handed to ``cross_val_score``.
    K : number of folds passed to ``KFold``.

    Prints the per-fold scores, then their mean and standard error of the
    mean. Nothing is returned.
    """
    # Fixed random_state keeps the shuffled folds identical between runs.
    folds = KFold(K, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, X, y, cv=folds)
    print(fold_scores)

    average = np.mean(fold_scores)
    std_error = sem(fold_scores)
    print("Mean score:  {0:.3f} (+/-{1:.3f})".format(average, std_error))

In [38]:
# 5-fold cross-validation of the three baseline pipelines on article titles.
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, X=df['title'], y=df['category'], K=5)

[0.99    0.99375 0.98625 0.9925  0.99   ]
Mean score:  0.991 (+/-0.001)
[0.98875 0.99125 0.9875  0.9925  0.99   ]
Mean score:  0.990 (+/-0.001)
[0.9875  0.98875 0.98125 0.995   0.98875]
Mean score:  0.988 (+/-0.002)


In [39]:
# 5-fold cross-validation of the three baseline pipelines on article bodies.
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, X=df['text'], y=df['category'], K=5)

[0.98625 0.99125 0.98875 0.99    0.995  ]
Mean score:  0.990 (+/-0.001)
[0.9875  0.985   0.98625 0.98625 0.985  ]
Mean score:  0.986 (+/-0.000)
[0.99    0.98875 0.985   0.985   0.99   ]
Mean score:  0.988 (+/-0.001)


In [4]:

# Read the stop-word file: every line of the file becomes one list entry.
with open('stopwords_id.txt','r') as text:
    data = text.read().splitlines()
    # Independent copy so the stop-word list does not alias `data`.
    stopwords_id = list(data)

In [5]:
def text_process(mess):
    """Tokenise ``mess`` after dropping punctuation and stop words.

    Every character found in ``string.punctuation`` is removed, the result is
    split on whitespace, and tokens whose lower-cased form appears in
    ``stopwords_id`` are discarded. Surviving tokens keep their original case.

    Returns a list of token strings.
    """
    # Rebuild the text without punctuation characters.
    cleaned = ''.join(ch for ch in mess if ch not in string.punctuation)

    kept = []
    for token in cleaned.split():
        # Stop-word comparison is case-insensitive; the token itself is not altered.
        if token.lower() in stopwords_id:
            continue
        kept.append(token)
    return kept

In [18]:
# Tokenise every article body with text_process (one token list per row).
df_text_process = df.loc[:, 'text'].apply(text_process)

In [29]:
# Preview the first five tokenised articles.
df_text_process.head(n=5)

0    [Liputan6com, Jakarta, Pertumbuhan, startup, t...
1    [By, Eri, Komar, Sinaga, Mantan, Kepala, Biro,...
2    [JAKARTA, Komisi, Pemilihan, KPU, DKI, Jakarta...
3    [ArenaLTEcom, Perkuat, industri, gaming, fokus...
4    [VIVAcoid, Senin, malam, 26, September, 2016, ...
Name: text, dtype: object

In [17]:
# Pipeline 4: raw token counts fed to multinomial Naive Bayes (default alpha).
# NOTE(review): unlike clf_5 and clf_6 this vectorizer is not given
# analyzer=text_process - confirm that is intentional.
clf_4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [7]:
# Pipeline 5: hashed features built from text_process tokens, fed to
# multinomial Naive Bayes.
# MultinomialNB requires non-negative feature values. The former
# `non_negative=True` flag was deprecated in scikit-learn 0.19 and removed in
# 0.21 (passing it raises TypeError there); `alternate_sign=False` is the
# supported way to keep the hashed features non-negative.
clf_5 = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=False, analyzer=text_process)),
    ('clf', MultinomialNB()),
])

In [8]:
# Pipeline 6: TF-IDF features built from text_process tokens, fed to
# multinomial Naive Bayes.
clf_6 = Pipeline(steps=[
    ('vect', TfidfVectorizer(analyzer=text_process)),
    ('clf', MultinomialNB()),
])

In [56]:
# 5-fold cross-validation of pipelines 4-6 on article bodies.
clfs2 = [clf_4, clf_5, clf_6]
for clf in clfs2:
    evaluate_cross_validation(clf, X=df['text'], y=df['category'], K=5)

[0.98875 0.99    0.99125 0.99    0.995  ]
Mean score:  0.991 (+/-0.001)
[0.99125 0.99    0.98875 0.98625 0.99375]
Mean score:  0.990 (+/-0.001)
[0.98875 0.99    0.98875 0.98875 0.99625]
Mean score:  0.991 (+/-0.001)


In [57]:
# 5-fold cross-validation of pipelines 4-6 on article titles.
clfs2 = [clf_4, clf_5, clf_6]
for clf in clfs2:
    evaluate_cross_validation(clf, X=df['title'], y=df['category'], K=5)

[0.98375 0.9875  0.98625 0.98    0.99125]
Mean score:  0.986 (+/-0.002)
[0.98625 0.99125 0.99125 0.98875 0.99   ]
Mean score:  0.989 (+/-0.001)
[0.975   0.98125 0.98125 0.97125 0.98375]
Mean score:  0.979 (+/-0.002)


In [46]:
def train_and_evaluate(clf,X_train,X_test,y_train,y_test):
    """Fit ``clf`` on the training split and print an evaluation summary.

    Parameters
    ----------
    clf : object exposing ``fit``, ``score`` and ``predict``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data used for the report.

    Prints, in order: the score on the training split, the score on the
    testing split, the classification report and the confusion matrix for
    the test-split predictions. ``clf`` is fitted in place; nothing is
    returned.
    """
    clf.fit(X_train,y_train)
    print("Accuracy on training set:")
    print(clf.score(X_train,y_train))
    print('Accuracy on testing set:')
    print(clf.score(X_test,y_test))
    y_pred = clf.predict(X_test)

    print("Classification Report")
    # classification_report is imported by name at the top of the notebook;
    # there is no `metrics` module alias, so it must be called directly.
    print(classification_report(y_test,y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test,y_pred))

In [58]:
# Fit pipeline 1 on the training split and report on the held-out split.
train_and_evaluate(
    clf_1,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy on training set:
0.99875
Accuracy on testing set:
0.99125
Classification Report
             precision    recall  f1-score   support

     Bisnis       1.00      0.98      0.99       213
     Celebs       1.00      0.99      1.00       183
       News       0.98      1.00      0.99       206
      Tekno       0.98      0.99      0.99       198

avg / total       0.99      0.99      0.99       800

Confusion Matrix:
[[208   0   3   2]
 [  0 182   0   1]
 [  0   0 206   0]
 [  0   0   1 197]]


In [59]:
# Fit pipeline 4 on the training split and report on the held-out split.
train_and_evaluate(
    clf_4,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy on training set:
0.999375
Accuracy on testing set:
0.99375
Classification Report
             precision    recall  f1-score   support

     Bisnis       1.00      0.98      0.99       213
     Celebs       1.00      0.99      1.00       183
       News       0.99      1.00      1.00       206
      Tekno       0.99      1.00      0.99       198

avg / total       0.99      0.99      0.99       800

Confusion Matrix:
[[209   0   2   2]
 [  0 182   0   1]
 [  0   0 206   0]
 [  0   0   0 198]]


In [60]:
# Fit pipeline 5 on the training split and report on the held-out split.
train_and_evaluate(
    clf_5,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy on training set:
0.994375
Accuracy on testing set:
0.9925
Classification Report
             precision    recall  f1-score   support

     Bisnis       1.00      0.99      0.99       213
     Celebs       0.99      0.99      0.99       183
       News       0.98      1.00      0.99       206
      Tekno       0.99      1.00      1.00       198

avg / total       0.99      0.99      0.99       800

Confusion Matrix:
[[210   0   2   1]
 [  0 181   2   0]
 [  0   1 205   0]
 [  0   0   0 198]]


In [66]:
# Fit pipeline 6 on the training split and report on the held-out split.
train_and_evaluate(
    clf_6,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy on training set:
0.9971875
Accuracy on testing set:
0.99625
Classification Report
             precision    recall  f1-score   support

     Bisnis       1.00      0.99      0.99       213
     Celebs       1.00      1.00      1.00       183
       News       0.99      1.00      1.00       206
      Tekno       0.99      1.00      1.00       198

avg / total       1.00      1.00      1.00       800

Confusion Matrix:
[[210   0   2   1]
 [  0 183   0   0]
 [  0   0 206   0]
 [  0   0   0 198]]


In [18]:
# Fit pipeline 4 on the training split; `model` holds whatever fit() returns
# and is the object persisted below.
model = clf_4.fit(X_train, y=y_train)

In [26]:
import json

In [19]:
# Serialise the fitted model to model.pickle (binary mode is required by pickle).
with open('model.pickle', 'wb') as fh:
    pickle.dump(model, file=fh)

In [27]:
# Record the name(s) of the input column(s) as a JSON list.
with open('columns.json', 'w') as fh:
    json.dump(list(df[['text']].columns), fh)

In [28]:
# Persist the dtype information of the training features.
# NOTE(review): X_train comes from splitting df['text'], so this looks like a
# Series and `.dtypes` is presumably a single dtype rather than a per-column
# table - confirm this is what the consumer of dtypes.pickle expects.
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train.dtypes, file=fh)