In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [48]:
!kaggle datasets download -d uciml/news-aggregator-dataset

Downloading news-aggregator-dataset.zip to /content
 32% 9.00M/28.0M [00:00<00:00, 30.8MB/s]
100% 28.0M/28.0M [00:00<00:00, 70.7MB/s]


In [50]:
import zipfile
zip_ref = zipfile.ZipFile('news-aggregator-dataset.zip', 'r')
zip_ref.extractall('TMP_3')
zip_ref.close()

In [1]:
# Dataset : https://www.kaggle.com/uciml/news-aggregator-dataset
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

In [2]:
# Just use 2 column, the object, and the classes
df = pd.read_csv('/content/TMP_3/uci-news-aggregator.csv', usecols=['TITLE', 'CATEGORY'])

In [3]:
# Left side to total of classes, right side to all data that included into the classes before
df.CATEGORY.value_counts()

e    152469
b    115967
t    108344
m     45639
Name: CATEGORY, dtype: int64

In [4]:
df.head(10)

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b
5,Plosser: Fed May Have to Accelerate Tapering Pace,b
6,Fed's Plosser: Taper pace may be too slow,b
7,Fed's Plosser expects US unemployment to fall ...,b
8,US jobs growth last month hit by weather:Fed P...,b
9,ECB unlikely to end sterilisation of SMP purch...,b


In [5]:
category = pd.get_dummies(df.CATEGORY) # Create the new system!
category

Unnamed: 0,b,e,m,t
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
422414,0,0,1,0
422415,0,0,1,0
422416,0,0,1,0
422417,0,0,1,0


In [6]:
df_baru = pd.concat([df, category], axis=1)
df_baru

Unnamed: 0,TITLE,CATEGORY,b,e,m,t
0,"Fed official says weak data caused by weather,...",b,1,0,0,0
1,Fed's Charles Plosser sees high bar for change...,b,1,0,0,0
2,US open: Stocks fall after Fed official hints ...,b,1,0,0,0
3,"Fed risks falling 'behind the curve', Charles ...",b,1,0,0,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,1,0,0,0
...,...,...,...,...,...,...
422414,Surgeons to remove 4-year-old's rib to rebuild...,m,0,0,1,0
422415,Boy to have surgery on esophagus after battery...,m,0,0,1,0
422416,Child who swallowed battery to have reconstruc...,m,0,0,1,0
422417,Phoenix boy undergoes surgery to repair throat...,m,0,0,1,0


In [7]:
df_baru = df_baru.drop(columns='CATEGORY')
df_baru

Unnamed: 0,TITLE,b,e,m,t
0,"Fed official says weak data caused by weather,...",1,0,0,0
1,Fed's Charles Plosser sees high bar for change...,1,0,0,0
2,US open: Stocks fall after Fed official hints ...,1,0,0,0
3,"Fed risks falling 'behind the curve', Charles ...",1,0,0,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,1,0,0,0
...,...,...,...,...,...
422414,Surgeons to remove 4-year-old's rib to rebuild...,0,0,1,0
422415,Boy to have surgery on esophagus after battery...,0,0,1,0
422416,Child who swallowed battery to have reconstruc...,0,0,1,0
422417,Phoenix boy undergoes surgery to repair throat...,0,0,1,0


In [8]:
text = df_baru['TITLE'].values # Convert to value
text

array(['Fed official says weak data caused by weather, should not slow taper',
       "Fed's Charles Plosser sees high bar for change in pace of tapering",
       'US open: Stocks fall after Fed official hints at accelerated tapering',
       ...,
       'Child who swallowed battery to have reconstructive surgery at Cincinnati  ...',
       'Phoenix boy undergoes surgery to repair throat damage - WFSB 3 Connecticut',
       'Phoenix boy undergoes surgery to repair throat damage - CBS 3 Springfield  ...'],
      dtype=object)

In [9]:
label = df_baru[['e', 'b', 't', 'm']].values # Look at the classes
label

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=uint8)

In [10]:
# Split them
from sklearn.model_selection import train_test_split
text_latih, text_test, label_latih, label_test = train_test_split(text, label, test_size=0.2)

In [11]:
# Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
tokenizer = Tokenizer(num_words=5000, oov_token='x')

In [12]:
tokenizer.fit_on_texts(text_latih)

In [13]:
tokenizer.fit_on_texts(text_test)

In [14]:
sekuens_latih = tokenizer.texts_to_sequences(text_latih)
sekuens_test = tokenizer.texts_to_sequences(text_test)

In [15]:
padded_latih = pad_sequences(sekuens_latih,maxlen = 400) # Make 400, to avoid crazy user. If you wanna change it? You can change it
padded_latih

array([[   0,    0,    0, ...,   87,  784, 1809],
       [   0,    0,    0, ..., 1435,  276,  383],
       [   0,    0,    0, ...,  236,  383,   11],
       ...,
       [   0,    0,    0, ...,  378,   89,  867],
       [   0,    0,    0, ...,  102,  383,  383],
       [   0,    0,    0, ...,  383,    3,  383]], dtype=int32)

In [16]:
padded_test = pad_sequences(sekuens_test,maxlen=400)
padded_test

array([[   0,    0,    0, ...,    3,  258,  348],
       [   0,    0,    0, ...,   11,  383,   23],
       [   0,    0,    0, ...,  383,    7, 4021],
       ...,
       [   0,    0,    0, ...,   54,  383,  383],
       [   0,    0,    0, ...,  228,   20, 4767],
       [   0,    0,    0, ...,  383,  383,  383]], dtype=int32)

In [21]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000,output_dim=18),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')
])

In [22]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [23]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('val_accuracy') > 1.0):
      print("\nValidasi akurasi di atas 75%, hentikan training!")
      self.model.stop_training = True

callbacks = myCallback()

In [25]:
%%time
num_epochs = 1
history = model.fit(padded_latih, label_latih, epochs=num_epochs, 
                    validation_data=(padded_test, label_test),callbacks=[callbacks])

CPU times: user 1min, sys: 9.24 s, total: 1min 10s
Wall time: 48.6 s


In [27]:
# Before use the prediction model, first retokenize!
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
tokenizer = Tokenizer(num_words=5000, oov_token='x')



txt = ["Regular fast food eating linked to fertility issues in women"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=400)
pred = model.predict(padded)
labels = ['entertainment', 'bussiness', 'science/tech', 'health']
print(pred, labels[np.argmax(pred)])

[[0.0028258  0.03072634 0.00145322 0.96499455]] health


In [29]:
model.outputs

[<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'dense_3')>]