In [117]:
import numpy as np
import pandas as pd
import re

In [118]:
df = pd.read_json("News_Category_Dataset_v2.json", lines=True)

In [119]:
df =  df.loc[df["category"].isin(["SPORTS", "TECH", "BUSINESS", "ENTERTAINMENT", "POLITICS"])]

In [120]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26


In [121]:
df['Text'] = df.headline + " " + df.short_description

In [122]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\?[\]\|@,;#+_].')
BAD_SYMBOLS_RE = re.compile('[^a-z ]')
def text_prepare(text):
    text = str(text)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub(REPLACE_BY_SPACE_RE,' ', text)
    text = re.sub(BAD_SYMBOLS_RE,' ', text)
    text = re.sub(r"\s+" , " " , text)
    return text

In [123]:
df["Text"] = df["Text"].apply(lambda x: text_prepare(x))

In [124]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,Text
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,will smith joins diplo and nicky jam for the w...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,hugh grant marries for the first time at age t...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,jim carrey blasts castrato adam schiff and dem...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,julianna margulies uses donald trump poop bags...
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26,morgan freeman devastated that sexual harassme...


In [125]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [126]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.Text)
X = tokenizer.texts_to_sequences(df.Text)
df['words'] = X

In [127]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,Text,words
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,will smith joins diplo and nicky jam for the w...,"[35, 919, 1750, 10794, 7, 13335, 4591, 9, 1, 9..."
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,hugh grant marries for the first time at age t...,"[3271, 4236, 7195, 9, 1, 66, 63, 19, 655, 1, 3..."
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,jim carrey blasts castrato adam schiff and dem...,"[1346, 5082, 2266, 28014, 1412, 5564, 7, 139, ..."
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,julianna margulies uses donald trump poop bags...,"[21754, 16204, 1628, 33, 11, 16205, 7725, 2, 6..."
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26,morgan freeman devastated that sexual harassme...,"[2621, 6735, 5083, 13, 313, 781, 386, 78, 3007..."


In [128]:
maxlen = 100
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))

In [129]:
len(X)

61700

In [130]:
cat = df.groupby('category')
print("total categories:", cat.ngroups)
print(cat.size())

total categories: 5
category
BUSINESS          5937
ENTERTAINMENT    16058
POLITICS         32739
SPORTS            4884
TECH              2082
dtype: int64


In [131]:
len(df)

61700

In [132]:
categories = df.groupby('category').size().index.tolist()
category_int = {}
int_category = {}
for i, k in enumerate(categories):
    category_int.update({k:i})
    int_category.update({i:k})

df['id'] = df['category'].apply(lambda x: category_int[x])

In [133]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,Text,words,id
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,will smith joins diplo and nicky jam for the w...,"[35, 919, 1750, 10794, 7, 13335, 4591, 9, 1, 9...",1
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,hugh grant marries for the first time at age t...,"[3271, 4236, 7195, 9, 1, 66, 63, 19, 655, 1, 3...",1
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,jim carrey blasts castrato adam schiff and dem...,"[1346, 5082, 2266, 28014, 1412, 5564, 7, 139, ...",1
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,julianna margulies uses donald trump poop bags...,"[21754, 16204, 1628, 33, 11, 16205, 7725, 2, 6...",1
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26,morgan freeman devastated that sexual harassme...,"[2621, 6735, 5083, 13, 313, 781, 386, 78, 3007...",1


In [134]:
X = np.array(X)
Y = np_utils.to_categorical(list(df.id))

x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.1)

In [135]:
word_index = tokenizer.word_index

EMBEDDING_DIM = 100

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s unique tokens.' % len(word_index))
print('Total %s word vectors.' % len(embeddings_index))

Found 43673 unique tokens.
Total 400000 word vectors.


In [136]:
embeddings_index["rain"]

array([-1.0335   ,  0.1923   ,  0.75185  ,  0.052949 , -0.13     ,
        0.32704  ,  0.13995  ,  1.029    , -1.2252   , -0.56681  ,
       -0.08998  ,  0.087026 ,  0.12782  , -0.39067  ,  0.46194  ,
       -0.52275  ,  0.043902 , -0.25056  ,  0.46998  ,  0.59955  ,
        0.22225  ,  0.6047   ,  0.74104  ,  1.013    ,  0.4105   ,
        1.0875   , -0.77584  , -0.14632  ,  0.24677  , -0.50827  ,
       -0.38744  , -0.79767  ,  0.095715 ,  0.49008  , -0.77654  ,
       -0.28072  ,  0.26816  , -0.56382  ,  0.30244  ,  0.80363  ,
       -1.0137   , -0.1754   , -0.12751  , -0.29914  ,  0.91612  ,
       -0.26122  , -0.16641  , -0.09657  ,  0.69747  , -1.387    ,
       -0.1499   , -0.069914 ,  0.54274  ,  0.57055  ,  0.56829  ,
       -1.8202   , -0.331    ,  0.82934  ,  1.2996   ,  0.67671  ,
       -0.26366  ,  1.0841   , -0.55754  ,  0.39118  ,  0.0038266,
        0.20567  ,  0.23146  , -0.8063   ,  0.36182  , -1.3672   ,
       -0.45553  , -0.30046  ,  0.65406  ,  0.17487  ,  0.6837

In [137]:
import tensorflow as tf
from tensorflow.keras import layers

In [138]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = layers.Embedding(len(word_index)+1,
                            EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

In [139]:
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    def call(self, features, hidden):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [140]:
input1 = layers.Input(shape=(100,))
x = embedding_layer(input1)
x = layers.Bidirectional(layers.LSTM(32, dropout = 0.2, return_state = True, return_sequences = True))(x)
lstm, forward_h, forward_c, backward_h, backward_c = layers.Bidirectional(layers.LSTM(32, dropout = 0.2, return_state = True, return_sequences = True))(x)
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

context_vector, attention_weights = Attention(64)(lstm, state_h)

x = layers.Dense(32, activation = "relu")(context_vector)
x = layers.Dropout(0.5)(x)
x = layers.Dense(16, activation = "relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(5, activation = "softmax")(x)
model = tf.keras.Model(inputs = input1, outputs = x)

In [141]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 100)     4367400     input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) [(None, 100, 64), (N 34048       embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) [(None, 100, 64), (N 24832       bidirectional_4[0][0]            
                                                                 bidirectional_4[0][1]      

In [142]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [143]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [144]:
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=40, verbose=1,batch_size=64, callbacks=[es])

Train on 55530 samples, validate on 6170 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 00011: early stopping


<tensorflow.python.keras.callbacks.History at 0x1fad0a5f488>

In [175]:
y_val_pred = np.argmax(model.predict(x_val),1)
y_val_true = np.argmax(y_val, 1)

In [176]:
from sklearn.metrics import classification_report

In [177]:
target_names = [int_category[0], int_category[1], int_category[2], int_category[3], int_category[4]]

In [178]:
print(classification_report(y_val_true, y_val_pred, target_names=target_names))

               precision    recall  f1-score   support

     BUSINESS       0.73      0.67      0.70       607
ENTERTAINMENT       0.94      0.88      0.91      1627
     POLITICS       0.90      0.96      0.93      3234
       SPORTS       0.90      0.83      0.86       512
         TECH       0.67      0.56      0.61       190

     accuracy                           0.89      6170
    macro avg       0.83      0.78      0.80      6170
 weighted avg       0.89      0.89      0.89      6170

