In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, merge, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

from sklearn.model_selection import train_test_split


Using TensorFlow backend.


In [6]:
# load data

df = pd.read_json('C:\\Users\\jana\\Desktop\\new project\\DeepResearch-master\\Hierarchical_Attention_Network\\News_Category_Dataset\\News_Category_Dataset.json', lines=True)
df.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


In [7]:
cates = df.groupby('category')
print("total categories:", cates.ngroups)
print(cates.size())

total categories: 31
category
ARTS               1509
ARTS & CULTURE     1339
BLACK VOICES       3858
BUSINESS           4254
COLLEGE            1144
COMEDY             3971
CRIME              2893
EDUCATION          1004
ENTERTAINMENT     14257
FIFTY              1401
GOOD NEWS          1398
GREEN              2622
HEALTHY LIVING     6694
IMPACT             2602
LATINO VOICES      1129
MEDIA              2815
PARENTS            3955
POLITICS          32739
QUEER VOICES       4995
RELIGION           2556
SCIENCE            1381
SPORTS             4167
STYLE              2254
TASTE              2096
TECH               1231
THE WORLDPOST      3664
TRAVEL             2145
WEIRD NEWS         2670
WOMEN              3490
WORLD NEWS         2177
WORLDPOST          2579
dtype: int64


In [8]:
# In the above category there are two WORLDPOST so they should be merged into one, thus a lambda function is used

df.category = df.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)

In [10]:
# using headlines and short_description as input X

df['text'] = df.headline + " " + df.short_description 

# tokenizing

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X

# delete some empty and short data

df['word_length'] = df.words.apply(lambda i: len(i))
df = df[df.word_length >= 5]

df.head()

Unnamed: 0,authors,category,date,headline,link,short_description,text,words,word_length
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,"[87, 95, 260, 917, 2154, 6, 453, 133, 119, 30,...",27
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,"[34, 1516, 2197, 20046, 5, 18729, 5873, 8, 1, ...",20
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...,"[5201, 5146, 8954, 8, 1, 69, 59, 19, 463, 7901...",25
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,"[2198, 9428, 2458, 47694, 2030, 8956, 5, 287, ...",26
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,"[36179, 26511, 1605, 55, 20, 6883, 4637, 2, 95...",26


In [21]:
# using 50 for padding length

maxlen = 50
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))
print(X[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0   87   95  260  917 2154
    6  453  133  119   30  120  225    9  392   89  424   50 1003   38
  323   44  202   51  185   73    6  168]


In [22]:
# converting the category to id

categories = df.groupby('category').size().index.tolist() # grouping the categories to a list
category_int = {}  # assign a mapping var for cat to int 
int_category = {}  # assign a mapping var for int to cat
for i, k in enumerate(categories):
    category_int.update({k:i}) # assigning values from enum as cat ->  int
    int_category.update({i:k}) # vice versa

df['char2id'] = df['category'].apply(lambda x: category_int[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [25]:
# just for some visual understanding of the data

categories = df.groupby('category').size().index.tolist()
print(categories[9],'\n', df['headline'][9])
print(df['char2id'][:10])

FIFTY 
 What To Watch On Hulu That’s New This Week
0    6
1    8
2    8
3    8
4    8
5    8
6    8
7    8
8    8
9    8
Name: char2id, dtype: int64


In [26]:
#glove embedding is one of the most usefull model for representing distributed word representation.
word_index = tokenizer.word_index
emb_dim = 100
embeddings_index = {}
# use a pretrained model 
f = open('C:\\Users\\jana\\Desktop\\new project\\DeepResearch-master\\Hierarchical_Attention_Network\\glove.6B.100d.txt', encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s unique tokens.' % len(word_index))

Found 86627 unique tokens.
Total 400000 word vectors.


In [31]:
#create an embedding_matrix and prepare the embedding layer for LSTM operation

embedding_matrix = np.zeros((len(word_index) + 1, emb_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# i/p dim = len(word_index) 
# o/p dim = emb_dim = 100
embedding_layer = Embedding(len(word_index)+1, emb_dim, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=False)                          
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.038194   -0.24487001  0.72812003 ... -0.1459      0.82779998
   0.27061999]
 [-0.18970001  0.050024    0.19084001 ... -0.39804     0.47646999
  -0.15983   ]
 ...
 [-0.07705     0.15891001 -0.071052   ...  0.12487    -0.27191001
   0.017928  ]
 [ 0.40114     0.87053001  0.049884   ... -0.030224   -0.19456001
   0.57744002]
 [ 0.1707      0.28426999 -0.055979   ...  0.098962   -0.22001
   0.20998   ]]


In [45]:
len(word_index)
emb_dim

100

# split dataset

In [39]:
# prepared dataset 
X = np.array(X) # convert value to array for further calculation.
Y = np_utils.to_categorical(list(df.char2id)) # one-hot array encoding format.

# and split to training set and validation set
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=0)
print(x_train)
print('\n')
print(x_val)

[[    0     0     0 ...   556    19 26748]
 [    0     0     0 ...  1270     6   541]
 [    0     0     0 ...   358  2925  3531]
 ...
 [    0     0     0 ...   142     6   229]
 [    0     0     0 ...   794     1  1195]
 [    0     0     0 ...   706 69332   735]]


[[    0     0     0 ... 14401  1663   643]
 [    0     0     0 ...  2827   384  1282]
 [    0     0     0 ...    94   178    16]
 ...
 [    0     0     0 ...   120     3  3760]
 [    0     0     0 ...   100     2  1546]
 [    0     0     0 ...     4     3  9772]]


# Bidirectional GRU(LSTM) + Conv nets for better results 

In [43]:
# Using convolution nets as the input layer to the text data for better recognization of sentiment from the text 
# and this will be passed to the  LSTM(GRU) layer for further analysis of data.

# CONV NETS FOR FEATURE RECOG.

inp = Input(shape=(maxlen,), dtype='int32')     # max length is 50 which is the padding value.
x = embedding_layer(inp)                        # this embeddign layer turns positive integers into dense vectors for calculation
x = SpatialDropout1D(0.2)(x)

# set the return_seq to True for returning the values into the lstm cell again.

x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) # LSTM LAYER 
x = Conv1D(64, kernel_size=3)(x)       #  CONV LAYER
avg_pool = GlobalAveragePooling1D()(x) # AVERAGE POOLING FOR TIME SERIES DATA
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
outp = Dense(len(int_category), activation="softmax")(x)

BiGRU = Model(inp, outp)
BiGRU.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

BiGRU.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 50, 100)       8662800     input_5[0][0]                    
____________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDrop (None, 50, 100)       0           embedding_5[2][0]                
____________________________________________________________________________________________________
bidirectional_3 (Bidirectional)  (None, 50, 256)       175872      spatial_dropout1d_3[0][0]        
___________________________________________________________________________________________

In [None]:
# training

bigru_history = BiGRU.fit(x_train, 
                          y_train, 
                          batch_size=128, 
                          epochs=20, 
                          validation_data=(x_val, y_val))

In [None]:
acc = bigru_history.history['acc']
val_acc = bigru_history.history['val_acc']
loss = bigru_history.history['loss']
val_loss = bigru_history.history['val_loss']
epochs = range(1, len(acc) + 1)

plt.title('Accuracy')
plt.plot(epochs, acc, 'green', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()

plt.figure()
plt.title('Loss')
plt.plot(epochs, loss, 'green', label='Training loss')
plt.plot(epochs, val_loss, 'blue', label='Validation loss')
plt.legend()

plt.show()

In [None]:
# to calculate the accuracy of the model.
def evaluate_accuracy(model):
    predicted = model.predict(x_val)
    diff = y_val.argmax(axis=-1) - predicted.argmax(axis=-1)
    corrects = np.where(diff == 0)[0].shape[0]
    total = y_val.shape[0]
    return float(corrects/total)
print("model Bidirectional GRU + Conv:  %.3f*100" % evaluate_accuracy(BiGRU))