In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (6,6)

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, merge, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.merge import add

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


# prepare data

In [2]:
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
cates = df.groupby('category')
print("total categories: ", cates.ngroups)
print(cates.size())

total categories:  41
category
ARTS               1509
ARTS & CULTURE     1339
BLACK VOICES       4528
BUSINESS           5937
COLLEGE            1144
COMEDY             5175
CRIME              3405
CULTURE & ARTS     1030
DIVORCE            3426
EDUCATION          1004
ENTERTAINMENT     16058
ENVIRONMENT        1323
FIFTY              1401
FOOD & DRINK       6226
GOOD NEWS          1398
GREEN              2622
HEALTHY LIVING     6694
HOME & LIVING      4195
IMPACT             3459
LATINO VOICES      1129
MEDIA              2815
MONEY              1707
PARENTING          8677
PARENTS            3955
POLITICS          32739
QUEER VOICES       6314
RELIGION           2556
SCIENCE            2178
SPORTS             4884
STYLE              2254
STYLE & BEAUTY     9649
TASTE              2096
TECH               2082
THE WORLDPOST      3664
TRAVEL             9887
WEDDINGS           3651
WEIRD NEWS         2670
WELLNESS          17827
WOMEN              3490
WORLD NEWS         2177
WORLDPOST

In [4]:
# THE WORLDPOST and WORLDPOST should be the same category, so merge them.
df.category = df.category.map(lambda x: "WORDPOST" if x == "THE WORLDPOST" else x)

In [5]:
# using headlines and short_description as input X
df['text'] = df.headline + " " + df.short_description

In [6]:
# tokenizing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X

In [7]:
# delete some empty and short data
df['word_length'] = df.words.apply(lambda i: len(i))
df = df[df.word_length >= 5]

df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,text,words,word_length
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,"[74, 101, 257, 1331, 3001, 6, 698, 134, 96, 26...",27
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,"[42, 1604, 2960, 27762, 5, 25929, 5237, 8, 1, ...",20
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,Hugh Grant Marries For The First Time At Age 5...,"[5877, 5334, 8083, 8, 1, 76, 54, 21, 414, 8469...",25
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,"[2710, 13374, 3596, 64143, 2295, 13055, 5, 569...",26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,"[41003, 36082, 1513, 97, 48, 7915, 3134, 2, 96...",26


In [8]:
df.word_length.describe()

count    199914.000000
mean         29.725032
std          14.024717
min           5.000000
25%          20.000000
50%          29.000000
75%          36.000000
max         248.000000
Name: word_length, dtype: float64

In [9]:
# using 50 for padding length
maxlen = 50
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))

In [14]:
# category to id
categories = df.groupby('category').size().index.tolist()
category_int = {}
int_category = {}
for i, k in enumerate(categories):
    category_int.update({k:i})
    int_category.update({i:k})

df['c2id'] = df['category'].apply(lambda x: category_int[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


# glove embedding

In [20]:
word_index = tokenizer.word_index

EMBEDDING_DIM = 100

embeddings_index = {}
f = open('glove.6B.100d.txt', 'rt', encoding='UTF8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s unique tokens.' % len(word_index))
print('Total %s word vectors.' % len(embeddings_index))

Found 116617 unique tokens.
Total 400000 word vectors.


In [21]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM, 
                            embeddings_initializer = Constant(embedding_matrix),
                            input_length = maxlen, 
                            trainable=False)

# split dataset

In [22]:
# prepared data

X = np.array(X)
Y = np_utils.to_categorical(list(df.c2id))

#and split to training set and validation set
seed = 29
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Bidirectional GRU + Conv

In [23]:
# Bidrectional LSTM with convolution
# from https://www.kaggle.com/eashish/bidirectional-gru-with-convolution

inp = Input(shape=(maxlen,), dtype='int32')
x = embedding_layer(inp)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3)(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
outp = Dense(len(int_category), activation="softmax")(x)

BiGRU = Model(inp, outp)
BiGRU.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

BiGRU.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 100)      11661800    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 50, 100)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 50, 256)      175872      spatial_dropout1d_1[0][0]        
____________________________________________________________________________________________

In [24]:
# training
bigru_history = BiGRU.fit(x_train,
                         y_train,
                         batch_size=128,
                         epochs=20,
                         validation_data=(x_val, y_val))

Train on 159931 samples, validate on 39983 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
pip install newsapi-python

Collecting newsapi-python
  Downloading newsapi_python-0.2.6-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.6
Note: you may need to restart the kernel to use updated packages.


In [30]:
from newsapi import NewsApiClient
# Init
newsapi = NewsApiClient(api_key='1442af938d214ef09a688b70293e9bea')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(category='business',
                                          language='en',
                                          country='us')

In [31]:
top_headlines

{'status': 'ok',
 'totalResults': 70,
 'articles': [{'source': {'id': 'the-washington-post',
    'name': 'The Washington Post'},
   'author': 'Antonia Farzan, Katie Shepherd, Jennifer Hassan, Rick Noack',
   'title': 'Live updates: Fed chair says millions of Americans may never get their jobs back, predicts slow recovery from coronavirus - The Washington Post',
   'description': 'Congress will likely need to extend additional aid as unemployment remains at historic highs, while the OECD said the global economy is “experiencing the deepest recession since the Great Depression."',
   'url': 'https://www.washingtonpost.com/nation/2020/06/11/coronavirus-update-us/',
   'urlToImage': 'https://www.washingtonpost.com/wp-apps/imrs.php?src=https://arc-anglerfish-washpost-prod-washpost.s3.amazonaws.com/public/BV5OHKVJY4I6VJB3X2PWJFFIPU.jpg&w=1440',
   'publishedAt': '2020-06-11T10:45:17Z',
   'content': 'MOSCOW Russias coronavirus cases passed the half-million mark Thursday, reaching 502,436, as

In [73]:
df = pd.DataFrame(top_headlines['articles'])

In [74]:
df.head(3)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'the-washington-post', 'name': 'The Was...","Antonia Farzan, Katie Shepherd, Jennifer Hassa...",Live updates: Fed chair says millions of Ameri...,Congress will likely need to extend additional...,https://www.washingtonpost.com/nation/2020/06/...,https://www.washingtonpost.com/wp-apps/imrs.ph...,2020-06-11T10:45:17Z,MOSCOW Russias coronavirus cases passed the ha...
1,"{'id': None, 'name': 'New York Times'}",,Jobless Claims in the U.S. Set to Rise - The N...,The latest on stock market and business news d...,https://www.nytimes.com/2020/06/11/business/jo...,https://www.nytimes.com/newsgraphics/2020/04/0...,2020-06-11T10:25:00Z,"But Li Keqiang, Chinas premier, had publicly c..."
2,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Musk tweets 'lol' at stock market - MarketWatch,,http://www.marketwatch.com/story/musk-tweets-l...,https://s.wsj.net/public/resources/MWimages/MW...,2020-06-11T10:17:00Z,Steven Goldstein is based in London and respon...


In [91]:
df['text'] = df.description + " " + df.title
drop_df = df.dropna(axis=0)
df = drop_df
df

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,text,words,word_length,sub_category
0,"{'id': 'the-washington-post', 'name': 'The Was...","Antonia Farzan, Katie Shepherd, Jennifer Hassa...",Live updates: Fed chair says millions of Ameri...,Congress will likely need to extend additional...,https://www.washingtonpost.com/nation/2020/06/...,https://www.washingtonpost.com/wp-apps/imrs.ph...,2020-06-11T10:45:17Z,MOSCOW Russias coronavirus cases passed the ha...,Congress will likely need to extend additional...,"[195, 196, 9, 52, 197, 1, 198, 199, 200, 37, 2...",86,QUEER VOICES
2,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Musk tweets 'lol' at stock market - MarketWatch,,http://www.marketwatch.com/story/musk-tweets-l...,https://s.wsj.net/public/resources/MWimages/MW...,2020-06-11T10:17:00Z,Steven Goldstein is based in London and respon...,Musk tweets 'lol' at stock market - MarketWatch,"[101, 246, 20, 247, 4, 248, 7, 102, 6, 249, 25...",39,TRAVEL
3,"{'id': None, 'name': 'Barron's'}",Daren Fonda,Delta Stock Could Be Grounded by Debt - Barron's,,https://www.barrons.com/articles/delta-stock-d...,https://images.barrons.com/im-197064/social,2020-06-11T10:08:40Z,Delta Air Lines\r\n warning that it is seeking...,Delta Stock Could Be Grounded by Debt - Barron's,"[107, 262, 263, 264, 22, 11, 20, 265, 2, 266, ...",43,DIVORCE
4,"{'id': None, 'name': 'Barron's'}",Al Root,Tesla Truck Competitor Nikola Met With Investo...,,https://www.barrons.com/articles/tesla-truck-c...,https://images.barrons.com/im-196788/social,2020-06-11T10:08:36Z,Battery and fuel cell heavy duty trucking pion...,Tesla Truck Competitor Nikola Met With Invest...,"[278, 7, 279, 280, 281, 282, 283, 284, 285, 28...",45,WELLNESS
5,"{'id': 'fox-news', 'name': 'Fox News'}",David Aaro,Johnson & Johnson says coronavirus vaccine's h...,"Johnson & Johnson, the largest health care com...",https://www.foxnews.com/health/johnson-johnson...,https://static.foxnews.com/foxnews.com/content...,2020-06-11T09:40:23Z,"Johnson &amp; Johnson, the largest health care...","Johnson & Johnson, the largest health care com...","[25, 307, 25, 1, 61, 116, 117, 62, 4, 1, 40, 1...",78,WELLNESS
6,"{'id': None, 'name': 'CNBC'}",Elliot Smith,Global stocks reel on gloomy Fed outlook and f...,Stock markets around the world retreated Thurs...,https://www.cnbc.com/2020/06/11/global-stocks-...,https://image.cnbcfm.com/api/v1/image/10645007...,2020-06-11T09:29:10Z,Stock markets around the world retreated Thurs...,Stock markets around the world retreated Thurs...,"[21, 39, 127, 1, 40, 128, 37, 10, 67, 32, 3, 2...",79,PARENTING
7,"{'id': None, 'name': 'Forbes'}",Naeem Aslam,Dow Jones Futures Plunge: Covid-19 Second Wave...,The Dow Jones futures plunge due to the fear o...,https://www.forbes.com/sites/naeemaslam/2020/0...,https://thumbor.forbes.com/thumbor/fit-in/1200...,2020-06-11T08:05:32Z,The Dow Jones futures are trading lower as inv...,The Dow Jones futures plunge due to the fear o...,"[1, 71, 72, 73, 74, 319, 134, 10, 23, 112, 320...",98,WELLNESS
8,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Stocks in Europe skid as reversal from rally c...,,https://www.marketwatch.com/story/stocks-in-eu...,https://s.marketwatch.com/public/resources/ima...,2020-06-11T07:29:31Z,European stocks slumped Thursday in early trad...,Stocks in Europe skid as reversal from rally ...,"[344, 31, 345, 37, 4, 346, 347, 348, 349, 6, 3...",44,QUEER VOICES
9,"{'id': None, 'name': 'CNBC'}",Evelyn Cheng,U.S. pressure could accelerate growth for mark...,While U.S. authorities put pressure on Chinese...,https://www.cnbc.com/2020/06/11/us-pressure-co...,https://image.cnbcfm.com/api/v1/image/10641203...,2020-06-11T04:21:47Z,Chinese tourists with facial masks stand in fr...,While U.S. authorities put pressure on Chinese...,"[77, 363, 18, 44, 364, 139, 4, 365, 5, 1, 19, ...",73,POLITICS
10,"{'id': None, 'name': 'New York Times'}","Neil Vigdor, Elisha Brown",Walmart Says It Will No Longer Lock Up African...,The policy had been the subject of a racial di...,https://www.nytimes.com/2020/06/10/business/wa...,https://static01.nyt.com/images/2020/06/10/mul...,2020-06-11T04:18:47Z,The change came as a host of major corporation...,The policy had been the subject of a racial di...,"[1, 387, 388, 10, 3, 389, 5, 390, 391, 392, 39...",73,STYLE & BEAUTY


In [92]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X

In [93]:
df

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,text,words,word_length,sub_category
0,"{'id': 'the-washington-post', 'name': 'The Was...","Antonia Farzan, Katie Shepherd, Jennifer Hassa...",Live updates: Fed chair says millions of Ameri...,Congress will likely need to extend additional...,https://www.washingtonpost.com/nation/2020/06/...,https://www.washingtonpost.com/wp-apps/imrs.ph...,2020-06-11T10:45:17Z,MOSCOW Russias coronavirus cases passed the ha...,Congress will likely need to extend additional...,"[98, 38, 99, 100, 2, 101, 102, 103, 10, 104, 1...",86,QUEER VOICES
2,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Musk tweets 'lol' at stock market - MarketWatch,,http://www.marketwatch.com/story/musk-tweets-l...,https://s.wsj.net/public/resources/MWimages/MW...,2020-06-11T10:17:00Z,Steven Goldstein is based in London and respon...,Musk tweets 'lol' at stock market - MarketWatch,"[131, 132, 133, 24, 18, 43, 44]",39,TRAVEL
3,"{'id': None, 'name': 'Barron's'}",Daren Fonda,Delta Stock Could Be Grounded by Debt - Barron's,,https://www.barrons.com/articles/delta-stock-d...,https://images.barrons.com/im-197064/social,2020-06-11T10:08:40Z,Delta Air Lines\r\n warning that it is seeking...,Delta Stock Could Be Grounded by Debt - Barron's,"[134, 18, 45, 27, 135, 46, 136, 47]",43,DIVORCE
4,"{'id': None, 'name': 'Barron's'}",Al Root,Tesla Truck Competitor Nikola Met With Investo...,,https://www.barrons.com/articles/tesla-truck-c...,https://images.barrons.com/im-196788/social,2020-06-11T10:08:36Z,Battery and fuel cell heavy duty trucking pion...,Tesla Truck Competitor Nikola Met With Invest...,"[137, 138, 139, 140, 141, 48, 28, 142, 143, 19...",45,WELLNESS
5,"{'id': 'fox-news', 'name': 'Fox News'}",David Aaro,Johnson & Johnson says coronavirus vaccine's h...,"Johnson & Johnson, the largest health care com...",https://www.foxnews.com/health/johnson-johnson...,https://static.foxnews.com/foxnews.com/content...,2020-06-11T09:40:23Z,"Johnson &amp; Johnson, the largest health care...","Johnson & Johnson, the largest health care com...","[20, 20, 1, 49, 144, 145, 146, 6, 1, 50, 12, 1...",78,WELLNESS
6,"{'id': None, 'name': 'CNBC'}",Elliot Smith,Global stocks reel on gloomy Fed outlook and f...,Stock markets around the world retreated Thurs...,https://www.cnbc.com/2020/06/11/global-stocks-...,https://image.cnbcfm.com/api/v1/image/10645007...,2020-06-11T09:29:10Z,Stock markets around the world retreated Thurs...,Stock markets around the world retreated Thurs...,"[18, 55, 161, 1, 50, 162, 163, 10, 56, 30, 3, ...",79,PARENTING
7,"{'id': None, 'name': 'Forbes'}",Naeem Aslam,Dow Jones Futures Plunge: Covid-19 Second Wave...,The Dow Jones futures plunge due to the fear o...,https://www.forbes.com/sites/naeemaslam/2020/0...,https://thumbor.forbes.com/thumbor/fit-in/1200...,2020-06-11T08:05:32Z,The Dow Jones futures are trading lower as inv...,The Dow Jones futures plunge due to the fear o...,"[1, 60, 61, 62, 63, 170, 2, 1, 64, 4, 13, 21, ...",98,WELLNESS
8,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Stocks in Europe skid as reversal from rally c...,,https://www.marketwatch.com/story/stocks-in-eu...,https://s.marketwatch.com/public/resources/ima...,2020-06-11T07:29:31Z,European stocks slumped Thursday in early trad...,Stocks in Europe skid as reversal from rally ...,"[59, 6, 189, 190, 10, 191, 17, 32, 192, 44]",44,QUEER VOICES
9,"{'id': None, 'name': 'CNBC'}",Evelyn Cheng,U.S. pressure could accelerate growth for mark...,While U.S. authorities put pressure on Chinese...,https://www.cnbc.com/2020/06/11/us-pressure-co...,https://image.cnbcfm.com/api/v1/image/10641203...,2020-06-11T04:21:47Z,Chinese tourists with facial masks stand in fr...,While U.S. authorities put pressure on Chinese...,"[39, 14, 15, 193, 194, 67, 11, 195, 196, 28, 1...",73,POLITICS
10,"{'id': None, 'name': 'New York Times'}","Neil Vigdor, Elisha Brown",Walmart Says It Will No Longer Lock Up African...,The policy had been the subject of a racial di...,https://www.nytimes.com/2020/06/10/business/wa...,https://static01.nyt.com/images/2020/06/10/mul...,2020-06-11T04:18:47Z,The change came as a host of major corporation...,The policy had been the subject of a racial di...,"[1, 206, 68, 33, 1, 207, 4, 3, 69, 208, 209, 2...",73,STYLE & BEAUTY


In [94]:
df['word_length'] = df.words.apply(lambda i: len(i))
df = df[df.word_length >= 5]
df.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,text,words,word_length,sub_category
0,"{'id': 'the-washington-post', 'name': 'The Was...","Antonia Farzan, Katie Shepherd, Jennifer Hassa...",Live updates: Fed chair says millions of Ameri...,Congress will likely need to extend additional...,https://www.washingtonpost.com/nation/2020/06/...,https://www.washingtonpost.com/wp-apps/imrs.ph...,2020-06-11T10:45:17Z,MOSCOW Russias coronavirus cases passed the ha...,Congress will likely need to extend additional...,"[98, 38, 99, 100, 2, 101, 102, 103, 10, 104, 1...",52,QUEER VOICES
2,"{'id': None, 'name': 'MarketWatch'}",Steve Goldstein,Musk tweets 'lol' at stock market - MarketWatch,,http://www.marketwatch.com/story/musk-tweets-l...,https://s.wsj.net/public/resources/MWimages/MW...,2020-06-11T10:17:00Z,Steven Goldstein is based in London and respon...,Musk tweets 'lol' at stock market - MarketWatch,"[131, 132, 133, 24, 18, 43, 44]",7,TRAVEL
3,"{'id': None, 'name': 'Barron's'}",Daren Fonda,Delta Stock Could Be Grounded by Debt - Barron's,,https://www.barrons.com/articles/delta-stock-d...,https://images.barrons.com/im-197064/social,2020-06-11T10:08:40Z,Delta Air Lines\r\n warning that it is seeking...,Delta Stock Could Be Grounded by Debt - Barron's,"[134, 18, 45, 27, 135, 46, 136, 47]",8,DIVORCE
4,"{'id': None, 'name': 'Barron's'}",Al Root,Tesla Truck Competitor Nikola Met With Investo...,,https://www.barrons.com/articles/tesla-truck-c...,https://images.barrons.com/im-196788/social,2020-06-11T10:08:36Z,Battery and fuel cell heavy duty trucking pion...,Tesla Truck Competitor Nikola Met With Invest...,"[137, 138, 139, 140, 141, 48, 28, 142, 143, 19...",12,WELLNESS
5,"{'id': 'fox-news', 'name': 'Fox News'}",David Aaro,Johnson & Johnson says coronavirus vaccine's h...,"Johnson & Johnson, the largest health care com...",https://www.foxnews.com/health/johnson-johnson...,https://static.foxnews.com/foxnews.com/content...,2020-06-11T09:40:23Z,"Johnson &amp; Johnson, the largest health care...","Johnson & Johnson, the largest health care com...","[20, 20, 1, 49, 144, 145, 146, 6, 1, 50, 12, 1...",44,WELLNESS


In [95]:
df.word_length.describe()

count    18.000000
mean     33.666667
std      15.669378
min       7.000000
25%      25.500000
50%      39.500000
75%      44.000000
max      59.000000
Name: word_length, dtype: float64

In [96]:
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))
X = np.array(X)

In [97]:
cate = BiGRU.predict(X)

In [98]:
cate_list = []
for i in range(len(cate)):
    y = cate[i]
    p = np.argmax(y)
    cate_list.append(int_category[p])
    print(int_category[p])

HEALTHY LIVING
HEALTHY LIVING
HEALTHY LIVING
POLITICS
PARENTING
FOOD & DRINK
WEDDINGS
STYLE
PARENTING
QUEER VOICES
DIVORCE
COMEDY
DIVORCE
HEALTHY LIVING
BUSINESS
WELLNESS
ARTS
COMEDY


In [99]:
cate_list

['HEALTHY LIVING',
 'HEALTHY LIVING',
 'HEALTHY LIVING',
 'POLITICS',
 'PARENTING',
 'FOOD & DRINK',
 'WEDDINGS',
 'STYLE',
 'PARENTING',
 'QUEER VOICES',
 'DIVORCE',
 'COMEDY',
 'DIVORCE',
 'HEALTHY LIVING',
 'BUSINESS',
 'WELLNESS',
 'ARTS',
 'COMEDY']

In [100]:
df['sub_category'] = cate_list

In [101]:
view = df[['title','description','sub_category']]
view

Unnamed: 0,title,description,sub_category
0,Live updates: Fed chair says millions of Ameri...,Congress will likely need to extend additional...,HEALTHY LIVING
2,Musk tweets 'lol' at stock market - MarketWatch,,HEALTHY LIVING
3,Delta Stock Could Be Grounded by Debt - Barron's,,HEALTHY LIVING
4,Tesla Truck Competitor Nikola Met With Investo...,,POLITICS
5,Johnson & Johnson says coronavirus vaccine's h...,"Johnson & Johnson, the largest health care com...",PARENTING
6,Global stocks reel on gloomy Fed outlook and f...,Stock markets around the world retreated Thurs...,FOOD & DRINK
7,Dow Jones Futures Plunge: Covid-19 Second Wave...,The Dow Jones futures plunge due to the fear o...,WEDDINGS
8,Stocks in Europe skid as reversal from rally c...,,STYLE
9,U.S. pressure could accelerate growth for mark...,While U.S. authorities put pressure on Chinese...,PARENTING
10,Walmart Says It Will No Longer Lock Up African...,The policy had been the subject of a racial di...,QUEER VOICES


In [46]:
pip install dill

Collecting dill
  Downloading dill-0.3.1.1.tar.gz (151 kB)
Building wheels for collected packages: dill
  Building wheel for dill (setup.py): started
  Building wheel for dill (setup.py): finished with status 'done'
  Created wheel for dill: filename=dill-0.3.1.1-py3-none-any.whl size=78597 sha256=9de58988816a6ba7d2fdb709725e56c36ede30f83dc17fe9ae41459cea535475
  Stored in directory: c:\users\inghy\appdata\local\pip\cache\wheels\a4\61\fd\c57e374e580aa78a45ed78d5859b3a44436af17e22ca53284f
Successfully built dill
Installing collected packages: dill
Successfully installed dill-0.3.1.1
Note: you may need to restart the kernel to use updated packages.


In [128]:
import dill as pickle

In [129]:
filename = 'NewsClf'

In [130]:
with open(filename, 'wb') as file:
    pickle.dump(BiGRU, file)

In [131]:
with open(filename, 'rb') as f:
    loaded_model = pickle.load(f)

In [132]:
loaded_model.predict(X)

array([[3.18178936e-04, 2.59100561e-05, 3.33355332e-04, 3.04321270e-03,
        5.06694475e-03, 2.19207779e-02, 2.68598297e-03, 4.01008219e-05,
        1.00252149e-03, 3.84998471e-02, 3.54311639e-03, 4.97574103e-04,
        3.07378359e-02, 9.56490039e-05, 1.84163020e-03, 8.83957196e-04,
        2.68137758e-03, 1.09407178e-03, 6.88933628e-03, 3.24617082e-04,
        9.93738067e-04, 1.18401954e-02, 5.25422037e-01, 2.33715057e-01,
        8.80574659e-02, 2.95235659e-03, 1.48796666e-04, 4.66531783e-05,
        3.23745859e-04, 2.40769965e-04, 5.54667087e-04, 6.35273318e-05,
        1.79493392e-04, 3.45036434e-03, 4.88653139e-04, 2.94133555e-04,
        5.04247798e-03, 4.35832608e-03, 4.61571726e-05, 7.78409594e-05,
        1.77513677e-04],
       [3.08426842e-03, 6.61908276e-03, 6.34519057e-03, 7.41450954e-03,
        2.73685786e-03, 2.74663456e-02, 3.86550045e-03, 6.47854409e-04,
        3.26814363e-03, 5.96683798e-03, 3.75550501e-02, 2.64790212e-03,
        5.76955685e-03, 2.84312665e-02,