In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
train = pd.read_excel('/Users/sathishkumars/Documents/Anaconda-Python/Hackathons/NewsCategory/Data_Train.xlsx')
test = pd.read_excel('/Users/sathishkumars/Documents/Anaconda-Python/Hackathons/NewsCategory/Data_Test.xlsx')
train.shape, test.shape

((7628, 2), (2748, 1))

In [85]:
train.head(10)

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3
5,BEIJING: Chinese tech giant Huawei has announc...,1
6,Mumbai: India Inc's external commercial borrow...,3
7,"On Wednesday, Federal Reserve Chairman Jerome ...",3
8,What more can you give to the audience? I have...,2
9,"com, Arbaaz Khan spoke about getting back to D...",2


In [86]:
train.SECTION.value_counts()

1    2772
2    1924
0    1686
3    1246
Name: SECTION, dtype: int64

In [87]:
test.head(10)

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...
5,"""Imagine if every message you sent was kept wi..."
6,Positioned along the four sides of the Asus RO...
7,"In fact, when I applied to USC film school the..."
8,"As spotted by Android Police, Netflix is testi..."
9,Her moves were immaculately choreographed as s...


In [88]:
train_sentences = train['STORY'].values
test_sentences = test['STORY'].values

# Importing Glove Vectors

In [7]:
word_to_vec_map = {}
with open('/Users/sathishkumars/Documents/Anaconda-Python/glove/glove.6B.50d.txt') as file:
    for line in file:
        values = line.strip().split()
        curr_word = values[0]
        word_to_vec_map[curr_word] = np.array(values[1:], dtype = np.float64)
len(word_to_vec_map)

400000

In [8]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 1000)
from keras.preprocessing.sequence import pad_sequences

In [89]:
max_seq_length = 1000
tokenizer.fit_on_texts(train_sentences)
train_seq = tokenizer.texts_to_sequences(train_sentences)
train_seq_pad = pad_sequences(train_seq, maxlen = max_seq_length)

In [90]:
index = tokenizer.word_index
len(index)

38286

In [11]:
index

{'the': 1,
 'to': 2,
 'and': 3,
 'of': 4,
 'in': 5,
 'a': 6,
 'is': 7,
 'on': 8,
 'for': 9,
 'that': 10,
 'with': 11,
 'it': 12,
 'has': 13,
 'as': 14,
 'will': 15,
 'from': 16,
 'at': 17,
 'by': 18,
 'be': 19,
 'are': 20,
 'said': 21,
 'have': 22,
 'its': 23,
 'was': 24,
 'this': 25,
 'an': 26,
 'which': 27,
 'also': 28,
 'not': 29,
 'but': 30,
 'he': 31,
 'been': 32,
 'their': 33,
 'india': 34,
 'new': 35,
 'up': 36,
 'or': 37,
 'year': 38,
 'you': 39,
 'more': 40,
 'we': 41,
 'his': 42,
 'can': 43,
 'bjp': 44,
 'congress': 45,
 'about': 46,
 'had': 47,
 'who': 48,
 'one': 49,
 'all': 50,
 'they': 51,
 'party': 52,
 'while': 53,
 'like': 54,
 'out': 55,
 'after': 56,
 'two': 57,
 'people': 58,
 'i': 59,
 '1': 60,
 'time': 61,
 'would': 62,
 'other': 63,
 'when': 64,
 'first': 65,
 'than': 66,
 'were': 67,
 'there': 68,
 'last': 69,
 'over': 70,
 'only': 71,
 'us': 72,
 'so': 73,
 'if': 74,
 'data': 75,
 'elections': 76,
 'some': 77,
 '2': 78,
 'government': 79,
 'market': 80,
 'users

In [91]:
embedding_matrix = np.zeros((len(index) +1, 50))
for word, i in index.items():
    temp = word_to_vec_map.get(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [13]:
from sklearn.model_selection import train_test_split

In [22]:
from keras.utils import to_categorical

In [92]:
y = to_categorical(train['SECTION'], num_classes = 4)

In [93]:
X = train_seq_pad
Y = train['SECTION']

In [74]:
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.3, random_state = 21)

In [16]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, Dense, LSTM, GlobalMaxPooling1D, Dropout

In [94]:
embed_layer = Embedding(len(index) + 1, 50, input_length = max_seq_length, weights = [embedding_matrix])

In [95]:
model = Sequential()
model.add(embed_layer)
model.add(Bidirectional(LSTM(50, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1)))
model.add(GlobalMaxPooling1D())
model.add(Dense(50, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation = 'sigmoid'))

In [96]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 50)          1914350   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1000, 100)         40400     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 204       
Total params: 1,960,004
Trainable params: 1,960,004
Non-trainable params: 0
____________________________________________

In [97]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'Adam', metrics = ['accuracy'])

In [98]:
history = model.fit(xTrain, yTrain, epochs = 10, batch_size = 64, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [99]:
result = model.evaluate(xTest, yTest) 



In [32]:
test_seq = tokenizer.texts_to_sequences(test_sentences)
test_seq_pad = pad_sequences(test_seq, maxlen = max_seq_length)

In [58]:
pred = model.predict(test_seq_pad)

In [34]:
sample = pd.read_excel('/Users/sathishkumars/Documents/Anaconda-Python/Hackathons/NewsCategory/Sample_submission.xlsx')

In [35]:
sample

Unnamed: 0,SECTION
0,3
1,3
2,3
3,3
4,3
...,...
2743,2
2744,2
2745,2
2746,2


In [59]:
np.argmax(pred, axis =1)

array([1, 2, 1, ..., 1, 0, 1])

In [60]:
submission1 = pd.DataFrame(np.argmax(pred, axis = 1), columns = ['SECTION'])

In [61]:
submission1

Unnamed: 0,SECTION
0,1
1,2
2,1
3,0
4,1
...,...
2743,1
2744,1
2745,1
2746,0


In [62]:
submission.to_excel('Submission.xlsx')