Group 1:
Name -- Roll Number
1. Raj Chandrashekhar Pandey -- 173
2. Sanika Pareek -- 103
3. Shreya Gaikwad -- 101
4. Kartik Gunturi -- 218

Implement Sequence model using LSTM/RNN/GRU

In [17]:
import pandas as pd
import numpy as np


#Train Test Split
from sklearn.model_selection import train_test_split

#Model Evaluation
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
#from mlxtend.plotting import plot_confusion_matrix

#Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model

In [18]:
data = pd.read_csv("train.csv")
testdata = pd.read_csv("test.csv")

#Setting the column names
data.columns = ['ClassIndex', 'Title', 'Description']
testdata.columns = ['ClassIndex', 'Title', 'Description']

#Combining title and Description
X_train = data['Title'] + " " + data['Description'] 
#Assigning class labels 
y_train = data['ClassIndex'].apply(lambda x: x-1).values 

#The same for the test dataset
x_test = testdata['Title'] + " " + testdata['Description'] 
y_test = testdata['ClassIndex'].apply(lambda x: x-1).values 

#Max Length of sentences in Train Dataset
maxlen = X_train.map(lambda x: len(x.split())).max()
data.head()

Unnamed: 0,ClassIndex,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [4]:
vocab_size = 10000 # arbitrarily chosen
embed_size = 32 # arbitrarily chosen

# Create and Fit tokenizer
tok = Tokenizer(num_words=vocab_size)
tok.fit_on_texts(X_train.values)

# Tokenize data
X_train = tok.texts_to_sequences(X_train)
x_test = tok.texts_to_sequences(x_test)

# Pad data
X_train = pad_sequences(X_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [5]:
#LSTM Model
model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=maxlen))
model.add(Bidirectional(LSTM(100, return_sequences=True))) 
model.add(Bidirectional(LSTM(60, return_sequences=True)))
model.add(GlobalMaxPooling1D()) 
model.add(Dense(1024))
model.add(Dropout(0.25)) 
model.add(Dense(512))
model.add(Dropout(0.25))
model.add(Dense(4, activation='softmax')) #softmax is used as the activation function for multi-class classification problems where class membership is required on more than two class labels.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 177, 32)           320000    
                                                                 
 bidirectional (Bidirectiona  (None, 177, 200)         106400    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 177, 120)         125280    
 nal)                                                            
                                                                 
 global_max_pooling1d (Globa  (None, 120)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1024)              123904    
                                                        

In [6]:
callbacks = [
    EarlyStopping(     #EarlyStopping is used to stop at the epoch where val_accuracy does not improve significantly
        monitor='val_accuracy',
        min_delta=1e-4,
        patience=4,
        verbose=1
    ),
    ModelCheckpoint(
        filepath='weights.h5',
        monitor='val_accuracy', 
        mode='max', 
        save_best_only=True,
        save_weights_only=True,
        verbose=1
    )
   
]

In [8]:
#Compile and Fit Model
model.compile(loss='sparse_categorical_crossentropy', #Sparse Categorical Crossentropy Loss because data is not one-hot encoded
              optimizer='adam', 
              metrics=['accuracy']) 

model.fit(X_train, 
          y_train, 
          batch_size=2048, 
          validation_data=(x_test, y_test), 
          epochs=5, 
          callbacks=callbacks)

Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.90145, saving model to weights.h5
Epoch 2/5
Epoch 2: val_accuracy improved from 0.90145 to 0.91289, saving model to weights.h5
Epoch 3/5
Epoch 3: val_accuracy did not improve from 0.91289
Epoch 4/5
Epoch 4: val_accuracy did not improve from 0.91289
Epoch 5/5
Epoch 5: val_accuracy did not improve from 0.91289


<keras.callbacks.History at 0x1f6976bae80>

In [9]:
model.load_weights('weights.h5')
model.save('model.hdf5')

In [15]:
labels = ['World News', 'Sports News', 'Business News', 'Science-Technology News']
preds = [np.argmax(i) for i in model.predict(x_test)]



In [16]:
print("Accuracy of the model is {:.2f} %".format(accuracy_score(y_test, preds)*100))


Accuracy of the model is 91.29 %
