In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# understanding dataset
import csv
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
# Read the tab-separated BBC news file and collect, for every data row, the
# value of column 0 (label), column 2 (title) and column 3 (article text with
# stopwords removed).
sentences = []
labels = []
titles = []

# A set gives constant-time membership checks for the per-word filter below.
stopword_set = set(stopwords)

# newline='' is what the csv module expects from the file object it reads;
# the explicit encoding keeps the read independent of the platform default.
with open("/kaggle/input/bbcnewsarchive/bbc-news-data.csv", 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    # Skip the first row (presumably the column header -- confirm against the
    # file). The default avoids StopIteration if the file is empty.
    next(reader, None)
    for row in reader:
        labels.append(row[0])
        titles.append(row[2])
        # Filter word by word rather than chaining str.replace(" word ", " "):
        # the replace approach left behind capitalised stopwords ("The"),
        # stopwords at the very start or end of the text, and immediate
        # repeats of the same stopword. Whitespace is collapsed to single
        # spaces by the split/join round trip.
        kept_words = [word for word in row[3].split() if word.lower() not in stopword_set]
        sentences.append(" ".join(kept_words))


# Sanity check: how many articles were loaded, then the first text, label and title.
for preview in (len(sentences), sentences[0], labels[0], titles[0]):
    print(preview)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters shared by the tokenizer here and the models defined later.
vocab_size = 30000
embedding_dim = 32
max_length = 256

# Sentence tokenizer: learn the word index from every article, then encode
# each article as a row of max_length integer ids (padding added at the end).
sentence_tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
sentence_tokenizer.fit_on_texts(sentences)
word_index = sentence_tokenizer.word_index
print(len(word_index))
sequences = sentence_tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding='post')
print(padded[0])
print(padded.shape)

# Label tokenizer: a separate Tokenizer maps each label string to an integer
# id; label_seq holds one single-element list per article.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_word_index = label_tokenizer.word_index
label_seq = label_tokenizer.texts_to_sequences(labels)
print(label_word_index)

In [None]:
# Number of encoded articles, width of one padded row, number of encoded labels.
for size in (len(sequences), len(padded[0]), len(label_seq)):
    print(size)

In [None]:
#train test split
# Positional hold-out split: the first training_size rows train, the rest test.
# NOTE(review): nothing is shuffled here, so if the CSV is ordered by category
# the held-out tail will not be representative -- confirm against the data.
training_size = 1780
training_sentences, testing_sentences = padded[:training_size], padded[training_size:]
training_labels, testing_labels = label_seq[:training_size], label_seq[training_size:]

for split in (training_sentences, testing_sentences, training_labels, testing_labels):
    print(len(split))

In [None]:
# Convert the label lists to NumPy arrays, then report the shape of every
# array that goes into training and validation.
trainlabels = np.array(training_labels)
testlabels = np.array(testing_labels)

for arr in (training_sentences, testing_sentences, trainlabels, testlabels):
    print(arr.shape)

In [None]:
#simple model
import tensorflow as tf

# Baseline classifier: word embeddings averaged over the sequence, followed by
# a small dense head.
baseline_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    # 6 softmax outputs -- presumably one per label id with id 0 unused, since
    # the label tokenizer's ids appear to start at 1; confirm against the
    # printed label_word_index.
    tf.keras.layers.Dense(6, activation='softmax'),
]
model = tf.keras.Sequential(baseline_layers)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
#training
# Train the baseline for 10 epochs, evaluating on the held-out rows after each.
history = model.fit(
    training_sentences,
    trainlabels,
    epochs=10,
    validation_data=(testing_sentences, testlabels),
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (here, the return value of ``fit``).
        string: Metric name to plot, e.g. ``'accuracy'`` or ``'loss'``; the
            matching ``'val_' + string`` series is drawn on the same axes.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
    
# Learning curves of the baseline model: accuracy first, then loss.
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)

In [None]:
#making prediction
# Sequential.predict_classes() was removed in TensorFlow 2.6. Taking the
# arg-max over the softmax outputs of predict() yields the same class ids and
# works on both older and current releases.
labels_pred = np.argmax(model.predict(testing_sentences), axis=-1)

In [None]:
#some random results (predicted vs actual labels)
import random

def show_pred_vs_actual(predictions, actual, n_samples=10):
    """Print randomly chosen "predicted vs actual" label pairs.

    Args:
        predictions: Indexable collection of predicted labels.
        actual: Indexable collection of true labels; its length defines the
            range the random positions are drawn from.
        n_samples: Number of pairs to print, drawn with replacement.
            Defaults to 10, the count that used to be hard-coded.

    Prints nothing when ``actual`` is empty, a case in which
    ``random.randrange`` would otherwise raise ``ValueError``.
    """
    total = len(actual)
    if total == 0:
        return
    for _ in range(n_samples):
        ind = random.randrange(total)
        print(str(predictions[ind]) + " vs " + str(actual[ind]))

# Spot-check a few baseline predictions against the held-out labels.
show_pred_vs_actual(labels_pred, testlabels)

Clearly it's a bad model: the loss on the validation set keeps increasing while the accuracy stays constant, so it needs to be improved.

In [None]:
#lets shuffle data
from sklearn.model_selection import train_test_split
# Shuffled 80/20 split of the raw texts and labels, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [None]:
#tokenize data
# Text side: the vocabulary is learned from the training texts only, then both
# splits are encoded and padded at the end to max_length ids.
newtokenizer = Tokenizer(oov_token='<OOV>', num_words=15000)
newtokenizer.fit_on_texts(X_train)
word_index = newtokenizer.word_index

X_train_seq = newtokenizer.texts_to_sequences(X_train)
X_test_seq = newtokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Label side: ids come from a tokenizer fitted on the full label list; each
# split's labels are encoded and wrapped in a NumPy array.
new_label_tokenizer = Tokenizer()
new_label_tokenizer.fit_on_texts(labels)
new_label_index = new_label_tokenizer.word_index
y_train_label_seq = np.array(new_label_tokenizer.texts_to_sequences(y_train))
y_test_label_seq = np.array(new_label_tokenizer.texts_to_sequences(y_test))

In [None]:
#cnn model
# Convolutional classifier: a width-5 Conv1D over the embeddings, averaged
# over the sequence and passed to the same dense head as the baseline.
# NOTE(review): the embedding is sized with vocab_size while newtokenizer was
# created with num_words=15000 -- confirm the mismatch is intended.
cnn_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax'),
]
model1 = tf.keras.Sequential(cnn_layers)
model1.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model1.summary()

# Train for 15 epochs on the shuffled split, validating on its test part.
history1 = model1.fit(
    X_train_padded,
    y_train_label_seq,
    epochs=15,
    validation_data=(X_test_padded, y_test_label_seq),
    verbose=2,
)

In [None]:
# Learning curves of the CNN model: accuracy first, then loss.
for metric in ('accuracy', 'loss'):
    plot_graphs(history1, metric)

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6; the arg-max over
# the softmax outputs of predict() gives the same class ids.
labels_pred1 = np.argmax(model1.predict(X_test_padded), axis=-1)
show_pred_vs_actual(labels_pred1, y_test_label_seq)

In [None]:
#lstm model
# Recurrent classifier: a bidirectional LSTM (32 units per direction) over the
# embeddings, followed by the same dense head as the other models.
lstm_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax'),
]
model2 = tf.keras.Sequential(lstm_layers)
model2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

# Train for 20 epochs on the shuffled split, validating on its test part.
history2 = model2.fit(
    X_train_padded,
    y_train_label_seq,
    epochs=20,
    validation_data=(X_test_padded, y_test_label_seq),
    verbose=2,
)

In [None]:
# Learning curves of the LSTM model: accuracy first, then loss.
for metric in ('accuracy', 'loss'):
    plot_graphs(history2, metric)

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6; the arg-max over
# the softmax outputs of predict() gives the same class ids.
labels_pred2 = np.argmax(model2.predict(X_test_padded), axis=-1)
show_pred_vs_actual(labels_pred2, y_test_label_seq)

In [None]:
#accuracy scores
from sklearn.metrics import accuracy_score
# Held-out accuracy of each model. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth is passed as y_true and
# the predictions as y_pred. Plain accuracy is the same either way, but the
# order matters as soon as an asymmetric metric is substituted.
print("NN model")
print(accuracy_score(y_true=testlabels, y_pred=labels_pred))
print("CNN model")
print(accuracy_score(y_true=y_test_label_seq, y_pred=labels_pred1))
print("LSTM model")
print(accuracy_score(y_true=y_test_label_seq, y_pred=labels_pred2))

In [None]:
#plotting comparison between 3 DL models
import pandas as pd
from pandas import DataFrame
# Summarise every model by its best validation epoch: highest validation
# accuracy and lowest validation loss. (Taking max() of the loss would report
# each model's worst epoch next to its best accuracy.)
run_histories = (history, history1, history2)
accuracy = [max(run.history['val_accuracy']) for run in run_histories]
loss = [min(run.history['val_loss']) for run in run_histories]

col = {'Accuracy': accuracy, 'Loss': loss}
models = ['NN', 'CNN', 'LSTM']
df = DataFrame(data=col, index=models)
df

In [None]:
# Bar chart of the comparison table: one group of bars per model.
df.plot(kind="bar")