In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import *

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

In [2]:
# Load the labelled news articles that the classifier is trained on.
train_csv_path = "./data/news-train.csv"
df = pd.read_csv(train_csv_path)

In [3]:
# Preview the loaded frame; the rendered output is truncated by pandas for long frames.
df

Unnamed: 0,ArticleId,Text,Category
0,1429,sfa awaits report over mikoliunas the scottish...,sport
1,1896,parmalat to return to stockmarket parmalat th...,business
2,1633,edu blasts arsenal arsenal s brazilian midfiel...,sport
3,2178,henman decides to quit davis cup tim henman ha...,sport
4,194,french suitor holds lse meeting european stock...,business
...,...,...,...
995,1250,blair damaged by blunkett row a majority of ...,politics
996,1639,a november to remember last saturday one news...,sport
997,916,highbury tunnel players in clear the football ...,sport
998,2217,top stars join us tsunami tv show brad pitt r...,entertainment


In [4]:
# Split the frame into model input (raw article text) and target (category label).
X, y = df['Text'], df['Category']

In [5]:
# One-hot encode the category labels for the softmax / categorical-crossentropy
# output used further down.
# Encoding straight from `df['Category']` (instead of overwriting `y` with a
# transform of itself) keeps this cell safe to re-run: feeding the already
# encoded 2-D array back into `get_dummies` would fail.
# The explicit float dtype avoids depending on the pandas default for dummy
# columns, which is boolean in recent pandas releases.
y = np.asarray(pd.get_dummies(df['Category'], dtype='float32'))

In [6]:
# Fit a lower-casing word tokenizer on the article texts, capping the usable
# vocabulary at `vocab_cap` entries, and keep its word -> integer id mapping.
vocab_cap = 21223
tokenizer = Tokenizer(lower=True, num_words=vocab_cap)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

In [7]:
# Convert every article into its list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(X)

In [8]:
# Pad all id sequences to a common length. With no `maxlen` given, the longest
# sequence sets the width and shorter ones are zero-padded at the front.
data = pad_sequences(sequences, maxlen=None, padding='pre')

In [9]:
# Hold out 20% of the padded sequences (and matching labels) for validation;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(data, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [10]:
# Path to the pre-trained GloVe vector file (despite the name, this is a file
# path, not a directory; the name is kept so other cells can still refer to it).
glove_directory = './glove/glove.6B.100d.txt'

# Map each word to its embedding vector. Each line of the file is parsed as
# "<word> <coef_1> ... <coef_n>" separated by whitespace.
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing a line raises.
with open(glove_directory, encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors. ")

Found 400000 word vectors. 


In [11]:
# Spot-check one entry of the loaded word -> vector lookup.
embeddings_index['happy']

array([-0.090436 ,  0.19636  ,  0.29474  , -0.47706  , -0.80436  ,
        0.3078   , -0.55205  ,  0.58453  , -0.17056  , -0.84846  ,
        0.19528  ,  0.23671  ,  0.46827  , -0.58977  , -0.12163  ,
       -0.24697  , -0.072944 ,  0.17259  , -0.0485   ,  0.9527   ,
        0.50629  ,  0.58497  , -0.19367  , -0.45459  , -0.031095 ,
        0.51633  , -0.24052  , -0.1007   ,  0.53627  ,  0.024225 ,
       -0.50162  ,  0.73692  ,  0.49468  , -0.34744  ,  0.89337  ,
        0.057439 , -0.19127  ,  0.39333  ,  0.21182  , -0.89837  ,
        0.078704 , -0.16344  ,  0.45261  , -0.41096  , -0.19499  ,
       -0.13489  , -0.016313 , -0.021849 ,  0.17136  , -1.2413   ,
        0.079503 , -0.91144  ,  0.35699  ,  0.36289  , -0.24934  ,
       -2.1196   ,  0.14534  ,  0.52964  ,  0.90134  ,  0.033603 ,
        0.022809 ,  0.70625  , -1.0362   , -0.59809  ,  0.70592  ,
       -0.072793 ,  0.67033  ,  0.52763  , -0.47807  , -0.67374  ,
        0.36632  , -0.38284  , -0.10349  , -0.6402   ,  0.1810

In [12]:
# Vocabulary cap used to size the embedding matrix and the Embedding layer.
# Read it back from the fitted tokenizer rather than repeating the literal, so
# the tokenizer's `num_words` and the embedding size cannot drift apart.
# NOTE(review): this requires the tokenizer to have been created with an
# explicit `num_words` (it would be None otherwise).
max_words = tokenizer.num_words

In [13]:
# Width of each embedding vector; must match the vectors stored in `embeddings_index`.
embedding_dim = 100

# Row `idx` of the matrix holds the pre-trained vector of the word whose
# tokenizer id is `idx`. Rows stay all-zero for ids without a pre-trained vector.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, idx in word_index.items():
    if idx >= max_words:
        # Id falls outside the matrix; nothing to fill in.
        continue
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word; leave its row as zeros.
        continue
    embedding_matrix[idx] = vector

In [14]:
# Length of the longest tokenized article; used as the model's input length.
max_len = max(len(seq) for seq in sequences)

In [33]:
# Classifier: embedding lookup -> flatten -> small dense layer -> 5-way softmax.
model = Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim, input_shape=(max_len,)),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax'),
])
model.summary()

In [34]:
# Load the pre-built embedding matrix into the first layer and freeze it so
# training leaves those weights untouched.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [35]:
# RMSprop with a small step size for training the dense layers.
step_size = 1e-4
optimizer = tf.keras.optimizers.RMSprop(learning_rate=step_size)

In [36]:
# Compile for multi-class classification on one-hot labels, tracking accuracy.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train for 20 epochs in mini-batches of 16, evaluating on the held-out split
# after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.3979 - loss: 1.5391 - val_accuracy: 0.7100 - val_loss: 0.8430
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9261 - loss: 0.4301 - val_accuracy: 0.6400 - val_loss: 0.8736
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9929 - loss: 0.0843 - val_accuracy: 0.9000 - val_loss: 0.4184
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 1.0000 - loss: 0.0173 - val_accuracy: 0.8750 - val_loss: 0.3706
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 1.0000 - loss: 0.0056 - val_accuracy: 0.8700 - val_loss: 0.4380
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 1.0000 - loss: 0.0025 - val_accuracy: 0.9100 - val_loss: 0.2947
Epoch 7/20
[1m50/50[0m [32m━━━━

In [37]:
# Persist the trained model to disk in the native .keras format.
saved_model_path = 'val_acc_91.keras'
model.save(saved_model_path)