In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import string
import re

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"
df = pd.read_json(filename, orient='split')
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [5]:
df_3 = df[df.category.isin(df.category.value_counts()[:3].index.values)]
df_3

Unnamed: 0,category,headline,authors,link,short_description,date
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
...,...,...,...,...,...,...
200800,WELLNESS,The Sleep Library: 11 Soothing Books For Bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,Do you toss and turn until you finally sit up ...,2012-01-28
200802,WELLNESS,The Benefits of Caring for a Pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,"For the young as well as the old, especially i...",2012-01-28
200805,WELLNESS,This Is Only the Beginning: Surprising Advice ...,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,"My great-aunt Ida loves to say, ""This is only ...",2012-01-28
200838,ENTERTAINMENT,"Sundance, Ice-T, and Shades of the American Ra...","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,Representation of the collective diaspora has ...,2012-01-28


# Finding the ratio between the number samples to an average words in a sample

The above ratio being above 1500 signals that one should use a sequence model while less than 1500 means that bag-of-ngrams should work better

In [6]:
def find_ratio(df):
    col = 'short_description'
    return len(df)/df[col].apply(lambda x: len([j.translate(str.maketrans('', '', string.punctuation)) for i in x.split('.') for j in i.split()])).mean()

In [7]:
%time
find_ratio(df)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.58 µs


10123.495628334394

In [8]:
find_ratio(df_3)

3490.620950847106

As we can see, it is much better to go for a sequence model 

In [9]:
from sklearn.model_selection import train_test_split
y = df_3['category'].astype('category').cat.codes
y_encoded = keras.utils.to_categorical(y)


sentences_train, sentences_test, y_train, y_test = train_test_split(df_3['short_description'], y_encoded, test_size=0.2, stratify=y, random_state=1000)

In [10]:
batch_size = 32

train_ds = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(sentences_train.values, tf.string),
            y_train
        )
    )
)

test_ds = (
    tf.data.Dataset.from_tensor_slices(
        (
          tf.cast(sentences_test.values, tf.string),
          y_test
        )
    )
)

train_ds = train_ds.batch(batch_size)
test_ds = test_ds.batch(batch_size)

# Simple model on unigrams

In [11]:
text_vectorization = keras.layers.TextVectorization(
        max_tokens=20000,
        output_mode="multi_hot",
)

def custom_standardization_fn(string_tensor): 
    lowercase_string = tf.strings.lower(string_tensor) 
    return tf.strings.regex_replace(lowercase_string, f"[{re.escape(string.punctuation)}]", "")


In [12]:
text_only_train_ds = train_ds.map(lambda x, y: x) 

text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=-1)

binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=-1)

In [13]:
for inputs, targets in binary_1gram_train_ds:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print(targets)
  break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32, 3)
targets.dtype: <dtype: 'float32'>
inputs[0]: tf.Tensor([0. 0. 0. ... 0. 0. 0.], shape=(20000,), dtype=float32)
tf.Tensor(
[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]], shape=(32, 3), dtype=float32)


In [14]:
def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = keras.layers.Dense(hidden_dim, activation="relu")(inputs) 
    x = keras.layers.Dropout(0.5)(x)
    outputs = keras.layers.Dense(3, activation="softmax")(x)
    model = keras.Model(inputs, outputs) 
    model.compile(optimizer="rmsprop",
                    loss="categorical_crossentropy",
                    metrics=["accuracy"])
    return model

In [15]:
model = get_model()
model.summary()

model.fit(binary_1gram_train_ds.cache(),
          epochs=10
)

print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 51        
                                                                 
Total params: 320,067
Trainable params: 320,067
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.824


# Simple model on bigrams

In [16]:
text_vectorization_bigrams = keras.layers.TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)

In [17]:
text_vectorization_bigrams.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization_bigrams(x), y), num_parallel_calls=-1)
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization_bigrams(x), y), num_parallel_calls=-1)

In [18]:
model_bigrams = get_model()
model_bigrams.summary()

model_bigrams.fit(binary_2gram_train_ds.cache(),
          epochs=10
)

print(f"Test acc: {model_bigrams.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 3)                 51        
                                                                 
Total params: 320,067
Trainable params: 320,067
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.817


# Model with LSTM layers

In [7]:
tokenizer_raw = keras.preprocessing.text.Tokenizer(num_words=60000)
tokenizer_raw.fit_on_texts(sentences_train)

X_train_raw = tokenizer_raw.texts_to_sequences(sentences_train)
X_test_raw = tokenizer_raw.texts_to_sequences(sentences_test)

In [8]:
vocab_size_raw = len(tokenizer_raw.word_index) + 1

In [9]:
maxlen_raw = 100

X_train_raw = keras.preprocessing.sequence.pad_sequences(X_train_raw, padding='post', maxlen=maxlen_raw)
X_test_raw = keras.preprocessing.sequence.pad_sequences(X_test_raw, padding='post', maxlen=maxlen_raw)

## Building a model with a pretrained embedding layer

In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-02-19 19:42:19--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-02-19 19:42:19--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-02-19 19:42:19--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [11]:
!head -n 1 glove.6B.50d.txt | cut -c-50

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445


In [12]:
import numpy as np

embedding_dim = 100


def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

embedding_matrix_raw = create_embedding_matrix('glove.6B.100d.txt', 
                                               tokenizer_raw.word_index,
                                               embedding_dim)    

### Model with single UniDirectional LSTM layer

In [13]:
model_simple_lstm = keras.models.Sequential()
model_simple_lstm.add(keras.layers.Embedding(vocab_size_raw, embedding_dim, weights=[embedding_matrix_raw], input_length=maxlen_raw, trainable=False))
model_simple_lstm.add(keras.layers.LSTM(maxlen_raw))
model_simple_lstm.add(keras.layers.Dense(3, activation='softmax'))

In [14]:
# opt = keras.optimizers.RMSprop()
opt = keras.optimizers.Adam()


In [15]:
model_simple_lstm.compile(loss = "categorical_crossentropy", optimizer = opt, metrics=['accuracy'])

In [16]:
model_simple_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          4053900   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 4,134,603
Trainable params: 80,703
Non-trainable params: 4,053,900
_________________________________________________________________


In [17]:
callbacks = [
             keras.callbacks.EarlyStopping(patience=3)
]

In [18]:
history_raw = model_simple_lstm.fit(X_train_raw, y_train,
                    epochs=30,
                    validation_data=(X_test_raw, y_test), callbacks=callbacks, batch_size=32)
loss_training_raw, accuracy_training_raw = model_simple_lstm.evaluate(X_train_raw, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_raw))
loss_test_raw, accuracy_test_raw = model_simple_lstm.evaluate(X_test_raw, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test_raw))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Training Accuracy: 0.8430
Testing Accuracy:  0.8247


# Model with single BiDirectional LSTM Layer and Embedding dim set to 100

In [20]:
model_simple_bilstm = keras.models.Sequential()
model_simple_bilstm.add(keras.layers.Embedding(vocab_size_raw, embedding_dim, weights=[embedding_matrix_raw], input_length=maxlen_raw, trainable=False))
model_simple_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(maxlen_raw)))
model_simple_bilstm.add(keras.layers.Dense(3, activation='softmax'))

In [23]:
model_simple_bilstm.compile(loss = "categorical_crossentropy", optimizer = keras.optimizers.RMSprop(), metrics=['accuracy'])

In [24]:
model_simple_bilstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          4053900   
                                                                 
 bidirectional (Bidirectiona  (None, 200)              160800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 603       
                                                                 
Total params: 4,215,303
Trainable params: 161,403
Non-trainable params: 4,053,900
_________________________________________________________________


In [27]:
history_raw_bilstm = model_simple_bilstm.fit(X_train_raw, y_train,
                    epochs=30,
                    validation_data=(X_test_raw, y_test), callbacks=callbacks, batch_size=32)
loss_training_raw_bilstm, accuracy_training_raw_bilstm = model_simple_bilstm.evaluate(X_train_raw, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_raw_bilstm))
loss_test_raw_bilstm, accuracy_test_raw_bilstm = model_simple_bilstm.evaluate(X_test_raw, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test_raw_bilstm))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Training Accuracy: 0.8976
Testing Accuracy:  0.8422


### Model with a single BiDirectional LSTM layer and another dense layer

In [28]:
model_bilstm = keras.models.Sequential()
model_bilstm.add(keras.layers.Embedding(vocab_size_raw, embedding_dim, weights=[embedding_matrix_raw], input_length=maxlen_raw, trainable=False))
model_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(maxlen_raw)))
model_bilstm.add(keras.layers.Dense(50, activation='relu'))
model_bilstm.add(keras.layers.Dense(3, activation='softmax'))

In [29]:
model_bilstm.compile(loss = "categorical_crossentropy", optimizer = keras.optimizers.RMSprop(), metrics=['accuracy'])

In [30]:
model_bilstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          4053900   
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              160800    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 50)                10050     
                                                                 
 dense_2 (Dense)             (None, 3)                 153       
                                                                 
Total params: 4,224,903
Trainable params: 171,003
Non-trainable params: 4,053,900
_________________________________________________________________


In [31]:
history_bilstm = model_bilstm.fit(X_train_raw, y_train,
                    epochs=30,
                    validation_data=(X_test_raw, y_test), callbacks=callbacks, batch_size=32)
loss_training_bilstm, accuracy_training_bilstm = model_bilstm.evaluate(X_train_raw, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_raw_bilstm))
loss_test_bilstm, accuracy_test_bilstm = model_bilstm.evaluate(X_test_raw, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test_bilstm))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Training Accuracy: 0.8976
Testing Accuracy:  0.8370


# Checking the performance on entire dataset, with all (41) categories

In [32]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size_raw, embedding_dim, weights=[embedding_matrix_raw], input_length=maxlen_raw, trainable=False))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(maxlen_raw)))
model.add(keras.layers.Dense(82, activation='relu'))
model.add(keras.layers.Dense(41, activation='softmax'))

Splitting into train/test datasets

In [34]:
y_full = df['category'].astype('category').cat.codes
y_encoded_full = keras.utils.to_categorical(y_full)


sentences_train_full, sentences_test_full, y_train_full, y_test_full = train_test_split(df['short_description'], y_encoded_full, test_size=0.2, stratify=df['category'], random_state=1000)

In [45]:
print("Finding the number of unique words in the short_description column: ")
words_set = set()
for article in df['short_description'].values:
    for sentence in article.split('.'):
        for word in sentence.split(' '):
            word = "".join([i for i in word if i.isalpha() or i.isspace()]).lower()
            if word:
                words_set.add(word)

print(len(words_set))

Finding the number of unique words in the short_description column: 
90376


In [47]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=80000)
tokenizer.fit_on_texts(sentences_train_full)

X_train = tokenizer.texts_to_sequences(sentences_train_full)
X_test = tokenizer.texts_to_sequences(sentences_test_full)

In [48]:
maxlen_raw = 100

X_train = keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=maxlen_raw)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=maxlen_raw)

In [35]:
model.compile(loss = "categorical_crossentropy", optimizer = keras.optimizers.RMSprop(), metrics=['accuracy'])

In [36]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          4053900   
                                                                 
 bidirectional_2 (Bidirectio  (None, 200)              160800    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 82)                16482     
                                                                 
 dense_4 (Dense)             (None, 41)                3403      
                                                                 
Total params: 4,234,585
Trainable params: 180,685
Non-trainable params: 4,053,900
_________________________________________________________________


In [49]:
history = model.fit(X_train, y_train_full,
                    epochs=40,
                    validation_data=(X_test, y_test_full), callbacks=callbacks, batch_size=32)
loss_training, accuracy_training = model.evaluate(X_train, y_train_full, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training))
loss_test, accuracy_test = model.evaluate(X_test, y_test_full, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Training Accuracy: 0.3563
Testing Accuracy:  0.3256
