In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [None]:
import pandas as pd
import tensorflow as tf

In [None]:
# Location of the news-category dataset on the mounted Drive.
filename = "/content/drive/MyDrive/SDLC/news_analysis_project/data/final_news_category_dataset.json"

# The file is read with pandas' "split" JSON orientation; preview the first rows.
df = pd.read_json(path_or_buf=filename, orient="split")
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


Making a smaller dataframe that keeps only the 3 most frequent categories

In [None]:
# Keep only the rows that belong to the three most frequent categories.
# .copy() makes df_3 an independent DataFrame rather than a slice of df, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
top_categories = df['category'].value_counts().index[:3]
df_3 = df[df['category'].isin(top_categories)].copy()
df_3

Unnamed: 0,category,headline,authors,link,short_description,date
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
...,...,...,...,...,...,...
200800,WELLNESS,The Sleep Library: 11 Soothing Books For Bedtime,,https://www.huffingtonpost.comhttp://www.oprah...,Do you toss and turn until you finally sit up ...,2012-01-28
200802,WELLNESS,The Benefits of Caring for a Pet,"Rita Altman, R.N., Contributor\nSenior Vice Pr...",https://www.huffingtonpost.com/entry/pets-seni...,"For the young as well as the old, especially i...",2012-01-28
200805,WELLNESS,This Is Only the Beginning: Surprising Advice ...,"Ellie Knaus, Contributor\nAtomic Moms Podcast ...",https://www.huffingtonpost.com/entry/life-tips...,"My great-aunt Ida loves to say, ""This is only ...",2012-01-28
200838,ENTERTAINMENT,"Sundance, Ice-T, and Shades of the American Ra...","Courtney Garcia, Contributor\nI tell stories a...",https://www.huffingtonpost.com/entry/sundance-...,Representation of the collective diaspora has ...,2012-01-28


# Text processing

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; it has to be available locally before
# stopwords.words() is called below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, kept as a set for the membership test in cleaning_function.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
import string


def cleaning_function(sentence):
    """
    Normalise a piece of text and return the tokens that remain.

    The sentence is lower-cased, runs of digits and all punctuation
    characters are deleted, the result is split on whitespace, and every
    token found in the module-level ``stop_words`` collection is dropped.

    Returns a list of the surviving words in their original order.
    """
    lowered = sentence.lower()
    without_digits = re.sub(r'\d+', '', lowered)

    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # such as "ass-kicking" collapse into a single token.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    kept_words = []
    for word in stripped.split():
        if word not in stop_words:
            kept_words.append(word)
    return kept_words
    

In [None]:
# Add the cleaned text as a new column. assign() returns a new DataFrame, so
# nothing is written into a slice of `df` (the cause of SettingWithCopyWarning)
# and re-running the cell produces the same result.
df_3 = df_3.assign(
    processed_description=df_3['short_description'].map(
        lambda text: ' '.join(cleaning_function(text))
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Splitting into train and test

In [None]:
from sklearn.model_selection import train_test_split
# Encode the category names as integer codes, one code per distinct category.
y = df_3['category'].astype('category').cat.codes

# Stratified 80/20 split of the whole frame, so both text columns stay
# available on each side of the split.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    df_3,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1000,
)

In [None]:
# Vocabulary of each text column: split every description on '.', then on
# whitespace, and collect the distinct tokens.
words_processed = {
    token
    for description in df_3['processed_description'].values
    for fragment in description.split('.')
    for token in fragment.split()
}
words_raw = {
    token
    for description in df_3['short_description'].values
    for fragment in description.split('.')
    for token in fragment.split()
}
print(f"Number of unique words in the processed dataset: {len(words_processed)}")
print(f"Number of unique words in the raw dataset: {len(words_raw)}")

Number of unique words in the processed dataset: 47482
Number of unique words in the raw dataset: 82908


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer for the cleaned descriptions, fitted on the training split only.
tokenizer_processed = Tokenizer(num_words=40000)
tokenizer_processed.fit_on_texts(sentences_train['processed_description'].values)

X_train_processed = tokenizer_processed.texts_to_sequences(sentences_train['processed_description'])
X_test_processed = tokenizer_processed.texts_to_sequences(sentences_test['processed_description'])

# Tokenizer for the raw descriptions. It must be fitted on the same raw column
# it encodes below: fitted on 'processed_description' it would only know the
# cleaned vocabulary, so the "raw" experiments would not see the words that
# the cleaning step removes.
tokenizer_raw = Tokenizer(num_words=60000)
tokenizer_raw.fit_on_texts(sentences_train['short_description'].values)

X_train_raw = tokenizer_raw.texts_to_sequences(sentences_train['short_description'])
X_test_raw = tokenizer_raw.texts_to_sequences(sentences_test['short_description'])

In [None]:
# One extra slot on top of the fitted vocabulary for the reserved index 0.
vocab_size_processed = 1 + len(tokenizer_processed.word_index)
vocab_size_raw = 1 + len(tokenizer_raw.word_index)

# Padding sequences

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence lengths: cleaned descriptions are shorter than raw ones.
maxlen_processed = 25
maxlen_raw = 100

# padding='post' appends the zeros after the tokens of each sequence.
X_train_processed = pad_sequences(X_train_processed, padding='post', maxlen=maxlen_processed)
X_test_processed = pad_sequences(X_test_processed, padding='post', maxlen=maxlen_processed)

X_train_raw = pad_sequences(X_train_raw, padding='post', maxlen=maxlen_raw)
X_test_raw = pad_sequences(X_test_raw, padding='post', maxlen=maxlen_raw)

# Sanity check: show one training description next to its padded sequence.
# sentences_train keeps the original row labels after the shuffled split, so
# the row is selected by position (.iloc) to match row 2 of X_train_raw;
# a label lookup ([2]) would return a different row or raise KeyError.
sample_position = 2
print(sentences_train['short_description'].iloc[sample_position])
print(X_train_raw[sample_position, :])

The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Converting the y-vector to one-hot format, as expected by the categorical cross-entropy loss

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer class codes. The ndim guard keeps the cell safe to
# re-run: without it a second run would one-hot encode the already encoded
# arrays again and change their shape.
if y_train.ndim == 1:
    y_train = to_categorical(y_train)
if y_test.ndim == 1:
    y_test = to_categorical(y_test)

# Loading the pretrained embedding layer

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-02-11 11:19:44--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-02-11 11:19:44--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-02-11 11:19:45--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [None]:
# Show the first 50 characters of the first line of the 50-dimensional GloVe file.
!head -n 1 glove.6B.50d.txt | cut -c-50

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445


In [None]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """
    Build an embedding matrix for ``word_index`` from a GloVe-style text file.

    Every non-blank line of the file is expected to hold a word followed by
    its vector components, all separated by whitespace.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded embeddings file.
    word_index : dict
        Maps each word to the row index it should occupy in the matrix.
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        that do not occur in the file (and the reserved row 0) stay all-zero.
    """
    vocab_size = len(word_index) + 1  # one extra row for the reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # The encoding is given explicitly so the file is read the same way on
    # every platform, whatever the locale's default encoding is.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at the end of the file).
                continue
            word, vector = parts[0], parts[1:]
            idx = word_index.get(word)
            if idx is not None:
                embedding_matrix[idx] = np.asarray(
                    vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

In [None]:
# Both matrices are built from the 50-dimensional GloVe vectors.
embedding_dim = 50

embedding_matrix_processed = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer_processed.word_index,
    embedding_dim=embedding_dim,
)

embedding_matrix_raw = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer_raw.word_index,
    embedding_dim=embedding_dim,
)

# Building models on pretrained embedding layer

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import SGD

In [None]:
def create_basic_model(vocab_size, embedding_dim, embedding_matrix, maxlen,
                       pretrained_embedding=False, num_classes=3):
    """
    Build and compile a small embedding + max-pooling text classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    embedding_matrix : array-like or None
        Initial embedding weights; only used (and then required) when
        ``pretrained_embedding`` is True.
    maxlen : int
        Length of the padded input sequences.
    pretrained_embedding : bool, default False
        If True the embedding layer is initialised from ``embedding_matrix``
        and frozen; otherwise it is trained from scratch.
    num_classes : int, default 3
        Number of units of the softmax output layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model (SGD, categorical cross-entropy,
    accuracy metric).

    Raises
    ------
    ValueError
        If ``pretrained_embedding`` is True but no ``embedding_matrix`` is given.
    """
    # Fail early with a clear message instead of an obscure Keras error.
    if pretrained_embedding and embedding_matrix is None:
        raise ValueError(
            "embedding_matrix is required when pretrained_embedding=True")

    model = Sequential()
    if pretrained_embedding:
        model.add(layers.Embedding(vocab_size, embedding_dim,
                                  weights=[embedding_matrix],
                                  input_length=maxlen,
                                  trainable=False))
    else:
        model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.GlobalMaxPool1D())
    # The pooled output is already 2-D, so Flatten leaves its shape unchanged.
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.5))
    # No activation is set here, so this hidden layer is linear.
    model.add(layers.Dense(10))
    model.add(layers.Dense(num_classes, activation='softmax'))
    opt = SGD(learning_rate=0.01)
    model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=['accuracy'])
    return model

## Working on processed texts

In [None]:
# Frozen GloVe embeddings on the cleaned descriptions.
model_processed = create_basic_model(
    vocab_size=vocab_size_processed,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix_processed,
    maxlen=maxlen_processed,
    pretrained_embedding=True,
)

In [None]:
# Train for 30 epochs, using the held-out split as validation data.
history_processed = model_processed.fit(
    X_train_processed,
    y_train,
    epochs=30,
    validation_data=(X_test_processed, y_test),
)

# Score the trained model on both splits, then report the two accuracies.
loss_training_processed, accuracy_training_processed = model_processed.evaluate(
    X_train_processed, y_train, verbose=False)
loss_test_processed, accuracy_test_processed = model_processed.evaluate(
    X_test_processed, y_test, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_processed))
print("Testing Accuracy:  {:.4f}".format(accuracy_test_processed))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.6647
Testing Accuracy:  0.6513


## Working on raw texts

In [None]:
# Frozen GloVe embeddings on the raw descriptions.
model_raw = create_basic_model(
    vocab_size=vocab_size_raw,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix_raw,
    maxlen=maxlen_raw,
    pretrained_embedding=True,
)

In [None]:
# Print the layer-by-layer structure and parameter counts of the raw-text model.
model_raw.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 50)           2132300   
                                                                 
 global_max_pooling1d_3 (Glo  (None, 50)               0         
 balMaxPooling1D)                                                
                                                                 
 flatten_3 (Flatten)         (None, 50)                0         
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_6 (Dense)             (None, 10)                510       
                                                                 
 dense_7 (Dense)             (None, 3)                 33        
                                                      

In [None]:
# Train for 30 epochs, using the held-out split as validation data.
history_raw = model_raw.fit(
    X_train_raw,
    y_train,
    epochs=30,
    validation_data=(X_test_raw, y_test),
)

# Score the trained model on both splits, then report the two accuracies.
loss_training_raw, accuracy_training_raw = model_raw.evaluate(
    X_train_raw, y_train, verbose=False)
loss_test_raw, accuracy_test_raw = model_raw.evaluate(
    X_test_raw, y_test, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_raw))
print("Testing Accuracy:  {:.4f}".format(accuracy_test_raw))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.6768
Testing Accuracy:  0.6702


# Building models whose embedding layers are trained as well

## Working on processed texts

In [None]:
# Trainable 8-dimensional embedding, learned from scratch on the cleaned text.
model_processed_new = create_basic_model(
    vocab_size=vocab_size_processed,
    embedding_dim=8,
    embedding_matrix=None,
    maxlen=maxlen_processed,
)
model_processed_new.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 25, 8)             341168    
                                                                 
 global_max_pooling1d_4 (Glo  (None, 8)                0         
 balMaxPooling1D)                                                
                                                                 
 flatten_4 (Flatten)         (None, 8)                 0         
                                                                 
 dropout_4 (Dropout)         (None, 8)                 0         
                                                                 
 dense_8 (Dense)             (None, 10)                90        
                                                                 
 dense_9 (Dense)             (None, 3)                 33        
                                                      

In [None]:
# Train the from-scratch embedding model on the cleaned text for 30 epochs,
# using the held-out split as validation data.
history_processed_new = model_processed_new.fit(X_train_processed, y_train,
                    epochs=30,
                    validation_data=(X_test_processed, y_test))
loss_training_processed_new, accuracy_training_processed_new = model_processed_new.evaluate(X_train_processed, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_processed_new))
loss_test_processed_new, accuracy_test_processed_new = model_processed_new.evaluate(X_test_processed, y_test, verbose=False)
# Report this model's own test accuracy (the *_new variable), not the value
# left over from the pretrained-embedding model evaluated earlier.
print("Testing Accuracy:  {:.4f}".format(accuracy_test_processed_new))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.8310
Testing Accuracy:  0.6513


## Working on raw data

In [None]:
# Trainable 8-dimensional embedding, learned from scratch on the raw text.
model_raw_new = create_basic_model(
    vocab_size=vocab_size_raw,
    embedding_dim=8,
    embedding_matrix=None,
    maxlen=maxlen_raw,
)
model_raw_new.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 8)            341168    
                                                                 
 global_max_pooling1d_5 (Glo  (None, 8)                0         
 balMaxPooling1D)                                                
                                                                 
 flatten_5 (Flatten)         (None, 8)                 0         
                                                                 
 dropout_5 (Dropout)         (None, 8)                 0         
                                                                 
 dense_10 (Dense)            (None, 10)                90        
                                                                 
 dense_11 (Dense)            (None, 3)                 33        
                                                      

In [None]:
# Train the from-scratch embedding model on the raw text for 30 epochs,
# using the held-out split as validation data.
history_raw_new = model_raw_new.fit(X_train_raw, y_train,
                    epochs=30,
                    validation_data=(X_test_raw, y_test))
loss_training_raw_new, accuracy_training_raw_new = model_raw_new.evaluate(X_train_raw, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training_raw_new))
loss_test_raw_new, accuracy_test_raw_new = model_raw_new.evaluate(X_test_raw, y_test, verbose=False)
# Report this model's own test accuracy (the *_new variable), not the value
# left over from the pretrained-embedding model evaluated earlier.
print("Testing Accuracy:  {:.4f}".format(accuracy_test_raw_new))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.7687
Testing Accuracy:  0.6702


# Checking performance of a model trained for more epochs and with a different optimizer (RMSprop instead of SGD); *embedding_dim* is set to 20 instead of 8, the hidden dense layer has 30 units instead of 10, and training runs for 50 epochs

In [None]:
from tensorflow.keras.optimizers import RMSprop
# Same layer stack as the earlier models, but with a trainable 20-dimensional
# embedding, a 30-unit dense layer and the RMSprop optimizer.
model = Sequential([
    layers.Embedding(vocab_size_raw, 20, input_length=maxlen_raw),
    layers.GlobalMaxPool1D(),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(30),
    layers.Dense(3, activation='softmax'),
])
rmsprop = RMSprop()
model.compile(optimizer=rmsprop, loss="categorical_crossentropy", metrics=['accuracy'])

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 20)           852920    
                                                                 
 global_max_pooling1d_7 (Glo  (None, 20)               0         
 balMaxPooling1D)                                                
                                                                 
 flatten_7 (Flatten)         (None, 20)                0         
                                                                 
 dropout_7 (Dropout)         (None, 20)                0         
                                                                 
 dense_14 (Dense)            (None, 30)                630       
                                                                 
 dense_15 (Dense)            (None, 3)                 93        
                                                      

In [None]:
# Train for 50 epochs on the raw text, using the held-out split as validation data.
history = model.fit(
    X_train_raw,
    y_train,
    epochs=50,
    validation_data=(X_test_raw, y_test),
)

# Score the trained model on both splits, then report the two accuracies.
loss_training, accuracy_training = model.evaluate(X_train_raw, y_train, verbose=False)
loss_test, accuracy_test = model.evaluate(X_test_raw, y_test, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_training))
print("Testing Accuracy:  {:.4f}".format(accuracy_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training Accuracy: 0.8315
Testing Accuracy:  0.7907


# Conclusions 

All models were trained on the training split, which is 80% of the dataset restricted to the 3 most frequent categories; the remaining 20% served as the validation set. The models overfit more when they also had to learn the parameters of the Embedding layer, whereas the models with the pretrained embedding layer (of dimension 50) reached similar accuracy on the train and test sets. The first four models were trained for only 30 epochs; the final RMSprop model was trained for 50 epochs.