Dataset: https://www.kaggle.com/competitions/fake-news/data

In [67]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential

nltk.download('stopwords')

df = pd.read_csv('/content/train.csv')
df = df.dropna()  # Drop NaN values

x = df.drop('label', axis=1)
y = df['label']

voc_size = 5000  # Vocabulary Size
msgs = x.copy()
msgs.reset_index(inplace=True)

ps = PorterStemmer()
corpus = []

for i in range(0, len(msgs)):
    review = re.sub('[^a-zA-Z]', ' ', msgs['title'][i])  # Keeping only alphabets
    review = review.lower()  # Converting alphabets to lowercase
    review = review.split()  # Splitting them and storing string as a list of words
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]  # If the word is not a stopword, we will keep it after stemming it
    review = ' '.join(review)
    corpus.append(review)

onehot_repr = [one_hot(words, voc_size) for words in corpus]

sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
embedding_vector_features = 40

model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_9 (LSTM)               (None, 100)               56400     
                                                                 
 dense_9 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [69]:
x_final = embedded_docs
y_final = y.values.astype(int)  # Convert y to array of integers

x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd18de144c0>

In [70]:

y_pred_prob = model.predict(x_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[3105  314]
 [ 224 2392]]
0.9108533554266777


Trying adding dropout layer to add regularization and prevent overfitting

In [74]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

embedding_vector_features = 40

model = Sequential()

# Embedding layer
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))  # Dropout regularization to prevent overfitting

# LSTM layer
model.add(LSTM(100, recurrent_dropout=0.3, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.3))  # Dropout regularization

# Dense layer
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model training
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, batch_size=64, callbacks=[early_stop])




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x7fd18668bc40>

In [75]:
x_final = embedded_docs
y_final = y.values.astype(int)  # Convert y to array of integers

x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd17744f760>

In [76]:

y_pred_prob = model.predict(x_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[3044  375]
 [ 193 2423]]
0.9058823529411765


Dealing with it: We are splitting the data after using the embedding which might lead to data leakage.

In [81]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Load the dataset
df = pd.read_csv('/content/train.csv')
df = df.dropna()  # Drop NaN values

# Split into features and labels
x = df.drop('label', axis=1)
y = df['label']

# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Perform text preprocessing
nltk.download('stopwords')
ps = PorterStemmer()
corpus = []
for i in range(len(x_train)):
    review = re.sub('[^a-zA-Z]', ' ', x_train.iloc[i]['title'])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Vocabulary size
voc_size = 5000

# Perform one-hot encoding
onehot_repr = [one_hot(words, voc_size) for words in corpus]

# Pad sequences
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

embedding_vector_features = 40

model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100, recurrent_dropout=0.3, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Model training
model.fit(embedded_docs, y_train, validation_split=0.2, epochs=100, batch_size=64, callbacks=[early_stop])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x7fd184fb4910>

In [95]:
from sklearn.metrics import confusion_matrix, accuracy_score

y_test_list = y_test.values.tolist()

y_pred_list = [int(val) for val in y_pred]

cm = confusion_matrix(y_test_list, y_pred_list)
print(cm)

accuracy = accuracy_score(y_test_list, y_pred_list)

[[3044  375]
 [ 193 2423]]


Doing hyperparameter tuning with the help of keras-tuner

In [97]:
pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [99]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow import keras
import tensorflow.keras.layers as layers
import keras_tuner as kt

In [101]:
# Load the train dataset
train_df = pd.read_csv('/content/train.csv')

# Drop NaN values
train_df = train_df.dropna()

# Split into features and labels
x_train = train_df.drop('label', axis=1)
y_train = train_df['label']

In [102]:
# Perform text preprocessing
nltk.download('stopwords')
ps = PorterStemmer()
corpus_train = []
for i in range(len(x_train)):
    review = re.sub('[^a-zA-Z]', ' ', x_train.iloc[i]['title'])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus_train.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Vocabulary size
voc_size = 5000

# Perform one-hot encoding
onehot_repr_train = [keras.preprocessing.text.one_hot(words, voc_size) for words in corpus_train]

# Pad sequences
sent_length = 20
embedded_docs_train = keras.preprocessing.sequence.pad_sequences(onehot_repr_train, padding='pre', maxlen=sent_length)

In [106]:
# Define the hypermodel
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Embedding(voc_size, 40, input_length=sent_length))

    # Tune the dropout rate
    hp_dropout = hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)
    model.add(layers.Dropout(hp_dropout))

    # Tune the number of LSTM units
    hp_units = hp.Int('units', min_value=50, max_value=150, step=10)
    model.add(layers.LSTM(units=hp_units))

    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [107]:
# Define the tuner
tuner = kt.Hyperband(build_model, objective='val_accuracy', max_epochs=10, factor=3, directory='kt_dir', project_name='my_model')

In [108]:
# Perform hyperparameter tuning
tuner.search(embedded_docs, y, epochs=10, validation_split=0.2)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)

# Train the model
model.fit(embedded_docs, y, epochs=10, validation_split=0.2)

Trial 30 Complete [00h 00m 38s]
val_accuracy: 0.5759183764457703

Best val_accuracy So Far: 0.5759183764457703
Total elapsed time: 00h 14m 17s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd18139d270>

In [115]:
# Preprocess test data
corpus_test = []
for i in range(len(x_test)):
    review = re.sub('[^a-zA-Z]', ' ',str( x_test[i]))
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus_test.append(review)

onehot_repr_test = [one_hot(words, voc_size) for words in corpus_test]
embedded_docs_test = pad_sequences(onehot_repr_test, padding='pre', maxlen=sent_length)


In [116]:
# Evaluate the model on the test data
import numpy as np
y_pred_prob = model.predict(embedded_docs_test)
y_pred = np.round(y_pred_prob).astype(int)

# Calculate accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(cm)


Accuracy: 0.49063794531897265
Confusion Matrix:
[[1558 1861]
 [1213 1403]]
