In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from google.colab import drive

In [2]:
drive.mount('/content/myDrive')

Mounted at /content/myDrive


In [3]:
train = pd.read_csv('/content/myDrive/MyDrive/fake-news/train.csv')
test = pd.read_csv('/content/myDrive/MyDrive/fake-news/test.csv')
train_data = train.copy()
test_data = test.copy()

train_data = train_data.set_index('id', drop = True)

print('train data shape', train_data.shape)
print(train_data.head())

print('test data shape', test_data.shape)
print(test_data.head())

train data shape (20800, 4)
                                                title              author  \
id                                                                          
0   House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2                   Why the Truth Might Get You Fired  Consortiumnews.com   
3   15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                 text  label  
id                                                            
0   House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1   Ever get the feeling your life circles the rou...      0  
2   Why the Truth Might Get You Fired October 29, ...      1  
3   Videos 15 Civilians Killed In Single US Airstr...      1  
4   Print \nAn Iranian woman has been sentenced to... 

In [4]:
print('missing values counts\n', train_data.isnull().sum())

# dropping missing values from text columns alone.
train_data[['title', 'author']] = train_data[['title', 'author']].fillna(value = 'Missing')
train_data = train_data.dropna()
print('missing values counts n', train_data.isnull().sum())

length = []
[length.append(len(str(text))) for text in train_data['text']]
train_data['length'] = length
print('train data length\n', train_data.head())

print('min data length', min(train_data['length']), ', max data length', max(train_data['length']), ', average data length', round(sum(train_data['length'])/len(train_data['length'])))

print('count of less then 50 character', len(train_data[train_data['length'] < 50]))

# dropping the outliers
train_data = train_data.drop(train_data['text'][train_data['length'] < 50].index, axis = 0)
print('min data length', min(train_data['length']), ', max data length', max(train_data['length']), ', average data length', round(sum(train_data['length'])/len(train_data['length'])))

missing values counts
 title      558
author    1957
text        39
label        0
dtype: int64
missing values counts n title     0
author    0
text      0
label     0
dtype: int64
train data length
                                                 title              author  \
id                                                                          
0   House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2                   Why the Truth Might Get You Fired  Consortiumnews.com   
3   15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                 text  label  length  
id                                                                    
0   House Dem Aide: We Didn’t Even See Comey’s Let...      1    4930  
1   Ever get the feeling your life circles the 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['length'] = length


In [5]:
max_features = 4500

# Tokenizing the text - converting the words, letters into counts or numbers.
# We dont need to explicitly remove the punctuations. we have an inbuilt option in Tokenizer for this purpose
tokenizer = Tokenizer(num_words = max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(texts = train_data['text'])
X = tokenizer.texts_to_sequences(texts = train_data['text'])

# now applying padding to make them even shaped.
X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')

print('X shape', X.shape)
y = train_data['label'].values
print('Y shape', y.shape)

# splitting the data training data for training and validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

X shape (20554, 4500)
Y shape (20554,)


In [6]:
# LSTM Neural Network
lstm_model = Sequential(name = 'lstm_nn_model')
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 120,  activation = 'relu', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
lstm_model.add(layer = Dense(units = len(set(y)),  activation = 'sigmoid', name = 'output_layer'))
# compiling the model
lstm_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

lstm_model_fit = lstm_model.fit(X_train, y_train, epochs = 10, batch_size = 256)

Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1762s[0m 27s/step - accuracy: 0.7070 - loss: 0.5470
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1761s[0m 27s/step - accuracy: 0.9294 - loss: 0.2051
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1755s[0m 27s/step - accuracy: 0.9414 - loss: 0.1712
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1758s[0m 27s/step - accuracy: 0.9410 - loss: 0.1706
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1767s[0m 27s/step - accuracy: 0.9573 - loss: 0.1325
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1812s[0m 27s/step - accuracy: 0.9274 - loss: 0.2149
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1756s[0m 27s/step - accuracy: 0.9494 - loss: 0.1399
Epoch 8/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1759s[0m 27s/step - accuracy: 0.8689 - loss: 0.3024
Epoch 9/10
[1m65/65[0m [32m━━

In [7]:
test_data = test.copy()
print('test_data shape', test_data.shape)

test_data = test_data.set_index('id', drop = True)
print('test_data shape', test_data.shape)

test_data = test_data.fillna(' ')
print('test_data shape', test_data.shape)
print(test_data.isnull().sum())

tokenizer.fit_on_texts(texts = test_data['text'])
test_text = tokenizer.texts_to_sequences(texts = test_data['text'])

test_text = pad_sequences(sequences = test_text, maxlen = max_features, padding = 'pre')

lstm_prediction = lstm_model.predict(test_text)

lstm_prediction_vec = np.argmax(lstm_prediction, axis=1)

print("lstm_prediction", lstm_prediction_vec)

test_data shape (5200, 4)
test_data shape (5200, 3)
test_data shape (5200, 3)
title     0
author    0
text      0
dtype: int64
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 2s/step
lstm_prediction [0 1 1 ... 0 1 0]


In [8]:
real_results = pd.read_csv('/content/myDrive/MyDrive/fake-news/submit.csv')
real_results = real_results["label"]

print("test_results", real_results)
print(real_results.shape)

accuracy = accuracy_score(real_results, lstm_prediction_vec)
precision = precision_score(real_results, lstm_prediction_vec, average='weighted')
recall = recall_score(real_results, lstm_prediction_vec, average='weighted')
f1 = f1_score(real_results, lstm_prediction_vec, average='weighted')
confisiun_matrix = confusion_matrix(real_results, lstm_prediction_vec)

classification_rep = classification_report(real_results, lstm_prediction_vec)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:\n", classification_rep)
print(f"Confisiun Matrix:\n {confisiun_matrix}")

test_results 0       0
1       1
2       0
3       1
4       1
       ..
5195    0
5196    1
5197    0
5198    1
5199    0
Name: label, Length: 5200, dtype: int64
(5200,)
Accuracy: 0.61
Precision: 0.62
Recall: 0.61
F1-Score: 0.62

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.60      0.58      2339
           1       0.66      0.63      0.64      2861

    accuracy                           0.61      5200
   macro avg       0.61      0.61      0.61      5200
weighted avg       0.62      0.61      0.62      5200

Confisiun Matrix:
 [[1401  938]
 [1065 1796]]
