## Loading and Reading the data

In [1]:
# Load dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read dataset
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/Deep Learning/fake-news/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Deep Learning/fake-news/test.csv")

In [None]:
# Shape of train and test data
print(f'Train shape {train.shape}')
print(f'Test shape {test.shape}')

Train shape (20800, 5)
Test shape (5200, 4)


In [None]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


There are 5 columns. ID and Author columns are not useful, we will not conider them. Label is the dependent variable. 1 means unreliable and 0 means reliable news.

In [None]:
# Checking NaN values
train.isnull().values.any()

True

In [3]:
# Dropping NaN values. Since this is a text data, we cannot fill NaN values.
train = train.filter(['text', 'label'])
train.reset_index(drop=True, inplace=True)
train.dropna(inplace=True)
train.shape

(20761, 2)

In [4]:
# Dependent and Independent Features
X = train['text']
y = train['label']

In [5]:
# Checking value counts
print(f'Value counts in percentage terms:\n{round(y.value_counts(normalize=True)*100, 2)}')

Value counts in percentage terms:
0    50.03
1    49.97
Name: label, dtype: float64


We have 50.03% of points belonging to reliable news category and 49.97% of data points labelled as unreliable data. We can consider this data without balancing it.

## Normalizing the text data

In [None]:
# Required libraries for preprocessing text data
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
stemming = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def normalize(data):
    
  i = data.lower()                            # Lower case
  i = re.sub('https?://\S+|www\.\S+', '', i)  # Remove urls
  i = re.sub('\\W', ' ', i)                   # Remove white spaces and non words
  i = re.sub(r'\d+', '', i)                   # Remove numbers
  i = re.sub('\n', '', i)
  i = re.sub(' +', ' ', i)
  i = re.sub('^ ', '', i)
  i = re.sub(' $', '', i)

  # Tokenzing and stopword removal
  text_tokens = word_tokenize(i)
  tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
  filtered_sentence = (" ").join(tokens_without_sw)

  # Stemming
  text_tokens = word_tokenize(filtered_sentence)
  stemmed_tokens = [stemming.stem(word) for word in text_tokens]
  normalized_text = (" ").join(stemmed_tokens)
  return normalized_text

X_norm = X.apply(normalize)

In [None]:
X_df = pd.DataFrame({'Text': X_norm})
X_df.to_csv('normalized_vals.csv', index=False)

In [13]:
X_norm = pd.read_csv("/content/drive/MyDrive/Deep Learning/normalized_vals.csv")
X_norm = X_norm['Text'].astype("str")

## Encoding

In [8]:
import tensorflow as tf
print(tf.__version__)

2.5.0


In [15]:
# Encoding each word from the corpus
vocab_size = 5000  # Considering 5000 words
X_encoded = [tf.keras.preprocessing.text.one_hot(words,vocab_size) for words in X_norm]  # Converts a set of sparse labels to a dense one-hot representation

This is how one-hot encoded text looks like. Each word is represented by it's index number present in the vocabulary size.

In [16]:
# Looking at the first sentence and it's one hot representation
print(X_norm[0])
print(X_encoded[0])

hous aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know anthoni weiner sext teenag appar littl thing fact matter chaffetz utah republ

In [18]:
# Embedded Representation
# Adding Padding 
embedded_repr = tf.keras.preprocessing.sequence.pad_sequences(X_encoded, maxlen = 500, padding = 'pre')
print(embedded_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 2153 3976  616  778 1045  974 4784 1710 1784
 4455 3504 2562 4034 4784 1710  849 2392 3864 2440  470 3547 4273 1604
 3246  379 2527 2491 2127 2785 1371  792 1075 1850 2264 3816 3577  843
 1296 1045 1787 2153  980 3976 4568   75  978 2517 1075 1850 3066 4024
 1045 3400 4984  974 4275 3577 4568 4764  342 1934 4550 2374 4764 3967
  775  980  288 2625 3528 1045 2371    8 1784 1829 2625 1839  978 1045
  505 1829 1839  980  775 4281 2153 4297 3651 2299 2625 1141 4286 4764
  100 3222 2390  778 3707 2099  448 1655  974 3754 2299 2625 3857 4784
 1710  325 1916 2264 2920 1784 3577  448 3577 2771 1168 4764 1900 1318
  695 

## Train Test Split

In [23]:
import numpy as np
X = np.array(embedded_repr)
y = np.array(y)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## Building Bidirectional RNN

In [30]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50, input_length=500),  # Taking 50 features for embedding and input length is 500 because we are considering 500 words.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1)
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 500, 128)          58880     
_________________________________________________________________
dense_6 (Dense)              (None, 500, 32)           4128      
_________________________________________________________________
batch_normalization_3 (Batch (None, 500, 32)           128       
_________________________________________________________________
dense_7 (Dense)              (None, 500, 16)           528       
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 16)           0         
_________________________________________________________________
dense_8 (Dense)              (None, 500, 1)           

In [31]:
model.fit(X_train, y_train, batch_size = 128, validation_data=(X_test, y_test), epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f150ee163d0>

In [32]:
model.evaluate(X_test, y_test)



[0.559907853603363, 0.9056671857833862]