<a href="https://colab.research.google.com/github/saanikagupta/Fake-News-Detection/blob/master/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Importing libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from keras import utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers, callbacks
from keras.losses import categorical_crossentropy
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


# Preprocessing

In [0]:
# Reading the excel file into DataFrame
train = pd.read_excel('train.xlsx', header = None, names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])
test = pd.read_excel('test.xlsx', header = None, names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])
val = pd.read_excel('valid.xlsx', header = None, names = ['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',	'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c',	'venue'])

In [0]:
# Dropping the 'id' column
train.drop('id', axis = 1, inplace = True)
test.drop('id', axis = 1, inplace = True)
val.drop('id', axis = 1, inplace = True)

In [0]:
train.head(5)

Unnamed: 0,label,statement,subject,speaker,job,state,party,barely_true_c,false_c,half_true_c,mostly_true_c,pants_on_fire_c,venue
0,False,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
3,False,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
4,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN


In [0]:
# Checking the shape of data
print(train.shape)
print(val.shape)
print(test.shape)

(10269, 13)
(1284, 13)
(1283, 13)


In [0]:
# list(train.columns)

In [0]:
# Function for cleaning the dataset
def dataPreprocessing(filename, corpus):
  
  length = filename.shape[0]
  
  # Missing values
  filename["job"].fillna("no-job", inplace = True)
  filename["state"].fillna("no-state", inplace = True)
  
  for x in range(length):
    statement = re.sub('[^a-zA-Z]', ' ', train['statement'][x]) # Removing all numbers and special characters
    statement = statement.lower() # Converting uppercase to lowercase
    statement = statement.split()
    ps = PorterStemmer()
    statement = [ps.stem(word) for word in statement if not word in set(stopwords.words('english'))] # Stemming the dataset and removing stopwords
    statement = ' '.join(statement)
    subject = train['subject'][x].replace(',', ' ')
    speaker = train['speaker'][x]
    job = train['job'][x].lower()
    # job = job.replace(' ', '-')
    state = train['state'][x].lower()
    party = train['party'][x].lower()
    corpus.append(statement + ' '  + subject + ' ' + job + ' ' + state + ' ' + party)
  return corpus

In [0]:
corpus = []
corpus = dataPreprocessing(train, corpus) # This returns a corpus containing only the train dataset
corpus = dataPreprocessing(val, corpus) # This returns a corpus containing train and val dataset
corpus = dataPreprocessing(test, corpus) # This returns a corpus containing train, val and test dataset

In [0]:
len(corpus) # 10269 + 1284 + 1283

12836

In [0]:
# Converting the corpus into bag-of-words
cv = CountVectorizer(max_features = 8000)
X = cv.fit_transform(corpus).toarray()

In [0]:
X.shape

(12836, 8000)

In [0]:
# Obtaining the x_train, x_val and x_test from the bag-of-words (As the dataset was merged during preprocessing)
x_train = X[: 10269, :]
x_val = X[10269 : 11553, :]
x_test = X[11553 : 12836, :]

In [0]:
# x_val.shape

In [0]:
# Selecting the columns 'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',	'pants_on_fire_c'
x_train2 = train.iloc[:, 7: 12]
x_val2 = val.iloc[:, 7: 12]
x_test2 = test.iloc[:, 7: 12]

In [0]:
type(x_train)

numpy.ndarray

In [0]:
print(x_train.shape)
print(x_train2.shape)

(10269, 8000)
(10269, 5)


In [0]:
# Stacking x_train and x_train2 horizontally
x_train = np.hstack((x_train, x_train2))
x_val = np.hstack((x_val, x_val2))
x_test = np.hstack((x_test, x_test2))

In [0]:
x_train.shape

(10269, 8005)

# Six-way classification

## Preprocessing

In [0]:
num_classes = 6
# Preprocessing function for the labels
def categorize(filename):
  y = filename["label"].tolist()
  
  # Encoding the Dependent Variable
  labelencoder_y = LabelEncoder()
  y = labelencoder_y.fit_transform(y)
  
  # Converting to binary class matrix
  y = utils.to_categorical(y, num_classes)
  return y

In [0]:
y_train = categorize(train)
y_test = categorize(test)
y_val = categorize(val)

In [0]:
y_test.shape

(1283, 6)

In [0]:
# Checking for missing values
np.where(np.isnan(x_train)) 

(array([], dtype=int64), array([], dtype=int64))

In [0]:
# Checking for missing values
np.where(np.isnan(x_test)) 

(array([], dtype=int64), array([], dtype=int64))

In [0]:
# Checking for missing values
np.where(np.isnan(x_val)) 

(array([], dtype=int64), array([], dtype=int64))

## Artificial Neural Network

In [0]:
# Initializing hyperparameters
learn_rate = 0.001
batch_size = 500
epochs = 10
num_classes = 6

In [0]:
seed = 2
np.random.seed(seed)

# Creating the model
model = Sequential()
model.add(Dense(8005, activation = 'relu', kernel_initializer = 'glorot_uniform'))
model.add(Dropout(0.85))
model.add(Dense(121, activation = 'relu'))
model.add(Dropout(0.75))
model.add(Dense(num_classes, activation = 'softmax'))

In [0]:
rmsprop = optimizers.RMSprop(learn_rate)
model.compile(loss = categorical_crossentropy, optimizer = rmsprop, metrics = ['accuracy']) # Compile model

# Checkpoint
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = callbacks.ModelCheckpoint(filepath, monitor = 'val_acc', save_best_only = False, save_weights_only = False)
callbacks_list = [checkpoint]

In [0]:
# Model fitting
model.fit(x_train, y_train, batch_size = batch_size, epochs = epochs, callbacks = callbacks_list, verbose = 1, validation_data = (x_val, y_val))

Train on 10269 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9081ee7b38>

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 8005)              64088030  
_________________________________________________________________
dropout_9 (Dropout)          (None, 8005)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 121)               968726    
_________________________________________________________________
dropout_10 (Dropout)         (None, 121)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 6)                 732       
Total params: 65,057,488
Trainable params: 65,057,488
Non-trainable params: 0
_________________________________________________________________


## Evaluation

In [0]:
# Loading weights
# epoch = 4, train_acc = 38.46%, val_acc = 41.82%
model.load_weights("weights-improvement-04-0.42.hdf5")
# test_acc = 40.84%

# Estimating the accuracy on the test dataset using loaded weights
scores = model.evaluate(x_test, y_test, verbose = 0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 40.84%


In [0]:
# y_test
y_pred = model.predict(x_test)
# y_pred

In [0]:
y_pred.argmax(axis = 1)

array([3, 2, 3, ..., 2, 3, 2])

In [0]:
y_test.argmax(axis = 1)

array([1, 0, 0, ..., 2, 2, 0])

In [0]:
# Building the confusion matrix
matrix = metrics.confusion_matrix(y_test.argmax(axis = 1), y_pred.argmax(axis = 1))

In [0]:
# Confusion matrix for six-way classification
matrix

array([[102,   8,  30,  81,  22,   7],
       [ 22,  25,  26,  92,  42,   4],
       [ 16,   3,  80,  97,  15,   3],
       [ 23,   0,  14, 194,  36,   0],
       [ 18,   3,  19, 108,  99,   2],
       [ 27,  12,   9,  17,   3,  24]])

# Binary Classification

## Preprocessing

In [0]:
num_classes = 2

In [0]:
# Function for preprocessing labels
def dataPreprocessingBinary(filename):
  y = filename["label"].tolist()
  
  # Changing the 'half-true', 'mostly-true', barely-true', 'pants-fire' labels to True/False for Binary Classification
  for x in range(len(y)):
    if(y[x] == 'half-true'):
       y[x] = 'True'
    elif(y[x] == 'mostly-true'):
       y[x] = 'True'
    elif(y[x] == 'barely-true'):
       y[x] = 'False'
    elif(y[x] == 'pants-fire'):
       y[x] = 'False'
  
  # Converting the lables into binary class matrix
  labelencoder_y = LabelEncoder()
  y = labelencoder_y.fit_transform(y)
  y = utils.to_categorical(y, num_classes)
  return y

In [0]:
y_train_binary = dataPreprocessingBinary(train)
y_test_binary = dataPreprocessingBinary(test)
y_val_binary = dataPreprocessingBinary(val)

## Artificial Neural Network

In [0]:
# Hyperparameters
learn_rate = 0.001
batch_size = 500
epochs = 20

In [0]:
seed = 1
np.random.seed(seed)

# Creating model
model = Sequential()
model.add(Dense(8005, activation = 'relu', kernel_initializer = 'glorot_uniform'))
model.add(Dropout(0.9))
model.add(Dense(121, activation = 'relu'))
model.add(Dropout(0.8))
model.add(Dense(num_classes, activation = 'softmax'))

In [0]:
rmsprop = optimizers.RMSprop(learn_rate)
model.compile(loss = categorical_crossentropy, optimizer = rmsprop, metrics = ['accuracy']) # Compiling the model

# Checkpoint
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = callbacks.ModelCheckpoint(filepath, monitor = 'val_acc', save_best_only = True, save_weights_only = False)
callbacks_list = [checkpoint]

W0809 02:33:41.323594 140237346809728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [0]:
# Model fitting
model.fit(x_train, y_train_binary, batch_size = batch_size, epochs = epochs, callbacks = callbacks_list, verbose = 1, validation_data = (x_val, y_val_binary))

W0809 02:34:00.414985 140237346809728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0809 02:34:00.420901 140237346809728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0809 02:34:00.452592 140237346809728 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0809 02:34:00.469230 140237346809728 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Ins

Train on 10269 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8b34bf4b00>

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8005)              64088030  
_________________________________________________________________
dropout_1 (Dropout)          (None, 8005)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 121)               968726    
_________________________________________________________________
dropout_2 (Dropout)          (None, 121)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 244       
Total params: 65,057,000
Trainable params: 65,057,000
Non-trainable params: 0
_________________________________________________________________


## Evaluation

In [0]:
# Loading weights
# epoch = 4, train_acc = 70.15%, val_acc = 67.6%
model.load_weights("weights-improvement-04-0.68.hdf5")
# test_acc = 69.91%

# Estimating the accuracy on the test dataset using loaded weights
scores = model.evaluate(x_test, y_test_binary, verbose = 0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 69.91%


In [0]:
y_pred = model.predict(x_test) # Predicting y for x_test
matrix = metrics.confusion_matrix(y_test_binary.argmax(axis = 1), y_pred.argmax(axis = 1)) # Building the confusion matrix

In [0]:
# Confusion matrix for binary classification
matrix

array([[309, 247],
       [139, 588]])