## Getting the data

In [1]:
# Clone the entire repo.
!git clone  https://github.com/Tariq60/LIAR-PLUS
%cd LIAR-PLUS/dataset
!ls

fatal: destination path 'LIAR-PLUS' already exists and is not an empty directory.
/home/jupyter/LIAR-PLUS/dataset
cache		   glove.6B.zip  test.csv    wwm_uncased_L-24_H-1024_A-16
glove.6B.100d.txt  labels.csv	 train2.tsv  wwm_uncased_L-24_H-1024_A-16.zip
glove.6B.200d.txt  LIAR-PLUS	 train.csv
glove.6B.300d.txt  lstm.h5	 val2.tsv
glove.6B.50d.txt   test2.tsv	 val.csv


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

## Loading the data

In [2]:
import os
import numpy as np
import csv
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

import seaborn as sb


# Defining Headers and Condstants like file names

test_filename = 'test2.tsv'
train_filename = 'train2.tsv'
valid_filename = 'val2.tsv'
header = ['id','label','statement','subjects','speaker','job_title','state','party','barely_true_counts'
         ,'false_counts','half_true_counts','mostly_true_counts','pants_on_fire_counts','venue','justification']

# Reading the data 

train_news = pd.read_csv(train_filename,sep='\t',names=header)
test_news = pd.read_csv(test_filename,sep='\t',names=header)
valid_news = pd.read_csv(valid_filename,sep='\t',names=header)




## Inspecting the data

In [38]:
train_news.head(3)

Unnamed: 0,id,label,statement,subjects,speaker,job_title,state,party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,venue,justification,text,output
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,dwayne-bohac abortion State representative Tex...,0
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...","scott-surovell energy,history,job-accomplishme...",1
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...,barack-obama foreign-policy President Illinois...,1


In [4]:
train_news.T[0]

id                                                              2635.json
label                                                               false
statement               Says the Annies List political group supports ...
subjects                                                         abortion
speaker                                                      dwayne-bohac
job_title                                            State representative
state                                                               Texas
party                                                          republican
barely_true_counts                                                      0
false_counts                                                            1
half_true_counts                                                        0
mostly_true_counts                                                      0
pants_on_fire_counts                                                    0
venue                                 

In [5]:
print(train_news.shape,valid_news.shape,test_news.shape)

(10240, 15) (1284, 15) (1267, 15)


In [6]:
# checking For nulls
train_news.info()
train_news.isna().sum()    

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10240 entries, 0 to 10239
Data columns (total 15 columns):
id                      10240 non-null object
label                   10240 non-null object
statement               10240 non-null object
subjects                10238 non-null object
speaker                 10238 non-null object
job_title               7343 non-null object
state                   8032 non-null object
party                   10238 non-null object
barely_true_counts      10238 non-null float64
false_counts            10238 non-null float64
half_true_counts        10238 non-null float64
mostly_true_counts      10238 non-null float64
pants_on_fire_counts    10238 non-null float64
venue                   10138 non-null object
justification           10156 non-null object
dtypes: float64(5), object(10)
memory usage: 1.6+ MB


id                         0
label                      0
statement                  0
subjects                   2
speaker                    2
job_title               2897
state                   2208
party                      2
barely_true_counts         2
false_counts               2
half_true_counts           2
mostly_true_counts         2
pants_on_fire_counts       2
venue                    102
justification             84
dtype: int64

### It can be seen from above analysis, that some rows do not have justification value.

In [37]:
# Creating a new Column by Combining justification and statement made by the speaker
# In case justification is not available, text column will then be filled by statement only.
# train_news['text'] = train_news[['speaker', 'subjects','job_title','state','party','statement','justification']].apply(lambda x: ' '.join(str(v) for v in x), axis=1)

train_news['text'] = train_news['statement']+train_news['justification']
train_news['text'] = train_news['text'].fillna(train_news['statement'])

valid_news['text'] = valid_news['statement']+valid_news['justification']
valid_news['text'] = valid_news['text'].fillna(valid_news['statement'])

test_news['text'] = test_news['statement']+test_news['justification']
test_news['text'] = test_news['text'].fillna(test_news['statement'])

In [8]:
train_news['label'].unique()

array(['false', 'half-true', 'mostly-true', 'true', 'barely-true',
       'pants-fire'], dtype=object)

### Making a new column 'output' to have a numerical value of the label column.

In [39]:
label_dict = {"pants-fire" : 0, "false" : 0, "barely-true" : 0, "half-true" : 1, "mostly-true" : 1, "true" : 1}
print(label_dict)

train_news['output'] = train_news['label'].apply(lambda x: label_dict[x])
valid_news['output'] = valid_news['label'].apply(lambda x: label_dict[x])
test_news['output'] = test_news['label'].apply(lambda x: label_dict[x])

num_classes = 2

{'pants-fire': 0, 'false': 0, 'barely-true': 0, 'half-true': 1, 'mostly-true': 1, 'true': 1}


In [40]:
train_news[['text','output']][:20]

Unnamed: 0,text,output
0,dwayne-bohac abortion State representative Tex...,0
1,"scott-surovell energy,history,job-accomplishme...",1
2,barack-obama foreign-policy President Illinois...,1
3,blog-posting health-care nan nan none Health c...,0
4,"charlie-crist economy,jobs nan Florida democra...",1
5,robin-vos education Wisconsin Assembly speaker...,1
6,republican-party-texas candidates-biography na...,0
7,barack-obama ethics President Illinois democra...,1
8,oregon-lottery jobs nan nan organization Howev...,1
9,"duey-stroebel energy,message-machine-2014,voti...",1


In [41]:
label_dict.keys()

dict_keys(['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'])

In [42]:
from tensorflow.python.client import device_lib
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import sys

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model

MAX_SEQUENCE_LENGTH_OF_TEXT = 500
MAX_NB_WORDS_VOCAB = 200000
EMBEDDING_DIM = 100
# VALIDATION_SPLIT = 0.2

In [43]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17908459698638968742
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 18167850733431163702
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15753943450
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4616581694039505152
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 3153260531934118978
physical_device_desc: "device: XLA_GPU device"
]


In [44]:
def getTextData(df):
    texts = []
    labels = []
    for i in range(df.text.shape[0]):
        texts.append(df.text[i])
        labels.append(df.output[i])
    return texts,labels


### Tokenizing the text column

In [45]:
train_texts,train_labels = getTextData(train_news)
test_texts,test_labels = getTextData(test_news)
val_texts,val_labels = getTextData(valid_news)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS_VOCAB)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index




In [46]:
print('Found %s unique tokens.' % len(word_index))

Found 26645 unique tokens.


In [47]:
set(train_labels)

{0, 1}

In [48]:
def getDataPostPadding(texts,labels):
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH_OF_TEXT,padding='post',truncating='post') # Pad input sequences
    categorical_labels = to_categorical(np.asarray(labels))
    return data , categorical_labels

### Padding the text column 

Pad_sequences is used to ensure that all sequences in a list have the same length. By default this is done by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence.

In [49]:
train_data,train_labels = getDataPostPadding(train_texts,train_labels)
val_data,val_labels = getDataPostPadding(val_texts,val_labels)
test_data,test_labels = getDataPostPadding(test_texts,test_labels)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', train_labels.shape)

Shape of data tensor: (10240, 500)
Shape of label tensor: (10240, 2)


### Ignoring the warnings to keep the notebook clean

In [50]:
import warnings
warnings.filterwarnings('ignore')

## Creating the model for 2-way classification

In [51]:
from keras.models import Sequential
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization
import numpy as np
from matplotlib import pyplot as plt
from keras.layers import Dense, Embedding, LSTM, GRU,Bidirectional

%matplotlib inline

In [52]:

#Using Pre-trained word embeddings
GLOVE_DIR = "." 
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    #print(values[1:])
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH_OF_TEXT)

Total 400000 word vectors in Glove.


In [53]:
embedding_vecor_length = 32
modell = Sequential()
# modell.add(Embedding(len(word_index), 100, input_length=1000))
modell.add(embedding_layer)
modell.add(Dropout(0.2))
modell.add(Conv1D(filters=32, kernel_size=5, padding='same', activation='relu'))
modell.add(MaxPooling1D(pool_size=2))
modell.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
modell.add(MaxPooling1D(pool_size=2))
modell.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
modell.add(BatchNormalization())
modell.add(Dense(256, activation='relu'))
modell.add(Dense(64, activation='relu'))
modell.add(Dense(2, activation='softmax'))
modell.compile(loss='categorical_crossentropy', optimizer='adam',
               metrics=['categorical_accuracy','binary_accuracy'])
print(modell.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 100)          2664600   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 100)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 32)           16032     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 250, 64)           6208      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 125, 64)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               132000    
__________

In [56]:
# Change the batch size to 16 when running in colab
modell.fit(train_data, train_labels, epochs=5, batch_size=64,validation_data=(val_data, val_labels))

Train on 10240 samples, validate on 1284 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc86e14ed68>

In [None]:
modell.save('Bilstm_2_way.h5')

In [None]:
modell.load_weights('Bilstm_2_way.h5')

### Testing

In [57]:
print('\n# Evaluate on test data')
results = modell.evaluate(test_data, test_labels, batch_size=256)
print('test loss, test acc:', results)


# Evaluate on test data
test loss, test acc: [0.9742754379782793, 0.5864246268863271, 0.5864246268863271]
