In [226]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


In [227]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

In [228]:
df = pd.read_csv('/kaggle/input/fake-news/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [229]:
df.isnull().values.any()

True

In [230]:
df=df.dropna()
df.isnull().values.any()

False

In [231]:
X = df.drop('label', axis = 1)
Y = df['label']

In [232]:
print(X.shape)
print(Y.shape)

(18285, 4)
(18285,)


## One Hot representation and Word embedding

In [233]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

In [234]:
messages = X.copy()
messages.reset_index(inplace = True)

In [235]:
messages.head()

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [236]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [237]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [238]:
corpus[0:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

In [239]:
# using one_hot to do one hot encoding with a vocabuklary size of say 5000
voc_size = 5000
one_hotrep = [one_hot(word, voc_size) for word in corpus]

In [240]:
one_hotrep[0:5]  # gives us the index of the words in our vocabulary

[[4876, 3045, 2135, 2792, 893, 2692, 3939, 2857, 4553, 1854],
 [1886, 3228, 3140, 3333, 1801, 1638, 2644],
 [3526, 4350, 3924, 544],
 [2464, 4044, 4277, 2057, 2419, 3322],
 [710, 1801, 2895, 4354, 2770, 3443, 1801, 2421, 1245, 4461]]

## Padding

In [241]:
# done to make all sentences of equal length
sent_length = 20
embedded = pad_sequences(one_hotrep, maxlen = 20, padding = 'pre')
embedded[0:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 4876,
        3045, 2135, 2792,  893, 2692, 3939, 2857, 4553, 1854],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 1886, 3228, 3140, 3333, 1801, 1638, 2644],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 3526, 4350, 3924,  544],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 2464, 4044, 4277, 2057, 2419, 3322],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  710,
        1801, 2895, 4354, 2770, 3443, 1801, 2421, 1245, 4461]],
      dtype=int32)

## Buiding our LSTM Model

In [242]:
embedding_vector_features = 40  # for our embedded matrix
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [243]:
model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
lstm_17 (LSTM)               (None, 20, 100)           56400     
_________________________________________________________________
lstm_18 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 101       
Total params: 336,901
Trainable params: 336,901
Non-trainable params: 0
_________________________________________________________________


In [244]:
import numpy as np
from sklearn.model_selection import train_test_split

In [245]:
X_final = np.array(embedded)
Y_final = np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size = 0.3, random_state = 42)

In [246]:
model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb13f1e0050>

In [247]:
Y_pred = model.predict_classes(X_test)

In [248]:
from sklearn import metrics

In [249]:
metrics.confusion_matrix(Y_test, Y_pred)

array([[2867,  240],
       [ 295, 2084]])

In [250]:
metrics.accuracy_score(Y_test, Y_pred)

0.9024790375501276

In [251]:
print(metrics.classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      3107
           1       0.90      0.88      0.89      2379

    accuracy                           0.90      5486
   macro avg       0.90      0.90      0.90      5486
weighted avg       0.90      0.90      0.90      5486



### Lets load the test dataset and start the testing

In [252]:
Test=pd.read_csv('../input/fake-news/test.csv') 
Test_id=Test["id"]

In [253]:
Test_id

0       20800
1       20801
2       20802
3       20803
4       20804
        ...  
5195    25995
5196    25996
5197    25997
5198    25998
5199    25999
Name: id, Length: 5200, dtype: int64

In [254]:
Test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [255]:
Test=Test.drop(['text','id','author'],axis=1)

In [256]:
Test.fillna('fake fake fake',inplace=True)

In [257]:
ps = PorterStemmer()
corpus_test = []
for i in range(0, len(Test)):
    review = re.sub('[^a-zA-Z]', ' ',Test['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus_test.append(review)

In [258]:
one_hot_rep_Test=[one_hot(words,5000)for words in corpus_test] 

In [259]:
embedded_docs_test=pad_sequences(one_hot_rep_Test,padding='pre',maxlen = 20)
print(embedded_docs_test)

[[   0    0    0 ... 2589 1433 3004]
 [   0    0    0 ... 1742 4867 2930]
 [   0    0    0 ... 4253 4741 4916]
 ...
 [   0    0    0 ... 2589 1433 3004]
 [   0    0    0 ...  308 4261 3916]
 [   0    0    0 ... 2589 1433 3004]]


In [260]:
X_real_test=np.array(embedded_docs_test)

In [261]:
result = model.predict_classes(X_real_test)
val=[]
for i in result:
    val.append(i[0])
submission_lstm = pd.DataFrame({'id':Test_id, 'label':val})
submission_lstm.shape
import os
os.chdir(r'../working')
submission_lstm.to_csv(r'submission_lstm.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_lstm.csv')

## Bidirectional LSTM

In [262]:
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               112800    
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [263]:
model1.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0c114dfd0>

In [264]:
Y_pred = model1.predict_classes(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.9043018592781626

In [265]:
result = model1.predict_classes(X_real_test)
val=[]
for i in result:
    val.append(i[0])
submission_bi_lstm = pd.DataFrame({'id':Test_id, 'label':val})
import os
os.chdir(r'../working')
submission_bi_lstm.to_csv(r'submission_bi_lstm.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_bi_lstm.csv')

## Adding a droput layer plus an additiona bidirectional lstm layer

In [266]:
model2=Sequential()
model2.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model2.add(Bidirectional(LSTM(100,return_sequences = True)))
model2.add(Dropout(0.3))
model2.add(Bidirectional(LSTM(100)))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model2.summary())

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 20, 200)           112800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 20, 200)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 201       
Total params: 553,801
Trainable params: 553,801
Non-trainable params: 0
_________________________________________________________________
None


In [267]:
model2.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)
# much slower than above due to additon of another layer

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0beb13790>

In [268]:
Y_pred = model2.predict_classes(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.9026613197229311

## GRU Model

In [269]:
model3=Sequential()
model3.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model3.add(GRU(100, return_sequences=True))
model3.add(GRU(100))
model3.add(Dense(1,activation='sigmoid'))
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model3.summary())

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
gru_3 (GRU)                  (None, 20, 100)           42600     
_________________________________________________________________
gru_4 (GRU)                  (None, 100)               60600     
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 101       
Total params: 303,301
Trainable params: 303,301
Non-trainable params: 0
_________________________________________________________________
None


In [270]:
model3.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0bbb47cd0>

In [271]:
Y_pred = model3.predict_classes(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.909223477943857

In [272]:
result = model3.predict_classes(X_real_test)
val=[]
for i in result:
    val.append(i[0])
submission_gru = pd.DataFrame({'id':Test_id, 'label':val})
submission_gru.shape
import os
os.chdir(r'../working')
submission_gru.to_csv(r'submission_gru.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_gru.csv')

## Simple RNN model

In [273]:
model4=Sequential()
model4.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model4.add(SimpleRNN(100, return_sequences=True))
model4.add(SimpleRNN(100))
model4.add(Dense(1,activation='sigmoid'))
model4.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model4.summary())

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 20, 100)           14100     
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 101       
Total params: 234,301
Trainable params: 234,301
Non-trainable params: 0
_________________________________________________________________
None


In [274]:
model4.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0bae9dfd0>

In [275]:
Y_pred = model4.predict_classes(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.9125045570543201

In [276]:
result = model4.predict_classes(X_real_test)
val=[]
for i in result:
    val.append(i[0])
submission_rnn = pd.DataFrame({'id':Test_id, 'label':val})
submission_rnn.shape
import os
os.chdir(r'../working')
submission_rnn.to_csv(r'submission_rnn.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_rnn.csv')

## Simple Artificial Neural Network

In [277]:
model5=Sequential()
model5.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model5.add(Flatten())
model5.add(Dense(32,activation='relu'))
model5.add(Dense(1,activation='sigmoid'))
model5.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model5.summary())

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)                25632     
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 33        
Total params: 225,665
Trainable params: 225,665
Non-trainable params: 0
_________________________________________________________________
None


In [278]:
model5.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 20, batch_size = 64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fb0b7869110>

In [279]:
Y_pred = model5.predict_classes(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.9159679183375866

In [280]:
result = model5.predict_classes(X_real_test)
val=[]
for i in result:
    val.append(i[0])
submission_ann = pd.DataFrame({'id':Test_id, 'label':val})
submission_ann.shape
import os
os.chdir(r'../working')
submission_ann.to_csv(r'submission_ann.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_ann.csv')

## Logistic Regression

In [281]:
print(X_test.shape)
print(Y_test.shape)

(5486, 20)
(5486,)


In [282]:
from sklearn.linear_model import LogisticRegression

In [283]:
model6 = LogisticRegression()
model6.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [284]:
Y_pred = model6.predict(X_test)
metrics.accuracy_score(Y_test, Y_pred)

0.7065257017863653

In [285]:
X_train

array([[   0,    0,    0, ..., 2057, 2390, 3138],
       [   0,    0,    0, ..., 4684, 1067, 2644],
       [   0,    0,    0, ...,   39, 4096, 4331],
       ...,
       [   0,    0,    0, ..., 2589, 1433, 3004],
       [   0,    0,    0, ..., 3178, 1262, 2644],
       [   0,    0,    0, ..., 2895, 3080, 3633]], dtype=int32)

In [286]:
result = model6.predict(X_real_test)
result = result.reshape(-1,1)
print(result)
val=[]
for i in result:
    val.append(i[0])
submission_lr = pd.DataFrame({'id':Test_id, 'label':val})
import os
os.chdir(r'../working')
submission_lr.to_csv(r'submission_lr.csv',index = False)
from IPython.display import FileLink
FileLink(r'submission_lr.csv')

[[0]
 [1]
 [1]
 ...
 [0]
 [1]
 [1]]


## Changin our X i.e input slightly

In [287]:
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [288]:
Y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [289]:
print(X.shape)
print(Y.shape)

(18285, 4)
(18285,)


In [290]:
df['total'] = df['title']+' '+df['author']
X = df.drop('label',axis=1)
X.head()

Unnamed: 0,id,title,author,text,total
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...


In [291]:
voc_size = 5000
msg = X.copy()
msg.reset_index(inplace = True)
msg.head()

Unnamed: 0,index,id,title,author,text,total
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Consortiumne...
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...


In [292]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [293]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(msg)):
    review = re.sub('[^a-zA-Z]', ' ', msg['total'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [294]:
onehot_rep = [one_hot(words,voc_size)for words in corpus]
embedded_docs = pad_sequences(onehot_rep,padding='pre',maxlen=25)

In [315]:
model = Sequential()
model.add(Embedding(voc_size,40,input_length=25))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 25, 40)            200000    
_________________________________________________________________
dropout_6 (Dropout)          (None, 25, 40)            0         
_________________________________________________________________
lstm_25 (LSTM)               (None, 100)               56400     
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [296]:
import numpy as np
from sklearn.model_selection import train_test_split

In [297]:
X_final = np.array(embedded_docs)
Y_final = np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size = 0.3, random_state = 42)

In [316]:
model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0ae8e6650>

In [317]:
from sklearn.metrics import accuracy_score
Y_pred = model.predict_classes(X_test)
accuracy_score(Y_test, Y_pred)

0.988698505286183

## ANN - with modified data

In [312]:
model = Sequential()
model.add(Embedding(voc_size,40,input_length=25))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 25, 40)            200000    
_________________________________________________________________
flatten_9 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_39 (Dense)             (None, 32)                32032     
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 33        
Total params: 232,065
Trainable params: 232,065
Non-trainable params: 0
_________________________________________________________________
None


In [313]:
model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0b134d650>

In [314]:
Y_pred = model.predict_classes(X_test)
accuracy_score(Y_test, Y_pred)

0.995989792198323

In [318]:
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_36 (Embedding)     (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 200)               112800    
_________________________________________________________________
dense_42 (Dense)             (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [319]:
model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb0ac9c4e90>

In [320]:
Y_pred = model.predict_classes(X_test)
accuracy_score(Y_test, Y_pred)

0.988698505286183

## Conclusion:
-> Simple Artificial Neural Network perfored the best!!


## * ANN > BI-LSTM > LSTM > Simple RNN > GRU