In [1]:
#import necessary packages
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
#read the training dataset
X1 = pd.read_excel('train.xlsx', index_col='id')

In [3]:
X1.head()

Unnamed: 0_level_0,text,genuinity,column
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3459,Happy no one was hurt when #wmata train derail...,true,0.658669
5136,Deputies: Dog dispute leads to fatal shooting ...,true,0.523674
5408,Former Township fire truck being used in Phili...,fake,0.251757
1877,It's raining outside I'm burning my favorite c...,fake,0.297897
9547,Is this the end of AustraliaÂ‰Ã›Âªs best burge...,fake,0.789958


In [4]:
#single out the text and the genuinity columns in your dataset as they are your feature and target labels
data = X1[['text', 'genuinity']]

In [5]:
#convert into lower case and remove junk characters so that our word vectors are free from additional discrepancies
data['text'].apply(lambda x: x.lower()) #transform text to lowercase
data['text'] = data['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))
data['text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


id
3459    Happy no one was hurt when wmata train deraile...
5136    Deputies Dog dispute leads to fatal shooting i...
5408    Former Township fire truck being used in Phili...
1877    Its raining outside Im burning my favorite can...
9547    Is this the end of Australias best burger http...
Name: text, dtype: object

In [6]:
#tokenize the words int the training dataset so that each portion of text is an embedding of a word vector
#zero pad the sequences to maintain uniformity and avoid misreading of dataset
tokenizer = Tokenizer(num_words=5000, split=" ")
tokenizer.fit_on_texts(data['text'].values)

X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X) # padding our text vector so they all have the same length
X[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         744,   41,   58,   23, 2382,   40, 1711,  118,  472,  287,    1,
        1360,  389,    8,   29,  183,  419,   76, 1272, 1712],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        2841,  959, 1713,    4,  180,  594,    3,  611,  369],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  960,
        1714,   42,  243,  121,  568,    3, 1182, 3496,  255],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,   35, 2842,  685,   31,   83,
          12,  918, 4522,    6,  278,  177,    4, 3497,   90],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        

In [7]:
#this is a simple model
#I would recommend using a more complex model like maybe increase the dense layers or incerase the dropout to improve 
#validation accuracy
#you can also import the famous pretrained BERT model to gain the best results
model=Sequential()
model.add(Embedding(5000, 256, input_length=X.shape[1]))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

In [8]:
#we use the adam optimizer(rmsprop + momentum)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 31, 256)           1280000   
_________________________________________________________________
dropout (Dropout)            (None, 31, 256)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 31, 256)           525312    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 2,331,138
Trainable params: 2,331,138
Non-trainable params: 0
_________________________________________________________________


In [9]:
#true and fake is converted into dummies like [0,1] and [1,0] so that proper mapping of inputs to labels can take place
y = pd.get_dummies(data['genuinity']).values

In [10]:
y[0]

array([0, 1], dtype=uint8)

In [11]:
data['genuinity'].head()

id
3459    true
5136    true
5408    fake
1877    fake
9547    fake
Name: genuinity, dtype: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [13]:
model.fit(X_train,y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=2) #i used large batch size to quickly train the dataset. I would recommend that you choose your own allocation of data for better accuracy in predicions

Epoch 1/10
97/97 - 39s - loss: 0.5465 - accuracy: 0.7254 - val_loss: 0.4750 - val_accuracy: 0.7741
Epoch 2/10
97/97 - 42s - loss: 0.3753 - accuracy: 0.8462 - val_loss: 0.4915 - val_accuracy: 0.7784
Epoch 3/10
97/97 - 43s - loss: 0.2947 - accuracy: 0.8803 - val_loss: 0.5487 - val_accuracy: 0.7668
Epoch 4/10
97/97 - 43s - loss: 0.2347 - accuracy: 0.9066 - val_loss: 0.6952 - val_accuracy: 0.7434
Epoch 5/10
97/97 - 43s - loss: 0.1864 - accuracy: 0.9290 - val_loss: 0.7935 - val_accuracy: 0.7405
Epoch 6/10
97/97 - 42s - loss: 0.1467 - accuracy: 0.9473 - val_loss: 0.8356 - val_accuracy: 0.7362
Epoch 7/10
97/97 - 44s - loss: 0.1085 - accuracy: 0.9580 - val_loss: 1.0333 - val_accuracy: 0.7347
Epoch 8/10
97/97 - 42s - loss: 0.1018 - accuracy: 0.9607 - val_loss: 1.1848 - val_accuracy: 0.7201
Epoch 9/10
97/97 - 42s - loss: 0.0810 - accuracy: 0.9666 - val_loss: 1.5575 - val_accuracy: 0.7187
Epoch 10/10
97/97 - 45s - loss: 0.0773 - accuracy: 0.9682 - val_loss: 1.3634 - val_accuracy: 0.7216


<tensorflow.python.keras.callbacks.History at 0x1c232e7c788>

In [22]:
#you can save the model so that you can import and use it again
model.save('realvfake.h5')

In [23]:
predictions = model.predict(X_test)

#[print(data['text'][i], predictions[i], y_test[i]) for i in range(0, 5)]

In [24]:
predictions[0]

array([9.9960369e-01, 3.9633826e-04], dtype=float32)

In [25]:
#initialize a simple counter to get a better idea of precision and recall i.e. false positives and false negatives and correctly
#predicted values
true, fake= 0,0
real_true, real_fake = 0,0
for i, prediction in enumerate(predictions):
    if np.argmax(prediction)==1:
        true+=1
    else:
        fake+=1
    
    if np.argmax(y_test[i])==1:
        real_true+=1
    else:
        real_fake+=1

print("true, real_true", true, real_true)
print("fake, real_fake", fake, real_fake)

true, real_true 332 310
fake, real_fake 430 452


In [26]:
preds=predictions.round()

In [27]:
preds

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [28]:
y_test

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [31]:
#predict the accuracy of your dataset i.e how accurately your validation set was predicted
count = 0
for i in range(0,len(predictions)-1):
    if int(preds[i][0]) == y_test[i][0]:
        count+=1
    
    

In [33]:
val_accuracy = (count/len(preds))
print("Val accuracy:", val_accuracy)

Val accuracy: 0.7624671916010499


In [34]:
test_data = pd.read_csv('test.csv') #load the test file

In [36]:
bata = test_data[['id','text']]  #single out the id and text as your input features

In [37]:
bata.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [39]:
tokenizer.fit_on_texts(data['text'].values)
#tokenize the test set based on your tokenizer from the training set word vector 
Z = tokenizer.texts_to_sequences(bata['text'].values)
Z = pad_sequences(Z) # padding our text vector so they all have the same length
Z[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   28,  881,    2, 1926,  122,   84],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,  451,   52,
         244,    8, 1171, 2545,  600, 1984,  223],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   74,    8,    2,  181,   42,   16,  783, 3398,   21,  843,
           1,  739,    7, 1411,  324,   91,   38],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  430, 3523, 1412],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  494,
         746,  467, 4037,    3,  917,    6, 1127]])

In [41]:
preds2 = model.predict(Z)

In [42]:
#converting dummies to a list of 0 and 1 where 0 is fake and 1 is a real tweet
preds4=[]
for i in preds2:
    if i[0]>i[1]:
        preds4.append(1)
    else:
        preds4.append(0)
    

In [43]:
preds5=pd.DataFrame(preds4)  #convert to dataframe


In [44]:
preds5

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0
...,...
3258,0
3259,1
3260,0
3261,1


In [46]:
preds5.to_csv('sub_probale2.csv') #save as csv and you can copy paste the id of the tweets into the csv file to get your sample submission