In [16]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Concatenate
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import datetime

In [2]:
df_true = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

In [3]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
stoplist = stopwords.words('english')
## used for standardization of words
ps = PorterStemmer()
def text_preprocessing(text):
    ## except a-z and A-Z all charectors will be replaced by space
    text = re.sub('[^a-zA-Z]', ' ', text) 
    ## then make every words in lower
    text = text.lower() 
    ## then split every words in list
    text = text.split() 
    
    text = [ps.stem(word) for word in text if not word in stoplist] 
    text = ' '.join(text)
    return text

In [5]:
## label fake as 1, true as 0
df_fake['label'], df_true["label"] = 1, 0
## Concat all useful text information
df_true['text'] = df_true['title'] + " " + df_true["text"] + " " + df_true['subject']
df_fake['text'] = df_fake['title'] + " " + df_fake["text"] + " " + df_fake['subject']
df = pd.concat([df_true[['text', 'label']], df_fake[['text', 'label']]])

In [6]:
df["text"] = df['text'].apply(text_preprocessing)

In [8]:
## Calculate maximum length for padding purpose
df["length"] = df["text"].apply(lambda x : len(x.split()))
maxlen = max(df["length"])
## Calculate unique vocabs
vocab_set = set()
for text in df["text"]:
    for char in text.split():
        vocab_set.add(char)

## vocab and index mapping
## The reseaon for plus one is that I want padding 0 to be the meaning of empty
vocab_to_int = {w: c + 1 for c, w in enumerate(list(vocab_set))}
int_to_vocab = {c + 1: w for c, w in enumerate(list(vocab_set))}
vocab_size = len(vocab_set) + 1
print ("vocab size : {}".format(vocab_size))
print ("maximum length : {}".format(maxlen))

vocab size : 89890
maximum length : 4991


In [9]:
## vocab to int index mapping
df["text_vectors"] = df['text'].apply(lambda  x : [vocab_to_int[word] for word in x.split()])

In [11]:
## sampling and datasets spliting
df = shuffle(df)
df_sample = df.sample(frac = 0.35,random_state=42)[['text_vectors', 'label']]
X = list(df_sample['text_vectors'])
y = np.array((df_sample['label']))
## padding with 0
X = pad_sequences(X, maxlen=maxlen, padding='post')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print ("train shape : {}".format(X_train.shape))
print ("test shape : {}".format(X_test.shape))

train shape : (11785, 4991)
test shape : (3929, 4991)


In [14]:
## TextCNN implementation
embedding_dims = 15
dropout_rate = 0.3
def TextCNN(vocab_size, maxlen, embedding_dims, dropout_rate):
    input = Input((maxlen,))
    embedding = Embedding(vocab_size, embedding_dims, input_length=maxlen)(input)
    convs = []
    for kernel_size in [3, 4, 5]:
        c = Conv1D(128, kernel_size, activation='relu')(embedding)
        c = GlobalMaxPooling1D()(c)
        convs.append(c)
    x = Concatenate()(convs)
    x = Dropout(dropout_rate)(x)
    output = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input, outputs=output)
    return model

## Initialize model
text_cnn_model = TextCNN(vocab_size, maxlen, embedding_dims, dropout_rate)
model = Sequential()
model.add(text_cnn_model)
print (model.summary())

## compile model
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()
epochs = 5
optimizer =tf.keras.optimizers.Adam(
    learning_rate=0.001, 
    name='Adam'
)
model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
model_1 (Functional)         (None, 1)                 1372159   
Total params: 1,372,159
Trainable params: 1,372,159
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
## record history using tensorboard
log_dir = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, update_freq='epoch')
history = model.fit(X_train, y_train, validation_split=0.2, epochs=epochs, verbose=2, batch_size=64, callbacks=[tensorboard_callback])

Epoch 1/5
148/148 - 221s - loss: 0.2997 - binary_accuracy: 0.8936 - val_loss: 0.0054 - val_binary_accuracy: 1.0000
Epoch 2/5
148/148 - 306s - loss: 0.0025 - binary_accuracy: 1.0000 - val_loss: 9.3454e-04 - val_binary_accuracy: 1.0000
Epoch 3/5
148/148 - 358s - loss: 6.6466e-04 - binary_accuracy: 1.0000 - val_loss: 3.7085e-04 - val_binary_accuracy: 1.0000
Epoch 4/5
148/148 - 363s - loss: 3.0511e-04 - binary_accuracy: 1.0000 - val_loss: 1.9416e-04 - val_binary_accuracy: 1.0000
Epoch 5/5
148/148 - 361s - loss: 1.7678e-04 - binary_accuracy: 1.0000 - val_loss: 1.1879e-04 - val_binary_accuracy: 1.0000


In [18]:
## inference
y_hat = model.predict_classes(X_test, batch_size=64)
score = f1_score(y_test, y_hat)
print ("f1 score : {}".format(score))



f1 score : 1.0


In [29]:
%load_ext tensorboard
%tensorboard --logdir .

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
