# Preparation

In [None]:
# Use this to install libraries if you find them missing on your system:
#!pip install bs4
#!pip install scikit-learn
#!pip install nltk
#!pip install gensim
#!pip install lxml
#!pip install keras

In [None]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, model_from_json
from keras.callbacks import EarlyStopping
import gensim
import gensim.downloader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import tensorflow as tf
import json
import joblib


In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# The result is used below for `word in word2vec` membership tests and `word2vec[word]` lookups.
# NOTE(review): presumably fetched from the network on first use and cached afterwards — confirm.
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Width of each word vector; sizes the embedding matrices and is passed to ConvNet.
EMBEDDING_DIM = 300
# Passed to the Keras Tokenizer as num_words (cap on the vocabulary used in sequences).
MAX_VOCAB_SIZE = 262144
# Base sequence length; the pad_sequences / ConvNet calls below all use MAX_SEQUENCE_LENGTH-1.
MAX_SEQUENCE_LENGTH = 200

In [None]:
# Adapted from Yoon Kim model
# Kim, Y. (2014). Convolutional neural networks for sentence classification. https://doi.org/10.48550/arXiv.1408.5882

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build and compile a 1-D convolutional text classifier on top of given embeddings.

    Parameters
    ----------
    embeddings : array of shape (num_words, embedding_dim)
        Initial weights of the embedding layer.
    max_sequence_length : int
        Length of the (padded) integer sequences fed to the model.
        With ``extra_conv=True`` the three pooled branches are concatenated on
        the channel axis, so they must have the same length; with kernel sizes
        3/4/5 and pool size 3 that holds when ``max_sequence_length % 3 == 1``
        (e.g. 199) and fails for e.g. 200.
    num_words : int
        Number of rows of the embedding matrix.
    embedding_dim : int
        Width of each embedding vector.
    labels_index : int
        Number of output units. ``1`` builds a single sigmoid unit trained with
        binary cross-entropy; any other value builds a softmax layer trained
        with categorical cross-entropy (expects one-hot targets).
    trainable : bool, default False
        Whether the embedding weights are updated during training.
    extra_conv : bool, default True
        If true, use three parallel convolutions (kernel sizes 3, 4, 5) and
        concatenate them; otherwise use a single convolution with kernel size 3.

    Returns
    -------
    keras.Model
        The compiled model. Its summary is printed as a side effect.
    """
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only build the branch that actually feeds the classifier head.
    if extra_conv:
        convs = []
        for filter_size in [3, 4, 5]:
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            convs.append(l_pool)
        features = Concatenate(axis=-1)(convs)
    else:
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)

    # A softmax over a single unit is constantly 1.0, so a one-unit head must
    # use a sigmoid. For several mutually exclusive classes, softmax is paired
    # with categorical cross-entropy so that 'acc' is categorical accuracy.
    if labels_index == 1:
        output_activation = 'sigmoid'
        loss = 'binary_crossentropy'
    else:
        output_activation = 'softmax'
        loss = 'categorical_crossentropy'
    preds = Dense(labels_index, activation=output_activation)(x)

    model = Model(sequence_input, preds)
    model.compile(loss=loss,
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

# Part 1

In [None]:
# Load the pre-cleaned fake / real news texts for Part 1 and wrap each in a DataFrame.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code —
# only load pickles that come from a trusted source.
df_fake_1 = pd.DataFrame(pd.read_pickle("fake_news_data/clean/fake.pkl"))
df_real_1 = pd.DataFrame(pd.read_pickle("fake_news_data/clean/real.pkl"))

In [None]:
# Name the single column "text" in both frames (assumes each frame has exactly one column).
df_fake_1.columns = ["text"]
df_real_1.columns = ["text"]

In [None]:
# Binary target for Part 1: True marks real news, False marks fake news.
df_real_1["label"] = True
df_fake_1["label"] = False

In [None]:
# Stack real and fake articles, then shuffle all rows (frac=1 keeps every row).
df_1 = pd.concat([df_real_1, df_fake_1]).sample(frac=1)
df_1


In [None]:
# Features are the raw texts, targets the boolean real/fake label.
X1, y1 = df_1['text'], df_1['label']
# Hold out 10% of the rows as the Part 1 test set.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.1)
# Un-numbered aliases of the same split.
X_train, X_test, y_train, y_test = X1_train, X1_test, y1_train, y1_test

In [None]:
# Word-level tokenizer for Part 1, fitted on the training texts only.
tokenizer1 = Tokenizer(num_words=MAX_VOCAB_SIZE,
                      lower=True,
                      char_level=False,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      split=' ')

tokenizer1.fit_on_texts(X1_train.tolist())

train_word_index_1 = tokenizer1.word_index
# One row per word index plus row 0, which stays all-zero (index 0 is the padding value).
train_embedding_weights_1 = np.zeros((len(train_word_index_1)+1, EMBEDDING_DIM))

for word, index in train_word_index_1.items():
    if word in word2vec:
        # Known word: copy its pretrained vector.
        train_embedding_weights_1[index,:] = word2vec[word]
    else:
        # Unknown word: small zero-centred random vector, U(-0.25, 0.25), as in
        # Kim's reference implementation. The previous U[0, 1) draw produced
        # all-positive vectors on a much larger scale than the pretrained ones,
        # and with a frozen embedding layer they are never corrected.
        train_embedding_weights_1[index,:] = np.random.uniform(-0.25, 0.25, EMBEDDING_DIM)

print("embedding dim:", train_embedding_weights_1.shape)

In [None]:
# Step 1: map each text to a list of word indices using the Part 1 tokenizer.
training_sequences_1 = tokenizer1.texts_to_sequences(X1_train.tolist())
test_sequences_1 = tokenizer1.texts_to_sequences(X1_test.tolist())

# Step 2: pad / truncate every sequence to the fixed model input length.
train_cnn_data_1 = pad_sequences(training_sequences_1, maxlen=MAX_SEQUENCE_LENGTH-1)
test_cnn_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH-1)

In [None]:
# Training hyperparameters for the Part 1 (real vs. fake) model.
batch_size_1 = 256
# Number of output units; passed to ConvNet as labels_index.
labels_1 = 1
num_epochs_1 = 10

In [None]:
# Build the Part 1 classifier on the frozen Part 1 embedding matrix.
model1 = ConvNet(
    embeddings=train_embedding_weights_1,
    max_sequence_length=MAX_SEQUENCE_LENGTH-1,
    num_words=len(train_word_index_1)+1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=labels_1,
    trainable=False,
)


In [None]:
# Make TensorFlow run tf.function-decorated code eagerly instead of as compiled graphs.
# NOTE(review): this is normally a debugging aid and tends to slow training — confirm it is still required.
tf.config.run_functions_eagerly(True)

In [None]:
# Train model1; the last 10% of the training data is held out for validation.
hist = model1.fit(
    train_cnn_data_1,
    y1_train,
    batch_size=batch_size_1,
    epochs=num_epochs_1,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Raw model1 outputs for the held-out Part 1 test sequences.
y1_predicted = model1.predict(test_cnn_data_1, batch_size=1024, verbose=1)

In [None]:
# Turn the raw outputs into hard 0/1 labels by rounding to the nearest integer.
y1_pred = np.round(y1_predicted)


In [None]:
# Test-set metrics for model1, rounded to four decimals.
acc_model1 = round(accuracy_score(y1_test, y1_pred), 4)
# Precision, recall and F1 all use the support-weighted average over classes.
prec_model1, rec_model1, f1_model1 = (
    round(metric(y1_test, y1_pred, average="weighted"), 4)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Collect the Part 1 metrics in one place.
# NOTE(review): the key spellings are inconsistent ("precision" has no trailing colon);
# they are kept as-is in case other code looks them up.
evaluation_dict = {
    "accuracy:": acc_model1,
    "precision": prec_model1,
    "recall:": rec_model1,
    "f1:": f1_model1,
}

# Part 2

In [None]:
# Load the Part 2 dataset (read_csv already returns a DataFrame).
df_fake_2 = pd.read_csv("fake.csv")
# Drop the metadata columns that are not used below; "text" and "type" are the ones used later.
df_fake_2 = df_fake_2.drop(
    columns=[
        "title", "uuid", "ord_in_thread", "author", "published", "language",
        "crawled", "site_url", "country", "domain_rank", "thread_title",
        "spam_score", "main_img_url", "replies_count", "participants_count",
        "likes", "comments", "shares",
    ]
)
# Missing texts become empty strings. A bare astype(str) would turn NaN into
# the literal word "nan", which would then be tokenised as real content.
df_fake_2["text"] = df_fake_2["text"].fillna("").astype(str)

In [None]:
# Replace the categorical "type" column by one indicator column per category.
df_fake_2 = (
    df_fake_2
    .join(pd.get_dummies(df_fake_2['type']))
    .drop('type', axis=1)
)

In [None]:
# Separate the rows flagged as 'bs' from all the other types.
is_bs = df_fake_2.bs == True
df_fake_bs = df_fake_2[is_bs]
df_fake_notbs = df_fake_2[~is_bs]
print("Number of rows with label 'bs':", len(df_fake_bs),"\nNumber of rows with other labels:", len(df_fake_notbs))

In [None]:
# Keep a random 5% of the 'bs' rows (presumably to reduce its dominance over the other types).
df_fake_bs_sample = df_fake_bs.sample(frac=0.05)
# Recombine with the remaining types and shuffle all rows.
df_fake_new = pd.concat([df_fake_bs_sample, df_fake_notbs]).sample(frac=1)
df_fake_new

In [None]:
# Hold out 20% of the rebalanced Part 2 data as the test set.
train, test = train_test_split(df_fake_new, test_size=0.2)

In [None]:
# Inputs: the raw article texts.
X2_train, X2_test = train["text"], test["text"]

# Targets: one indicator column per fake-news type
# (presumably the columns created from "type" by get_dummies — verify they match).
label_names = ["bias", "bs", "conspiracy", "fake", "hate", "junksci", "satire", "state"]
y2_train, y2_test = (part[label_names].values for part in (train, test))

In [None]:
# Word-level tokenizer for Part 2, fitted on the training texts only.
tokenizer2 = Tokenizer(num_words=MAX_VOCAB_SIZE,
                      lower=True,
                      char_level=False,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      split=' ')

tokenizer2.fit_on_texts(X2_train.tolist())

train_word_index_2 = tokenizer2.word_index
# One row per word index plus row 0, which stays all-zero (index 0 is the padding value).
train_embedding_weights_2 = np.zeros((len(train_word_index_2)+1, EMBEDDING_DIM))

for word,index in train_word_index_2.items():
    if word in word2vec:
        # Known word: copy its pretrained vector.
        train_embedding_weights_2[index,:] = word2vec[word]
    else:
        # Unknown word: small zero-centred random vector, U(-0.25, 0.25), as in
        # Kim's reference implementation. The previous U[0, 1) draw produced
        # all-positive vectors on a much larger scale than the pretrained ones,
        # and with a frozen embedding layer they are never corrected.
        train_embedding_weights_2[index,:] = np.random.uniform(-0.25, 0.25, EMBEDDING_DIM)

print("embedding dim:", train_embedding_weights_2.shape)

In [None]:
# Step 1: map each text to a list of word indices using the Part 2 tokenizer.
training_sequences_2 = tokenizer2.texts_to_sequences(X2_train.tolist())
test_sequences_2 = tokenizer2.texts_to_sequences(X2_test.tolist())

# Step 2: pad / truncate every sequence to the fixed model input length.
train_cnn_data_2 = pad_sequences(training_sequences_2, maxlen=MAX_SEQUENCE_LENGTH-1)
test_cnn_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH-1)

# Number of training examples.
print(len(train_cnn_data_2))

In [None]:
# Training hyperparameters for the Part 2 (fake-news type) model.
batch_size_2 = 256
# Number of output units passed to ConvNet as labels_index; equals len(label_names).
labels_2 = 8
num_epochs_2 = 30

In [None]:
# Build the Part 2 classifier on the frozen Part 2 embedding matrix.
model2 = ConvNet(
    embeddings=train_embedding_weights_2,
    max_sequence_length=MAX_SEQUENCE_LENGTH-1,
    num_words=len(train_word_index_2)+1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=labels_2,
    trainable=False,
)

In [None]:
# Print the layer-by-layer summary of model2.
# NOTE(review): ConvNet already calls model.summary() when building the model, so this prints it a second time.
model2.summary()

In [None]:
# Make TensorFlow run tf.function-decorated code eagerly.
# NOTE(review): the same flag is already set before training model1 earlier in the notebook,
# so this call is redundant on a top-to-bottom run; eager mode also tends to slow training.
tf.config.run_functions_eagerly(True)

In [None]:
# Train model2; the last 10% of the training data is held out for validation.
# Note that this rebinds `hist`, replacing the Part 1 training history.
hist = model2.fit(
    train_cnn_data_2,
    y2_train,
    batch_size=batch_size_2,
    epochs=num_epochs_2,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Raw model2 outputs (one score per label in label_names) for the Part 2 test sequences.
y2_predicted = model2.predict(test_cnn_data_2, batch_size=1024, verbose=1)

In [None]:
# Convert the per-label scores into a 0/1 indicator matrix: within each row,
# the highest non-zero score gets 1 and everything else 0 (tied maxima all get 1).
df2_pred = pd.DataFrame(y2_predicted)
df2_pred = (
    df2_pred
    .where(df2_pred != 0)                            # exact zeros become NaN and are never ranked
    .rank(axis=1, ascending=False, method='dense')   # rank 1 = largest score in the row
    .eq(1)
    .astype(int)
)
np2_pred = df2_pred.to_numpy()

In [None]:
# Test-set metrics for model2, rounded to four decimals.
acc_model2 = round(accuracy_score(y2_test, np2_pred), 4)
# Precision, recall and F1 all use the support-weighted average over labels.
prec_model2, rec_model2, f1_model2 = (
    round(metric(y2_test, np2_pred, average="weighted"), 4)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Collect the Part 2 metrics.
# NOTE(review): this rebinds `evaluation_dict`, so the Part 1 metrics stored under the same name are replaced.
evaluation_dict = {
    "accuracy:": acc_model2,
    "precision": prec_model2,
    "recall:": rec_model2,
    "f1:": f1_model2,
}

Save the model

In [None]:
# Serialize the whole model2 object to "model2.sav" with joblib.
# NOTE(review): joblib pickles the object; whether a Keras model round-trips this way
# depends on the installed Keras/TensorFlow version — confirm the file can be reloaded.
joblib.dump(model2, "model2.sav")

In [None]:
# Export the architecture (as JSON) and the weights of model2 separately.
model_json = model2.to_json()
model2.save_weights('model2_weights')
with open('model2.json', 'w') as f:
    # model_json is already a JSON string, so json.dump stores it as a quoted JSON
    # string literal; read it back with json.load(f) before passing it to model_from_json.
    json.dump(model_json, f)

In [None]:
# Eyeball a random 10% of the 0/1 prediction rows.
pd.DataFrame(np2_pred).sample(frac= 0.1)

# Check Models Compatibility
(applying the model from Part 2 to the test articles that the model from Part 1 classified as fake)

In [None]:
# Pair every Part 1 test text with the label model1 predicted for it.
df_combine = pd.DataFrame(X1_test)
df_combine["label"] = y1_pred
df_combine = df_combine.reset_index(drop=True)

# Keep the texts predicted as fake (label 0) and draw a random 10% of them.
df_combine_fake = df_combine[df_combine['label'] == 0.0]
df_sample_fake = df_combine_fake.sample(frac=0.1)
df_sample_fake

In [None]:
# Encode the sampled texts with the Part 2 tokenizer (model2 was trained on its word indices)
# and pad them to the model input length.
combined_sample_tokenised = tokenizer2.texts_to_sequences(df_sample_fake["text"].tolist())
combined_sample_train_data = pad_sequences(combined_sample_tokenised, maxlen=MAX_SEQUENCE_LENGTH-1)

In [None]:
# Score the sampled "fake" texts with the Part 2 type classifier.
combined_predicted = model2.predict(combined_sample_train_data, batch_size=1024, verbose=1)

# Per row: 1 for the highest non-zero score, 0 elsewhere (tied maxima all get 1).
df_combined_pred = pd.DataFrame(combined_predicted)
df_combined_pred = (
    df_combined_pred
    .where(df_combined_pred != 0)
    .rank(axis=1, ascending=False, method='dense')
    .eq(1)
    .astype(int)
)

# Label the indicator columns and attach the source text of each row.
df_combined_pred.columns = label_names 
df_combined_pred["text"] = df_sample_fake["text"].values
df_combined_pred


# Test both models on input

In [None]:
# Ask the user for one news text to classify; this blocks until input is entered,
# so the cells below depend on what was typed here.
text_news = input("Insert News")

In [None]:
# texts_to_sequences expects a flat list of strings. A nested list ([[text]]) is
# treated as an already-tokenised document whose only token is the whole input
# string, which is almost never in the vocabulary and yields an empty sequence.
text_tokenised_1 = tokenizer1.texts_to_sequences([text_news])
text_tokenised_1_data = pad_sequences(text_tokenised_1, maxlen=MAX_SEQUENCE_LENGTH-1)

In [None]:
# Score the user text with model1 and turn the score into a boolean (True = real, False = fake).
text_1_predicted = model1.predict(text_tokenised_1_data, batch_size=1024, verbose=1)
# astype(bool) replaces np.bool8, an alias deprecated in NumPy 1.24 and removed in NumPy 2.0.
text_1_predicted = np.round(np.array(text_1_predicted)).astype(bool)
text_1_predicted[0,0]


In [None]:
# Stays an empty frame when model1 judged the text to be real.
df_text2_pred = pd.DataFrame(None)

# Only texts predicted as fake (0 / False) are passed on to the type classifier.
if text_1_predicted[0,0] == 0:
    # Flat list of strings: a nested list would be read as one pre-tokenised
    # document whose only token is the entire input, giving an empty sequence.
    text_tokenised_2 = tokenizer2.texts_to_sequences([text_news])
    text_tokenised_2_data = pad_sequences(text_tokenised_2, maxlen=MAX_SEQUENCE_LENGTH-1)
    text_2_predicted = model2.predict(text_tokenised_2_data, batch_size=1024, verbose=1)
    # 1 for the highest non-zero score in the row, 0 elsewhere.
    df_text2_pred = pd.DataFrame(text_2_predicted)
    df_text2_pred = df_text2_pred.where(df_text2_pred!=0).rank(1, ascending=False, method='dense').eq(1).astype(int)
    df_text2_pred.columns = label_names 

print(df_text2_pred)