In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [None]:
# Read the raw fake-news table and discard every row that has at least one
# missing value, so later cells can assume complete records.
df = pd.read_csv("./data/fake_news_data.csv").dropna(how="any")

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

In [None]:
# Count rows per value of the "real" label so any class imbalance is visible.
label_counts = df["real"].value_counts()
label_counts

In [None]:
# Balance classes
#
# Down-sample the larger class so that rows with real == 0 and rows with
# real == 1 appear equally often. The sampling is seeded so that
# Restart Kernel -> Run All selects the same rows on every run.
BALANCE_SEED = 42

df_fake = df.loc[df["real"] == 0]
df_real = df.loc[df["real"] == 1]

# The smaller class decides how many rows are kept from each side.
num_examples_per_class = min(len(df_fake), len(df_real))

df = pd.concat(
    [
        df_fake.sample(num_examples_per_class, random_state=BALANCE_SEED),
        df_real.sample(num_examples_per_class, random_state=BALANCE_SEED),
    ]
)

In [None]:
# Distinct values, in order of first appearance, used further down to build
# the domain lookup vocabulary and to adapt the title vectorizer.
unique_domains = [*df["source_domain"].unique()]
unique_titles = [*df["title"].unique()]

In [None]:
# Cap on the title vocabulary (passed as max_tokens to the text vectorizer).
num_words = 10_000
# Number of distinct source domains present in the balanced frame.
num_domains = len(unique_domains)

In [None]:

# Symbolic model inputs: one title string, one domain string and one tweet
# count per example. The `name=` values are the keys the feature dictionaries
# given to fit()/predict() have to use.
title_input = keras.Input(shape=(1,), name="title", dtype=tf.string)
source_domain_input = keras.Input(shape=(1,), name="domain", dtype=tf.string)
tweet_num_input = keras.Input(shape=(1,), name="num_tweets", dtype=tf.float32)

# Title branch: learn a vocabulary capped at `num_words` tokens from the known
# titles, then apply the layer to the symbolic input. The adapted layer keeps
# its own name so it stays reachable after `title_vectorizer` is bound to the
# layer's output tensor (which is what the following cells consume).
title_vectorizer_layer = keras.layers.TextVectorization(max_tokens=num_words)
title_vectorizer_layer.adapt(unique_titles)
title_vectorizer = title_vectorizer_layer(title_input)

# Domain branch: map each domain string to an integer index using the fixed
# vocabulary `unique_domains`.
source_domain_lookup_layer = keras.layers.StringLookup(vocabulary=unique_domains)
source_domain_lookup = source_domain_lookup_layer(source_domain_input)

# Tweet-count branch: standardise the counts. The statistics are adapted on an
# explicit (n_rows, 1) float32 column so the data layout matches the (1,)
# input above instead of relying on how a 1-D pandas Series is interpreted.
tweet_num_normalizer = keras.layers.Normalization()
tweet_num_normalizer.adapt(
    np.array(df["tweet_num"], dtype="float32").reshape(-1, 1)
)
tweet_num_normalized = tweet_num_normalizer(tweet_num_input)

In [None]:
# Learn a 64-dimensional vector for every title token id and every domain id.
# NOTE(review): the "+ 1" on each table size presumably leaves room for a
# reserved / out-of-vocabulary index — confirm against the lookup layers.
title_embedding = keras.layers.Embedding(
    input_dim=num_words + 1,
    output_dim=64,
)(title_vectorizer)
source_domain_embedding = keras.layers.Embedding(
    input_dim=num_domains + 1,
    output_dim=64,
)(source_domain_lookup)


In [None]:
# Summarise the embedded title sequence with an LSTM of 128 units.
title_encoder = keras.layers.LSTM(units=128)
title_features = title_encoder(title_embedding)

# Average the domain embedding over its sequence axis to obtain one vector
# per example.
domain_pooling = keras.layers.GlobalAveragePooling1D()
source_domain_features = domain_pooling(source_domain_embedding)

In [None]:
# Join the three feature branches along the last axis into one vector per
# example; this is the input to the classification head.
x = keras.layers.concatenate(
    inputs=[title_features, source_domain_features, tweet_num_normalized],
    axis=-1,
)

In [None]:
# Classification head: a single sigmoid unit. The layer name "real" is the key
# the label dictionary passed to fit() uses.
output = keras.layers.Dense(
    units=1,
    activation="sigmoid",
    name="real",
)(x)

In [None]:
# Wire the three named inputs to the single sigmoid output.
model = keras.Model(
    inputs=[title_input, source_domain_input, tweet_num_input],
    outputs=[output],
)

In [None]:
# Render the layer graph, annotated with tensor shapes, as the cell output.
keras.utils.plot_model(model=model, show_shapes=True)

In [None]:
# Binary classification setup: cross-entropy loss on the sigmoid output,
# RMSprop with a 1e-3 learning rate, and accuracy as the reported metric.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
    metrics=["binary_accuracy"],
)

In [None]:
import numpy as np

In [None]:
# Convert the model's feature columns to NumPy arrays. The two text columns
# are cast to str first; the tweet count is passed through unchanged.
title_data, domain_data = (
    np.array(df[text_column].astype(str))
    for text_column in ("title", "source_domain")
)
tweet_num_data = np.array(df["tweet_num"])

# Binary labels as integers, taken from the "real" column.
target = np.array(df["real"].astype(int))

In [None]:
# Feature keys match the `name=` of each keras.Input; the label key matches
# the name of the output layer.
train_features = {
    "title": title_data,
    "domain": domain_data,
    "num_tweets": tweet_num_data,
}
train_labels = {"real": target}

# Train for 15 epochs and keep the returned history object for inspection.
history = model.fit(x=train_features, y=train_labels, epochs=15)

In [None]:
# Draw ten rows to run through the trained model.
# NOTE(review): the rows come from the same frame the model was fitted on, so
# the predictions below show fit on seen data, not held-out accuracy.
SAMPLE_SEED = 42  # fixed so a full re-run inspects the same ten rows

df_samples = df.sample(n=10, random_state=SAMPLE_SEED)

# Cast the text columns to str exactly as the training arrays were, so the
# model receives the same kind of input at prediction time.
sample_titles = np.array(df_samples["title"].astype(str))
sample_domains = np.array(df_samples["source_domain"].astype(str))
sample_tweet_nums = np.array(df_samples["tweet_num"])

In [None]:
# Bundle the sampled arrays under the model's input names ("title", "domain",
# "num_tweets").
input_example = dict(
    title=sample_titles,
    domain=sample_domains,
    num_tweets=sample_tweet_nums,
)

In [None]:
# Display the sampled feature dictionary as the cell output for a quick check.
input_example

In [None]:
# Score the sampled rows with the trained model.
predictions = model.predict(x=input_example)

In [None]:
# Threshold each score at 0.5 and print the predicted labels next to the true
# ones. The loop variables are named so they do not reuse `target`, which
# already names the training label array.
predicted_labels = ["real" if score >= .5 else "fake" for score in predictions]
print("Predictions", predicted_labels)

actual_labels = ["real" if label == 1 else "fake" for label in df_samples["real"]]
print("Actual", actual_labels)


In [None]:
# Flatten the model output to one score per sampled row, so every new column
# holds plain numbers rather than 1-element arrays.
prediction_scores = np.asarray(predictions).reshape(-1)
predicted_real_mask = prediction_scores >= .5

# Hard label: 1 when the score reaches the 0.5 threshold, else 0.
df_samples["predicted_real"] = predicted_real_mask.astype(int)
# Raw sigmoid score for the "real" class.
df_samples["prediction_raw_value"] = prediction_scores
# Confidence in the predicted label: the score itself for "real", and its
# complement (1 - score) for "fake". Both cases share the same 0-1 scale.
df_samples["prediction_certainty"] = np.where(
    predicted_real_mask, prediction_scores, 1.0 - prediction_scores
)
df_samples