**In this script we train a neural network to classify fake news.**

The trained model is used as a Telegram bot in the following GitHub repo:
https://github.com/tschomacker/fake-news-detection-bot


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import csv
from random import shuffle


def _read_articles(path, label):
    """Return ``(label, text)`` pairs for every data row of the CSV file at *path*.

    The file is parsed with the ``csv`` module so that quoted fields containing
    commas or line breaks stay intact (a plain ``split(",")`` on physical lines
    cuts such fields apart). The first row is treated as the header and skipped;
    rows with fewer than two columns are ignored. ``text`` is the second column
    of each row -- presumably the article body; verify against the dataset header.
    """
    # newline="" is what the csv module expects so it can handle embedded newlines itself.
    with open(path, encoding="utf-8", newline="") as handle:
        rows = csv.reader(handle)
        next(rows, None)  # skip the header row (no-op on an empty file)
        return [(label, row[1]) for row in rows if len(row) >= 2]


# Label 0 marks rows from True.csv, label 1 marks rows from Fake.csv.
news = _read_articles("/kaggle/input/fake-and-real-news-dataset/True.csv", 0)
news += _read_articles("/kaggle/input/fake-and-real-news-dataset/Fake.csv", 1)

# Mix both classes so that later slicing into batches does not group by label.
shuffle(news)
    


In [None]:
import torch
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
from transformers import AutoTokenizer, AutoModel

# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
_checkpoint = "DeepPavlov/bert-base-multilingual-cased-sentence"

tokenizer = AutoTokenizer.from_pretrained(_checkpoint)

model = AutoModel.from_pretrained(_checkpoint)
model = model.to(device)

# Tokenise one short sample sentence as PyTorch tensors and move it to `device`;
# it is used below as a quick smoke-test input for the encoder.
_input = tokenizer(
    "Das ist ein text",
    padding=True,
    truncation=True,
    return_tensors="pt",
)
_input = _input.to(device)


In [None]:
# Smoke test: average the first model output over dimension 1 and show the resulting shape.
model(**_input)[0].mean(dim=1).shape

In [None]:
class BinaryClassifier(torch.nn.Module):
    """Feed-forward classification head applied to fixed-size embeddings.

    Layer order: Dropout(0.1) -> Linear(input_dim, 2 * input_dim) -> Softsign
    -> Linear(2 * input_dim, input_dim) -> Softsign -> Linear(input_dim, num_classes).

    Args:
        input_dim: Size of the last dimension of the incoming embeddings.
        num_classes: Number of logits produced per embedding.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept unchanged
        # to stay compatible with previously saved weights.
        self.dropout = torch.nn.Dropout(0.1)
        self.f1 = torch.nn.Linear(input_dim, input_dim * 2)
        self.activation = torch.nn.Softsign()
        self.f2 = torch.nn.Linear(input_dim * 2, input_dim)
        self.f3 = torch.nn.Linear(input_dim, num_classes)
        # Not used by forward(); an explicit dim avoids the deprecated
        # implicit-dimension behaviour of Softmax() if it is ever called.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, embedding, label=None):
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Args:
            embedding: Float tensor whose last dimension equals ``input_dim``.
            label: Optional class indices, one per embedding, as a sequence of
                ints or a tensor. ``None`` or an empty collection skips the loss.

        Returns:
            A ``(loss, logits)`` tuple. ``loss`` is the integer ``0`` when no
            labels were supplied, otherwise a scalar tensor. ``logits`` are the
            raw, un-normalised outputs of the last linear layer.
        """
        x = self.dropout(embedding)
        x = self.activation(self.f1(x))
        x = self.activation(self.f2(x))
        x = self.f3(x)

        loss = 0
        # Test against None instead of truthiness: bool() of a tensor with more
        # than one element raises, so tensor labels could not be passed before.
        if label is not None:
            # Build the target on the same device as the logits rather than
            # relying on a module-level `device` variable.
            target = torch.as_tensor(label, dtype=torch.long, device=x.device)
            if target.numel() > 0:
                loss = torch.nn.CrossEntropyLoss()(x, target)
        # TODO (from the original author): apply softmax here; for now the
        # callers normalise the returned logits themselves.
        return (loss, x)

In [None]:
from tqdm.notebook import tqdm
# Pre-compute one embedding per news item so the encoder only has to run once;
# afterwards `news` holds (embedding, label, text) triples instead of (label, text).
model.eval()
with torch.no_grad():
    embedded = []
    for item in tqdm(news):
        label, text = item[0], item[1]
        encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        encoded = encoded.to(device)
        # Index 1 of the model output is used as the embedding of the whole text.
        embedding = model(**encoded)[1]
        # NOTE(review): this prints once per news item; consider removing it.
        print(embedding.size())
        embedded.append((embedding, label, text))
    news = embedded





In [None]:

# Training hyper-parameters: number of samples per batch and number of passes over the data.
bs_size = 64
epochs = 200

# Cut the sample list into consecutive chunks of `bs_size` (the last one may be shorter).
_all_batches = [news[i:i + bs_size] for i in range(0, len(news), bs_size)]

# 80/20 split over batches. The split index is computed once on the FULL list:
# previously `batches` was truncated first and `batches_eval` was then sliced from
# that truncated list, so evaluation ran on training batches and the real
# held-out 20% was thrown away.
_split_index = int(len(_all_batches) * 0.8)
batches = _all_batches[:_split_index]
batches_eval = _all_batches[_split_index:]

In [None]:
# Train the classification head on the pre-computed embeddings.
classifier = BinaryClassifier(768, 2).to(device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.00001)
pbar = tqdm

for epoch in range(epochs):
    # Visit the batches in a new random order every epoch.
    shuffle(batches)
    classifier.train()
    losses = []
    for batch in batches:
        embeddings = torch.cat([item[0] for item in batch]).to(device)
        labels = [item[1] for item in batch]

        optimizer.zero_grad()
        outputs = classifier(embeddings, labels)
        loss = outputs[0]
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
    # Report the mean training loss of this epoch.
    mean_loss = sum(losses) / len(losses)
    print(epoch, mean_loss)


# Move the weights to the CPU before saving, then put the model back on `device`.
classifier.cpu()
torch.save(classifier.state_dict(), "./classifier.pt")
classifier.to(device)

In [None]:
# Reload the saved weights and count correct / incorrect predictions on the evaluation batches.
classifier.load_state_dict(torch.load('./classifier.pt'))
classifier.to(device)
classifier.eval()

true_count = 0   # correctly classified samples
false_count = 0  # misclassified samples (exact ties between both scores count here too)
fake = 0         # correctly classified samples with label 1
not_fake = 0     # correctly classified samples with label 0

# Normalise over the class dimension explicitly; Softmax() without `dim` relies on
# a deprecated implicit choice and emits a warning.
a = torch.nn.Softmax(dim=-1)
for batch in tqdm(batches_eval):
    embeddings = torch.cat([e[0] for e in batch]).to(device)
    labels = [e[1] for e in batch]
    with torch.no_grad():
        # Only the logits are needed, so no labels are passed and no loss is computed.
        classes = a(classifier(embeddings)[1]).tolist()
    for pred, true in zip(classes, labels):
        if pred[0] > pred[1] and true == 0:
            true_count += 1
            not_fake += 1
        elif pred[0] < pred[1] and true == 1:
            true_count += 1
            fake += 1
        else:
            false_count += 1
print(true_count, false_count, fake, not_fake)

In [None]:
def predict(text):
    """Return the softmax-normalised class scores of the classifier for *text*.

    The text is tokenised, embedded with the module-level ``model`` (output
    index 1) and passed through the module-level ``classifier``.

    Args:
        text: Input accepted by ``tokenizer``; the caller in this file passes a
            single string.

    Returns:
        A tensor of class scores normalised over the last dimension. Index 0 is
        the score for label 0 and index 1 the score for label 1 -- presumably
        one row per input text; verify against the tokenizer output.
    """
    model.to(device)
    # Inference only: put both networks in eval mode (disables dropout) and skip
    # autograd bookkeeping so no computation graph is kept alive.
    model.eval()
    classifier.eval()
    with torch.no_grad():
        inp = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
        embedding = model(**inp)[1]
        logits = classifier(embedding)[1]
        # Explicit dim: Softmax() without `dim` is deprecated and emits a warning.
        a = torch.nn.Softmax(dim=-1)
        x = a(logits)
    return x

def pred_in_human_lang(text):
    """Classify *text*, print the verdict and return it.

    Returns "not fake" when the first score from ``predict`` is strictly larger
    than the second one, and "fake" otherwise (including ties).
    """
    scores = predict(text).tolist()[0]
    verdict = "not fake" if scores[0] > scores[1] else "fake"
    print(verdict)
    return verdict

In [None]:
# Manual spot checks of the trained classifier on a few German and English sentences.
# Each call prints its verdict ("fake" / "not fake").

# German: "The US elections were not manipulated"
pred_in_human_lang("Die US Wahlen wurden nicht manipuliert")
# German: "The US elections were manipulated"
pred_in_human_lang("Die US Wahlen wurden manipuliert")
pred_in_human_lang("Election fraud decided the us elections")
pred_in_human_lang("QAnon reshaped Trump’s party and radicalized believers. The Capitol siege may just be the start.")
# German: "The corona pandemic is getting out of control"
pred_in_human_lang("Die corona pandemie gerät außer Kontrolle")
