In [40]:
!pip install torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/cu118 -q

In [41]:
!pip install pandas nltk scikit-learn -q

In [43]:
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [44]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

fake["is_fake"] = 1
true["is_fake"] = 0

df = pd.concat([true, fake])
del fake, true

df.shape

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_WORDS = 10000
SENTENCE_LENGTH = 100
EMBED_DIM = 1000
random_state = 42

nlp = spacy.load("en_core_web_sm")

tqdm.pandas()


def preprocess_text(text):
    return text.progress_apply(
        lambda x: " ".join(
            token.lemma_.lower() for token in nlp(x) if
            not token.is_stop
            and not token.is_punct
            and not token.is_digit
            and not token.like_email
            and not token.like_num
            and not token.is_space
        )
    )

(44898, 5)

In [None]:
%%time
df['cleaned_text'] = df['title'] + " " + df['text']
df['cleaned_text'] = preprocess_text(df['cleaned_text'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['is_fake'], test_size=0.2,
                                                    random_state=random_state)

In [5]:
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(df['cleaned_text'])
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(train_seq, maxlen=SENTENCE_LENGTH)
x_test = pad_sequences(test_seq, maxlen=SENTENCE_LENGTH)

In [26]:
class SpamDataset(Dataset):
    def __init__(self, data, label):
        self.data = torch.tensor(data).to(torch.int64)
        self.label = label

    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.data)


batch_size = 256
torch.manual_seed(random_state)
train_dataset = SpamDataset(x_train, y_train.to_numpy())
test_dataset = SpamDataset(x_test, y_test.to_numpy())
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [27]:
class CNN(nn.Module):
    def __init__(self, embed_dim):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(NUM_WORDS, embed_dim)
        self.conv1 = nn.Conv2d(1, 1, 3)
        self.conv2 = nn.Conv2d(1, 1, 3)
        self.conv3 = nn.Conv2d(1, 1, 3)
        self.fc = nn.Linear(93436, 2)

    def forward(self, x):
        out = self.embed(x)
        out = out.unsqueeze(1)
        out = F.relu(self.conv1(out))
        out = F.relu(self.conv2(out))
        out = F.relu(self.conv3(out))
        out = out.view(out.size()[0], -1)
        out = self.fc(out)
        return out

In [28]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, seq_len):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(seq_len * hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = torch.reshape(x, (x.size(0), -1,))
        x = self.fc(x)
        return F.softmax(x, dim=-1)

In [45]:
class Trainer:
    def __init__(self, model, epochs, train_dataloader, test_dataloader, device, criterion, optimizer):
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.test_dataloader = test_dataloader
        self.train_dataloader = train_dataloader
        self.model = model
        self.epochs = epochs

    def train(self):
        for epoch in range(self.epochs):
            self.model.train()
            running_loss = 0
            correct_predictions = 0
            total = 0
            for i, (inputs, targets) in enumerate(tqdm(self.train_dataloader)):
                inputs, targets = inputs.to(device), targets.to(device)
                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets.long())
                loss.backward()

                self.optimizer.step()

                running_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct_predictions += (predicted == targets).sum().item()

            train_loss = running_loss / len(train_dataloader)
            train_accuracy = correct_predictions / total
            print(f'Epoch: {epoch + 1}/{self.epochs}, Loss: {train_loss:.6f}, Train accuracy: {train_accuracy:.6f}')

    def evaluate(self):
        self.model.eval()
        val_loss = 0
        correct_predictions = 0

        with torch.no_grad():
            for inputs, targets in self.test_dataloader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets.long())
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == targets).sum().item()

        val_loss /= len(self.test_dataloader)
        val_accuracy = correct_predictions / len(test_dataset)

        print(f'Validation loss: {val_loss:.6f}, Validation accuracy: {val_accuracy:.3f}\n')

In [46]:
CNN_model = CNN(EMBED_DIM).to(device)
CNN_optimizer = torch.optim.Adam(CNN_model.parameters(), lr=0.001)
CNN_criterion = nn.CrossEntropyLoss()

In [53]:
CNN_trainer = Trainer(CNN_model, 5, train_dataloader, test_dataloader, device, CNN_criterion, CNN_optimizer)

In [54]:
CNN_trainer.train()

In [55]:
CNN_trainer.evaluate()

100%|██████████| 141/141 [00:09<00:00, 14.80it/s]


Epoch: 1/5, Loss: 0.925183, Train accuracy: 0.614038


100%|██████████| 141/141 [00:09<00:00, 15.13it/s]


Epoch: 2/5, Loss: 0.238966, Train accuracy: 0.930759


100%|██████████| 141/141 [00:09<00:00, 15.05it/s]


Epoch: 3/5, Loss: 0.066092, Train accuracy: 0.980261


100%|██████████| 141/141 [00:09<00:00, 14.96it/s]


Epoch: 4/5, Loss: 0.025205, Train accuracy: 0.994961


100%|██████████| 141/141 [00:09<00:00, 15.02it/s]

Epoch: 5/5, Loss: 0.008559, Train accuracy: 0.999248





In [56]:
LSTM_model = LSTM(vocab_size=NUM_WORDS, embedding_dim=EMBED_DIM, hidden_dim=100, n_layers=3,
                  seq_len=SENTENCE_LENGTH).to(device)
LSTM_optimizer = torch.optim.Adam(LSTM_model.parameters(), lr=0.001)
LSTM_criterion = nn.CrossEntropyLoss()

Validation loss: 0.059426, Validation accuracy: 0.980


In [57]:
LSTM_trainer = Trainer(LSTM_model, 2, train_dataloader, test_dataloader, device, LSTM_criterion, LSTM_optimizer)

In [58]:
LSTM_trainer.train()

In [59]:
LSTM_trainer.evaluate()

100%|██████████| 141/141 [00:03<00:00, 36.41it/s]


Epoch: 1/2, Loss: 0.158328, Train accuracy: 0.925636


100%|██████████| 141/141 [00:03<00:00, 38.70it/s]

Epoch: 2/2, Loss: 0.022552, Train accuracy: 0.993346





Validation loss: 0.045237, Validation accuracy: 0.986
