In [1]:
!pip install torch torchvision torchaudio torchtext --index-url https: // download.pytorch.org/whl/cu118 -q

In [2]:
!pip install pandas scikit-learn spacy tensorflow tqdm -q

In [3]:
!python3 -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import nltk
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [7]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

fake["is_fake"] = 1
true["is_fake"] = 0

df = pd.concat([true, fake])
del fake, true

df.shape

(44898, 5)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_WORDS = 10000
SENTENCE_LENGTH = 100
EMBED_DIM = 1000
random_state = 42

nlp = spacy.load("en_core_web_sm")

tqdm.pandas()

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)


def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return text.progress_apply(lambda x: " ".join(
        lemmatizer.lemmatize(token.lower()) for token in word_tokenize(x)
        if token.isalnum()
        and token not in stop_words))

In [10]:
%%time
df['cleaned_text'] = df['title'] + " " + df['text']
df['cleaned_text'] = preprocess_text(df['cleaned_text'])

100%|██████████| 44898/44898 [01:19<00:00, 564.38it/s] 

CPU times: user 1min 19s, sys: 984 ms, total: 1min 20s
Wall time: 1min 20s





In [13]:
X_train, X_test, y_train, y_test = (train_test_split(df['cleaned_text'], df['is_fake'], test_size=0.2,
                                                     random_state=random_state))

In [14]:
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(df['cleaned_text'])
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(train_seq, maxlen=SENTENCE_LENGTH)
x_test = pad_sequences(test_seq, maxlen=SENTENCE_LENGTH)

In [15]:
class SpamDataset(Dataset):
    def __init__(self, data, label):
        self.data = torch.tensor(data).to(torch.int64)
        self.label = label

    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.data)


batch_size = 256
torch.manual_seed(random_state)
train_dataset = SpamDataset(x_train, y_train.to_numpy())
test_dataset = SpamDataset(x_test, y_test.to_numpy())
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
class CNN(nn.Module):
    def __init__(self, embed_dim):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(NUM_WORDS, embed_dim)
        self.conv1 = nn.Conv2d(1, 1, 3)
        self.conv2 = nn.Conv2d(1, 1, 3)
        self.conv3 = nn.Conv2d(1, 1, 3)
        self.fc = nn.Linear(93436, 2)

    def forward(self, x):
        out = self.embed(x)
        out = out.unsqueeze(1)
        out = F.relu(self.conv1(out))
        out = F.relu(self.conv2(out))
        out = F.relu(self.conv3(out))
        out = out.view(out.size()[0], -1)
        out = self.fc(out)
        return out

In [17]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, seq_len):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(seq_len * hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = torch.reshape(x, (x.size(0), -1,))
        x = self.fc(x)
        return x

In [18]:
class Trainer:
    def __init__(self, model, epochs, train_dataloader, test_dataloader, device, criterion, optimizer):
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.test_dataloader = test_dataloader
        self.train_dataloader = train_dataloader
        self.model = model
        self.epochs = epochs

    def train(self):
        for epoch in range(self.epochs):
            self.model.train()
            running_loss = 0
            correct_predictions = 0
            total = 0
            for i, (inputs, targets) in enumerate(tqdm(self.train_dataloader)):
                inputs, targets = inputs.to(device), targets.to(device)
                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets.long())
                loss.backward()

                self.optimizer.step()

                running_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct_predictions += (predicted == targets).sum().item()

            train_loss = running_loss / len(train_dataloader)
            train_accuracy = correct_predictions / total
            print(f'Epoch: {epoch + 1}/{self.epochs}, Loss: {train_loss:.6f}, Train accuracy: {train_accuracy:.6f}')

    def evaluate(self):
        self.model.eval()
        val_loss = 0
        correct_predictions = 0

        with torch.no_grad():
            for inputs, targets in self.test_dataloader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets.long())
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == targets).sum().item()

        val_loss /= len(self.test_dataloader)
        val_accuracy = correct_predictions / len(test_dataset)

        print(f'Validation loss: {val_loss:.6f}, Validation accuracy: {val_accuracy:.3f}\n')

In [19]:
CNN_model = CNN(EMBED_DIM).to(device)
CNN_optimizer = torch.optim.Adam(CNN_model.parameters(), lr=0.001)
CNN_criterion = nn.CrossEntropyLoss()

In [20]:
CNN_trainer = Trainer(CNN_model, 5, train_dataloader, test_dataloader, device, CNN_criterion, CNN_optimizer)

In [21]:
CNN_trainer.train()

  return F.conv2d(input, weight, bias, self.stride,
100%|██████████| 141/141 [00:25<00:00,  5.46it/s]


Epoch: 1/5, Loss: 3.244441, Train accuracy: 0.533827


100%|██████████| 141/141 [00:09<00:00, 15.17it/s]


Epoch: 2/5, Loss: 0.516596, Train accuracy: 0.794309


100%|██████████| 141/141 [00:09<00:00, 15.08it/s]


Epoch: 3/5, Loss: 0.125326, Train accuracy: 0.956039


100%|██████████| 141/141 [00:09<00:00, 14.98it/s]


Epoch: 4/5, Loss: 0.060348, Train accuracy: 0.981708


100%|██████████| 141/141 [00:09<00:00, 14.99it/s]

Epoch: 5/5, Loss: 0.025716, Train accuracy: 0.994682





In [22]:
CNN_trainer.evaluate()

Validation loss: 0.069707, Validation accuracy: 0.975


In [23]:
LSTM_model = LSTM(vocab_size=NUM_WORDS, embedding_dim=EMBED_DIM, hidden_dim=100, n_layers=3,
                  seq_len=SENTENCE_LENGTH).to(device)
LSTM_optimizer = torch.optim.Adam(LSTM_model.parameters(), lr=0.001)
LSTM_criterion = nn.CrossEntropyLoss()

In [24]:
LSTM_trainer = Trainer(LSTM_model, 2, train_dataloader, test_dataloader, device, LSTM_criterion, LSTM_optimizer)

In [25]:
LSTM_trainer.train()

100%|██████████| 141/141 [00:06<00:00, 20.46it/s]


Epoch: 1/2, Loss: 0.180451, Train accuracy: 0.914360


100%|██████████| 141/141 [00:03<00:00, 39.02it/s]

Epoch: 2/2, Loss: 0.027309, Train accuracy: 0.991481





In [26]:
LSTM_trainer.evaluate()

Validation loss: 0.051842, Validation accuracy: 0.984
