<a href="https://colab.research.google.com/github/sneharc16/DTU-MLR-Assignments-Deep-Learning/blob/main/Fake_News_Using_LSTM_DTU_MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!kaggle datasets download -d algord/fake-news

!unzip fake-news

Dataset URL: https://www.kaggle.com/datasets/algord/fake-news
License(s): CC0-1.0
Downloading fake-news.zip to /content
  0% 0.00/1.68M [00:00<?, ?B/s]
100% 1.68M/1.68M [00:00<00:00, 73.3MB/s]
Archive:  fake-news.zip
  inflating: FakeNewsNet.csv         


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Read the extracted CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='FakeNewsNet.csv')
data.head(n=5)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [None]:
# Hold out 20% of the rows for testing: headlines are the inputs, the `real`
# column is the target. The fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['title'],
    data['real'],
    test_size=0.2,
    random_state=42,
)

We'll be using TF-IDF, transformers, and pipelines. Along with that, DataLoaders will be used, plus other basic PyTorch libraries.

In [None]:
# Learn the vocabulary (capped at 5000 terms) on the training headlines only,
# then reuse that fitted vocabulary to encode the test headlines.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Dense float32 feature tensors for the network.
X_train_tensor, X_test_tensor = (
    torch.tensor(matrix, dtype=torch.float32)
    for matrix in (X_train_tfidf, X_test_tfidf)
)
# Integer (long) class-index tensors for the labels.
y_train_tensor, y_test_tensor = (
    torch.tensor(series.values, dtype=torch.long)
    for series in (y_train, y_test)
)

In [None]:
# Pair each feature row with its label so every batch yields (features, labels).
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are reshuffled each epoch; test batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

In [None]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by dropout and a linear classification head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced by the final linear layer.
        dropout: Dropout probability applied to the last hidden state
            before the linear layer. Defaults to 0.2.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of inputs.

        Args:
            x: Either a 2-D tensor ``(batch, input_dim)``, which is treated as
                a sequence of length one, or a 3-D tensor
                ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalised logits.
        """
        # A flat feature vector per sample becomes a one-step sequence;
        # inputs that already carry a sequence axis are passed through as-is.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        # Only the final hidden state is needed; per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(x)
        # hidden[-1] is the last layer's final hidden state: (batch, hidden_dim).
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [None]:
# Network sizes: 128 hidden units and two output logits (one per value of the
# binary `real` label); the input width follows the TF-IDF matrix's column count.
hidden_dim, output_dim = 128, 2
input_dim = X_train_tfidf.shape[1]

In [None]:
# Seed PyTorch's global RNG before anything stochastic happens, so weight
# initialisation, dropout masks and the DataLoader shuffle order are the same
# on every "Restart & Run All".
torch.manual_seed(42)

model = LSTMClassifier(input_dim, hidden_dim, output_dim)
# CrossEntropyLoss takes the raw logits together with integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return epoch metrics.

    Metrics are weighted by batch size, so a smaller final batch does not
    skew the epoch averages.

    Args:
        model: Module mapping a feature batch to per-class logits.
        iterator: Iterable of ``(texts, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable taking ``(logits, labels)``; assumed to
            return the batch *mean* (the default reduction of the torch
            loss modules).

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, as Python floats.
        Both are ``nan`` when ``iterator`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for texts, labels in iterator:
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Undo the per-batch mean so the epoch loss is a true per-sample mean.
        total_loss += loss.item() * batch_size
        # Count hits directly on the tensors; no numpy/sklearn round trip.
        total_correct += (predictions.argmax(1) == labels).sum().item()
        total_samples += batch_size

    if total_samples == 0:
        # Nothing was processed: report "undefined" instead of dividing by zero.
        return float('nan'), float('nan')
    return total_loss / total_samples, total_correct / total_samples

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its parameters.

    Metrics are weighted by batch size, so the returned accuracy matches the
    accuracy computed from the returned predictions and labels.

    Args:
        model: Module mapping a feature batch to per-class logits.
        iterator: Iterable of ``(texts, labels)`` batches.
        criterion: Loss callable taking ``(logits, labels)``; assumed to
            return the batch *mean* (the default reduction of the torch
            loss modules).

    Returns:
        ``(mean_loss, accuracy, all_preds, all_labels)``. The first two are
        Python floats (``nan`` when ``iterator`` yields no samples); the last
        two are flat lists of predicted and true class indices in iteration
        order.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in iterator:
            predictions = model(texts)
            loss = criterion(predictions, labels)
            batch_preds = predictions.argmax(1)

            batch_size = labels.size(0)
            # Undo the per-batch mean so the result is a true per-sample mean.
            total_loss += loss.item() * batch_size
            total_correct += (batch_preds == labels).sum().item()
            total_samples += batch_size

            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total_samples == 0:
        # Nothing was processed: report "undefined" instead of dividing by zero.
        return float('nan'), float('nan'), all_preds, all_labels
    return (
        total_loss / total_samples,
        total_correct / total_samples,
        all_preds,
        all_labels,
    )


In [None]:
# Number of full passes over the training batches.
n_epochs = 10
for epoch in range(n_epochs):
    # One optimisation pass over train_loader.
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    # The "Val" figures are measured on test_loader, i.e. the held-out test split.
    # preds/labels are overwritten every epoch, so once the loop finishes they
    # hold the final epoch's predictions and ground truth.
    valid_loss, valid_acc, preds, labels = evaluate(model, test_loader, criterion)
    print(f'Epoch {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')

Epoch 1
	Train Loss: 0.544 | Train Acc: 75.05%
	Val Loss: 0.453 | Val Acc: 77.87%
Epoch 2
	Train Loss: 0.373 | Train Acc: 83.44%
	Val Loss: 0.376 | Val Acc: 83.30%
Epoch 3
	Train Loss: 0.304 | Train Acc: 86.95%
	Val Loss: 0.376 | Val Acc: 83.49%
Epoch 4
	Train Loss: 0.274 | Train Acc: 88.20%
	Val Loss: 0.389 | Val Acc: 83.51%
Epoch 5
	Train Loss: 0.253 | Train Acc: 89.18%
	Val Loss: 0.403 | Val Acc: 82.73%
Epoch 6
	Train Loss: 0.240 | Train Acc: 89.96%
	Val Loss: 0.422 | Val Acc: 82.75%
Epoch 7
	Train Loss: 0.228 | Train Acc: 90.40%
	Val Loss: 0.440 | Val Acc: 82.75%
Epoch 8
	Train Loss: 0.220 | Train Acc: 90.78%
	Val Loss: 0.460 | Val Acc: 82.39%
Epoch 9
	Train Loss: 0.212 | Train Acc: 91.08%
	Val Loss: 0.482 | Val Acc: 82.28%
Epoch 10
	Train Loss: 0.205 | Train Acc: 91.60%
	Val Loss: 0.504 | Val Acc: 82.54%


In [None]:
# Summarise the `labels` (ground truth) and `preds` (predicted classes) left in
# the namespace by the most recent evaluate() call.
conf_matrix = confusion_matrix(labels, preds)
class_report = classification_report(labels, preds)

# Both results are printed as plain text.
print(conf_matrix)
print(class_report)

[[ 640  491]
 [ 324 3185]]
              precision    recall  f1-score   support

           0       0.66      0.57      0.61      1131
           1       0.87      0.91      0.89      3509

    accuracy                           0.82      4640
   macro avg       0.77      0.74      0.75      4640
weighted avg       0.82      0.82      0.82      4640

