In [1]:
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install torchtext



In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText, CharNGram, Vectors
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.nn.init import xavier_uniform_

# Load the data
data = pd.read_csv('cleaned_merged_data.csv')

# Split the data into train, test, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(data['article'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert labels to numerical values
label_map = {'fake': 0, 'real': 1}
y_train = [label_map[label] for label in y_train]
y_test = [label_map[label] for label in y_test]
y_val = [label_map[label] for label in y_val]

# Tokenization
tokenizer = get_tokenizer('basic_english')
X_train = [tokenizer(text) for text in X_train]
X_test = [tokenizer(text) for text in X_test]
X_val = [tokenizer(text) for text in X_val]

# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Load pre-trained FastText embeddings
fasttext = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert text to numerical vectors using pre-trained GloVe embeddings
        vectorized_text = [glove[word] if word in glove.stoi else torch.zeros(100) for word in self.X[idx]]
        
        # Stack the word embeddings to form the input tensor
        input_tensor = torch.stack(vectorized_text)
        return input_tensor, self.y[idx]

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.ReLU = nn.ReLU()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.ReLU(out)
        return out

# Hyperparameters
input_size = 100  # Assuming each word is represented by a 100-dimensional vector
hidden_size = 25
output_size = 1
batch_size = 8
learning_rate = 0.001
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # Use BCEWithLogitsLoss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs.squeeze(), batch_y.float())  # Use BCEWithLogitsLoss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).float()
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1, Loss: 0.6909533762171491
Epoch 2, Loss: 0.6903016907207515
Epoch 3, Loss: 0.6898202830403704
Epoch 4, Loss: 0.689986223919245
Epoch 5, Loss: 0.6892522508842799
Epoch 6, Loss: 0.6896039080511193
Epoch 7, Loss: 0.6846711595129044
Epoch 8, Loss: 0.6785655561232078
Epoch 9, Loss: 0.6771051336529587
Epoch 10, Loss: 0.6762124123904591
Accuracy: 0.5788514129443938
Precision: 0.9411764705882353
Recall: 0.06504065040650407
F1 Score: 0.12167300380228137
