In [1]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install torchtext

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from torch.nn.utils.rnn import pad_sequence  # Import pad_sequence for padding
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('merged_data_csv.csv')

# Split the data into features (X) and labels (y)
#X = data['article'].values
#y = data['real/fake'].values

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['article'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert string labels to numerical values and handle NaN values
label_map = {'fake': 0, 'real': 1}
#default_label = 0  # Default label value for NaN

y_train = [label_map.get(label) for label in y_train]
y_test = [label_map.get(label) for label in y_test]
y_val = [label_map.get(label) for label in y_val]

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    if isinstance(text, float):  # Check if the input is a float
        text = str(text)  # Convert float to string
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


In [3]:
# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert text to numerical vectors using pre-trained word embeddings (GloVe)
        vectorized_text = [glove[word] for word in self.X[idx] if word in glove.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(100)]  # Use zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx]

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

# Define hyperparameters
input_size = 100  # Assuming each article is represented by a 300-dimensional vector
hidden_size = 64
output_size = 1
batch_size = 16
learning_rate = 0.001
num_epochs = 3

# Create DataLoader for train, test, and validation sets with padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1).float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).float()  # Thresholding output probabilities
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')


Epoch 1, Loss: 0.6830716352970874
Epoch 2, Loss: 0.6652882350489958
Epoch 3, Loss: 0.6573909099853321
Accuracy: 0.577939835916135
Precision: 0.967741935483871
Recall: 0.06097560975609756
F1 Score: 0.1147227533460803


In [2]:
# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=50)

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert text to numerical vectors using pre-trained word embeddings (GloVe)
        vectorized_text = [glove[word] for word in self.X[idx] if word in glove.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(50)]  # Use zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx]

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

# Define hyperparameters
input_size = 50  # Assuming each article is represented by a 300-dimensional vector
hidden_size = 32
output_size = 1
batch_size = 16
learning_rate = 0.001
num_epochs = 3

# Create DataLoader for train, test, and validation sets with padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1).float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).float()  # Thresholding output probabilities
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')


100%|█████████▉| 399999/400000 [00:09<00:00, 43851.28it/s]


Epoch 1, Loss: 0.6847956086117062
Epoch 2, Loss: 0.6774432889316468
Epoch 3, Loss: 0.6647739705573665
Accuracy: 0.5747493163172288
Precision: 0.9180327868852459
Recall: 0.056910569105691054
F1 Score: 0.10717703349282297


In [10]:
# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert text to numerical vectors using pre-trained word embeddings (GloVe)
        vectorized_text = [glove[word] for word in self.X[idx] if word in glove.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(100)]  # Use zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx]

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

# Define hyperparameters
input_size = 100  # Assuming each article is represented by a 300-dimensional vector
hidden_size = 25
output_size = 1
batch_size = 8
learning_rate = 0.001
num_epochs = 7

# Create DataLoader for train, test, and validation sets with padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1).float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).float()  # Thresholding output probabilities
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

Epoch 1, Loss: 0.6752212508935863
Epoch 2, Loss: 0.6358377039432526
Epoch 3, Loss: 0.46431524170422606
Epoch 4, Loss: 0.47187306364671516
Epoch 5, Loss: 0.3357625583216446
Epoch 6, Loss: 0.234838649302144
Epoch 7, Loss: 0.20583260871198428
Accuracy: 0.9211485870556062
Precision: 0.9219562955254943
Recall: 0.9004065040650406
F1 Score: 0.9110539845758355


In [6]:
# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert text to numerical vectors using pre-trained word embeddings (GloVe)
        vectorized_text = [glove[word] for word in self.X[idx] if word in glove.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(100)]  # Use zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx]

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

# Define hyperparameters
input_size = 100  # Assuming each article is represented by a 300-dimensional vector
hidden_size = 25
output_size = 1
batch_size = 8
learning_rate = 0.001
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1).float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).float()  # Thresholding output probabilities
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

Epoch 1, Loss: 0.6725827258920344
Epoch 2, Loss: 0.5312362424129777
Epoch 3, Loss: 0.4434224267585674
Epoch 4, Loss: 0.2624666429003849
Epoch 5, Loss: 0.20824145930638888
Epoch 6, Loss: 0.17415309053907227
Epoch 7, Loss: 0.1515084014091734
Epoch 8, Loss: 0.13486986150010435
Epoch 9, Loss: 0.11766106991822872
Epoch 10, Loss: 0.10655977917646306
Accuracy: 0.9279854147675478
Precision: 0.9072978303747534
Recall: 0.9349593495934959
F1 Score: 0.9209209209209209
