In [1]:
%pip install torch
%pip install pandas
%pip install scikit-learn
%pip install torchtext

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Adam Optimiser + GloVe Embedding + no Activation Function

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]

In [5]:
# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [glove[word] for word in self.X[idx] if word in glove.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(100)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        return out

# Define hyperparameters
input_size = 100
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1, Loss: 1.8700809851288795
Epoch 2, Loss: 1.7979104320208232
Epoch 3, Loss: 1.7966830804944038
Epoch 4, Loss: 1.7837729007005692
Epoch 5, Loss: 1.7871571232875187
Epoch 6, Loss: 1.7845735599597294
Epoch 7, Loss: 1.781342973311742
Epoch 8, Loss: 1.7845751717686653
Epoch 9, Loss: 1.772382823129495
Epoch 10, Loss: 1.768716461956501
              precision    recall  f1-score   support

           0       0.50      0.05      0.09        41
           1       0.20      1.00      0.34        37
           2       0.00      0.00      0.00        35
           3       1.00      0.03      0.05        36
           4       0.00      0.00      0.00        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.21       189
   macro avg       0.24      0.15      0.07       189
weighted avg       0.34      0.21      0.10       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Adam Optimiser + FastText Embedding + ReLU Activation Function

In [6]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


# Load pre-trained embeddings (GloVe or FastText)
# Example using GloVe
# embedding = GloVe(name='6B', dim=100)

# Example using FastText
embedding = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [embedding[word] for word in self.X[idx] if word in embedding.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(embedding.dim)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()  # Change activation function here

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.activation(out)  # Apply the chosen activation function
        return out

# Define hyperparameters
input_size = embedding.dim  # Use embedding dimension as input size
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Change optimizer here

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))


Epoch 1, Loss: 2.031314601500829
Epoch 2, Loss: 2.0209137772520385
Epoch 3, Loss: 2.0037188107768693
Epoch 4, Loss: 1.9967463438709576
Epoch 5, Loss: 1.9989212229847908
Epoch 6, Loss: 1.990171695748965
Epoch 7, Loss: 1.9301349172989528
Epoch 8, Loss: 1.9144325256347656
Epoch 9, Loss: 1.9114480391144753
Epoch 10, Loss: 1.9187454283237457
              precision    recall  f1-score   support

           0       0.33      0.02      0.05        41
           1       0.20      1.00      0.33        37
           2       0.00      0.00      0.00        35
           3       0.50      0.03      0.05        36
           4       0.00      0.00      0.00        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.21       189
   macro avg       0.15      0.15      0.06       189
weighted avg       0.21      0.21      0.09       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Adam Optimiser + GloVe Embedding + Sigmoid Activation Function

In [7]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


# Load pre-trained embeddings (GloVe or FastText)
# Example using GloVe
embedding = GloVe(name='6B', dim=100)

# Example using FastText
# embedding = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [embedding[word] for word in self.X[idx] if word in embedding.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(embedding.dim)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.Sigmoid()  # Change activation function here

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.activation(out)  # Apply the chosen activation function
        return out

# Define hyperparameters
input_size = embedding.dim  # Use embedding dimension as input size
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Change optimizer here

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))


Epoch 1, Loss: 1.9338994125525157
Epoch 2, Loss: 1.8881641154487927
Epoch 3, Loss: 1.8810638164480527
Epoch 4, Loss: 1.8578733503818512
Epoch 5, Loss: 1.838611329595248
Epoch 6, Loss: 1.8226408809423447
Epoch 7, Loss: 1.8144657636682193
Epoch 8, Loss: 1.8026557092865307
Epoch 9, Loss: 1.7910153220097225
Epoch 10, Loss: 1.7967238103350003
              precision    recall  f1-score   support

           0       0.18      0.15      0.16        41
           1       0.16      0.49      0.24        37
           2       0.29      0.14      0.19        35
           3       0.00      0.00      0.00        36
           4       0.32      0.42      0.36        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.20       189
   macro avg       0.14      0.17      0.14       189
weighted avg       0.16      0.20      0.15       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Adam Optimiser + GloVe Embedding + ReLU Activation Function

In [8]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


# Load pre-trained embeddings (GloVe or FastText)
# Example using GloVe
embedding = GloVe(name='6B', dim=100)

# Example using FastText
# embedding = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [embedding[word] for word in self.X[idx] if word in embedding.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(embedding.dim)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.ReLU()  # Change activation function here

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.activation(out)  # Apply the chosen activation function
        return out

# Define hyperparameters
input_size = embedding.dim  # Use embedding dimension as input size
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Change optimizer here

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))

Epoch 1, Loss: 2.0583991333842278
Epoch 2, Loss: 2.0408638591567674
Epoch 3, Loss: 2.0291736920674643
Epoch 4, Loss: 1.9765668933590252
Epoch 5, Loss: 1.959246630469958
Epoch 6, Loss: 1.9590009649594624
Epoch 7, Loss: 1.9561033621430397
Epoch 8, Loss: 1.9631532952189445
Epoch 9, Loss: 1.9447800889611244
Epoch 10, Loss: 1.95407730837663
              precision    recall  f1-score   support

           0       0.50      0.05      0.09        41
           1       0.20      1.00      0.34        37
           2       0.00      0.00      0.00        35
           3       0.50      0.03      0.05        36
           4       0.00      0.00      0.00        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.21       189
   macro avg       0.17      0.15      0.07       189
weighted avg       0.24      0.21      0.10       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Adam Optimiser + FastText Embedding + LeakyReLU Activation Function

In [9]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


# Load pre-trained embeddings (GloVe or FastText)
# Example using GloVe
#embedding = GloVe(name='6B', dim=100)

# Example using FastText
embedding = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [embedding[word] for word in self.X[idx] if word in embedding.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(embedding.dim)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.LeakyReLU()  # Change activation function here

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.activation(out)  # Apply the chosen activation function
        return out

# Define hyperparameters
input_size = embedding.dim  # Use embedding dimension as input size
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # Change optimizer here

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))

Epoch 1, Loss: 2.0002777005235353
Epoch 2, Loss: 1.9673990110556285
Epoch 3, Loss: 1.853474607070287
Epoch 4, Loss: 1.8321706727147102
Epoch 5, Loss: 1.8446450605988503
Epoch 6, Loss: 1.8261538371443748
Epoch 7, Loss: 1.8246745963891347
Epoch 8, Loss: 1.8287115469574928
Epoch 9, Loss: 1.8091176599264145
Epoch 10, Loss: 1.805923027296861
              precision    recall  f1-score   support

           0       0.50      0.05      0.09        41
           1       0.20      1.00      0.34        37
           2       0.00      0.00      0.00        35
           3       0.50      0.03      0.05        36
           4       0.00      0.00      0.00        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.21       189
   macro avg       0.17      0.15      0.07       189
weighted avg       0.24      0.21      0.10       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## RMSprop Optimiser + GloVe Embedding + LeakyReLU Activation Function

In [11]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe, FastText
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

# Load the data
data = pd.read_csv('cleaned_fake.csv')

# Split the data into train (80%), test (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data['text'].values, data['label'].values, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenize the text using a simple tokenizer
tokenizer = get_tokenizer('basic_english')

def preprocess_text(text):
    # Handle NaN values
    if isinstance(text, float) and torch.isnan(torch.tensor(text)):
        text = str(text)
    tokens = tokenizer(text)
    return tokens

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]
X_val = [preprocess_text(text) for text in X_val]


# Load pre-trained embeddings (GloVe or FastText)
# Example using GloVe
embedding = GloVe(name='6B', dim=100)

# Example using FastText
#embedding = FastText(language='en')

# Define a custom PyTorch Dataset with padding
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        vectorized_text = [embedding[word] for word in self.X[idx] if word in embedding.stoi]
        if not vectorized_text:
            vectorized_text = [torch.zeros(embedding.dim)]  # Zero vector if no valid embeddings are found
        return torch.stack(vectorized_text), self.y[idx] - 1  # Adjust labels to start from 0

# Modify collate_fn for padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels, dtype=torch.long)

# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.LeakyReLU()  # Change activation function here

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        out = self.activation(out)  # Apply the chosen activation function
        return out

# Define hyperparameters
input_size = embedding.dim  # Use embedding dimension as input size
hidden_size = 50
output_size = 8  # Number of categories
batch_size = 32
learning_rate = 0.005
num_epochs = 10

# Create DataLoader for train, test, and validation sets with padding
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model, loss function, and optimizer
model = GRUModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multiclass
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)  # Change optimizer here

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluation
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, preds = torch.max(outputs, 1)  # Get the predicted class with the highest probability
        y_true.extend(batch_y.tolist())
        y_pred.extend(preds.tolist())

    print(classification_report(y_true, y_pred))

Epoch 1, Loss: 1.9232174679636955
Epoch 2, Loss: 1.8865919982393582
Epoch 3, Loss: 1.872576763232549
Epoch 4, Loss: 1.8506778130928676
Epoch 5, Loss: 1.8579427699247997
Epoch 6, Loss: 1.8449795544147491
Epoch 7, Loss: 1.8423850536346436
Epoch 8, Loss: 1.8358953073620796
Epoch 9, Loss: 1.8483182688554127
Epoch 10, Loss: 1.8390381311376889
              precision    recall  f1-score   support

           0       0.50      0.05      0.09        41
           1       0.00      0.00      0.00        37
           2       0.19      1.00      0.32        35
           3       0.50      0.03      0.05        36
           4       0.00      0.00      0.00        19
           5       0.00      0.00      0.00         8
           7       0.00      0.00      0.00        13

    accuracy                           0.20       189
   macro avg       0.17      0.15      0.07       189
weighted avg       0.24      0.20      0.09       189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
