In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report

In [3]:
# Define the CNN model
class CNN(nn.Module):
    """Single-convolution classifier with independent sigmoid outputs per label.

    Every input row is reduced to `num_filters` features by a convolution whose
    kernel spans the full row width. The rows are then max-pooled, passed through
    dropout and mapped to `output_dim` values squashed into [0, 1].

    Args:
        input_dim: Width of each input row; also the kernel width of the convolution.
        output_dim: Number of output units.
        dropout: Drop probability applied to the pooled features.
        num_filters: Number of convolution filters (default 32).
    """

    def __init__(self, input_dim, output_dim, dropout, num_filters=32):
        super().__init__()

        # The kernel covers a whole row, so each row yields one value per filter.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(1, input_dim))
        # The linear layer consumes exactly the pooled filter activations.
        self.fc = nn.Linear(num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-label probabilities for a batch.

        Args:
            x: Tensor of shape (batch, 1, rows, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) with values in [0, 1].
        """
        x = torch.relu(self.conv1(x))  # (batch, filters, rows, 1)
        x = x.squeeze(-1)              # (batch, filters, rows)
        # Global max-pool over the row axis. The kernel size has to be read AFTER
        # the squeeze: before it the last axis is the collapsed width (always 1),
        # which would turn the pooling into a no-op for multi-row inputs.
        x = torch.max_pool1d(x, kernel_size=x.size(-1)).squeeze(-1)  # (batch, filters)
        x = self.dropout(x)
        x = self.fc(x)
        return self.sigmoid(x)

In [4]:
# Load data from Excel file
excel_file = '../data/adjusted-labels-multiclass.xlsx'
# Rows without a sentence cannot be vectorized. Rebinding `df` (instead of
# dropping in place) keeps the cell free of in-place mutation.
df = pd.read_excel(excel_file).dropna(subset=['Sentence'])

# Preprocess data: the sentences are the inputs, every other column is a label
X_text = df['Sentence'].values.tolist()
y = df.drop(columns=['Sentence']).values.astype('float32')

# Split data into train and test sets (fixed random_state keeps the split reproducible)
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=47)

# TF-IDF Vectorization: the vocabulary is fitted on the training split only and
# merely applied to the test split
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# Use the GPU when one is available; fall back to the CPU instead of raising
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert data to PyTorch tensors on the selected device. The two unsqueeze calls
# turn (samples, features) into (samples, 1, 1, features).
X_train = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32).unsqueeze(1).unsqueeze(2).to(device)
X_test = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32).unsqueeze(1).unsqueeze(2).to(device)
y_train_tensor = torch.tensor(y_train).to(device)
y_test_tensor = torch.tensor(y_test).to(device)

In [ ]:
# Hyperparameters
input_dim = X_train.shape[3]  # Input dimension is the number of features from TF-IDF
output_dim = y.shape[1]  # Number of classes (equal to number of labels)
dropout = 0.5
learning_rate = 0.001
epochs = 100
seed = 47  # same value as the train/test split's random_state

# Seed torch before the model is built: weight initialisation, dropout and
# DataLoader shuffling all draw from torch's random number generators.
# NOTE(review): some GPU kernels can remain non-deterministic even when seeded.
torch.manual_seed(seed)

# Initialize the model on the same device as the training tensors
model = CNN(input_dim, output_dim, dropout).to(X_train.device)

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create DataLoader for training data
train_dataset = TensorDataset(X_train, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [ ]:
# Training loop
def train(model, criterion, optimizer, train_loader, num_epochs=epochs):
    """Optimise `model` on `train_loader` and log the loss on every 10th epoch.

    Args:
        model: Module to train; switched to training mode before the first epoch.
        criterion: Loss callable invoked as `criterion(outputs, targets)`.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of `(inputs, targets)` batches exposing `.dataset`.
        num_epochs: Number of passes over the loader. The default is the value
            the global `epochs` had when this function was defined.

    Returns:
        None. Progress is reported through `print`.
    """
    model.train()
    for epoch in range(num_epochs):
        loss_sum = 0.0
        for batch_inputs, batch_targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so a smaller final batch does
            # not skew the epoch figure (assumes the criterion averages per batch).
            batch_size = batch_inputs.size(0)
            loss_sum += batch_loss.item() * batch_size
        epoch_loss = loss_sum / len(train_loader.dataset)
        is_logging_epoch = (epoch + 1) % 10 == 0
        if is_logging_epoch:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Fit the model on the training loader for the default number of epochs
train(model=model, criterion=criterion, optimizer=optimizer, train_loader=train_loader)

In [ ]:
# Testing function
def evaluate(model, criterion, test_loader, threshold=0.5, target_names=None):
    """Print test loss, accuracy and a per-label classification report.

    Reads the notebook globals `learning_rate` and `epochs` for the summary line,
    and `df` for the label names when `target_names` is not supplied.

    Args:
        model: Module to evaluate; switched to eval mode.
        criterion: Loss callable invoked as `criterion(outputs, targets)`.
        test_loader: Iterable of `(inputs, targets)` batches exposing `.dataset`.
        threshold: Outputs strictly above this value count as positive predictions
            in the classification report (default 0.5).
        target_names: Optional label names for the report. Defaults to every
            column of `df` except 'Sentence', i.e. the columns the label matrix
            was built from.

    Returns:
        None. All results are reported through `print`.
    """
    model.eval()
    all_targets = []
    all_outputs = []
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Weight by batch size so a smaller last batch does not skew the mean.
            total_loss += loss.item() * inputs.size(0)
            # Accuracy compares the highest-scoring label with the arg-max of the target.
            # NOTE(review): only meaningful if every sample has exactly one positive label - confirm.
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == torch.argmax(targets, 1)).sum().item()
            total += targets.size(0)
            all_targets.append(targets.cpu())
            all_outputs.append(outputs.cpu())
    avg_loss = total_loss / len(test_loader.dataset)
    print(f"Test Loss: {avg_loss:.4f}")
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

    # Everything was moved to the CPU batch by batch, so plain numpy conversion is enough.
    y_true = torch.vstack(all_targets).numpy()
    y_pred = (torch.vstack(all_outputs) > threshold).numpy()

    if target_names is None:
        # Derive the names the same way the label matrix was built (all columns
        # except 'Sentence') rather than assuming 'Sentence' is the first column.
        target_names = [str(name) for name in df.columns.drop('Sentence')]

    # Adam is the optimizer; the criterion is the BCE loss.
    print(f"Optimizer: Adam, Learning Rate {learning_rate}, Loss: BCE Loss, Epochs: {epochs}")
    print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))


# Wrap the held-out tensors in a DataLoader; evaluation keeps the default (unshuffled) order
test_dataset = TensorDataset(X_test, y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Report loss, accuracy and the classification report for the test split
evaluate(model=model, criterion=criterion, test_loader=test_loader)