In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported local
# modules (e.g. utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exercise 1

<img src="./images/01.png" width=800>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision 
import math
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader

from tqdm.autonotebook import tqdm

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

import pandas as pd
import time
from torchinfo import summary
from utils import train_network, set_seed, weight_reset, accuracy_score_wrapper

In [None]:
import wandb
# Log in to Weights & Biases; no credentials are hardcoded in this notebook.
wandb.login()

In [None]:
# Reproducibility: request deterministic cuDNN algorithms and fix the seed.
torch.backends.cudnn.deterministic = True
set_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [None]:
def process_csv_to_tuples(df):
    """
    Combine the 'Title' and 'Description' columns of a dataframe into (label, text) tuples.

    The input dataframe is left unmodified (no helper column is added to it).

    Args:
        df (DataFrame): input dataframe with 'Class Index', 'Title' and
            'Description' columns.

    Returns:
        list: A list of tuples in row order, where each tuple is
            (Class Index, Title + " " + Description).
    """
    # Cast both columns to str so non-string cells concatenate instead of raising.
    # The result is kept in a local Series rather than written back into `df`,
    # so the caller's dataframe is not mutated as a side effect.
    combined_text = df['Title'].astype(str) + " " + df['Description'].astype(str)

    # Pair labels and texts column-wise; this avoids building a Series per row
    # as iterrows() does.
    return list(zip(df['Class Index'].tolist(), combined_text.tolist()))

# Example usage: convert a training dataframe into (label, text) tuples.
# NOTE(review): `train_df` is not defined in any cell shown before this one; it
# presumably has to be loaded first (e.g. pd.read_csv on the training CSV) — confirm,
# otherwise this cell fails on a fresh kernel.
train_data_tuples = process_csv_to_tuples(train_df)

In [None]:
import torchtext
from torchtext.datasets import AG_NEWS

# Load the AG_NEWS train and test splits, using ./data as the dataset root.
train_iter, test_iter = AG_NEWS(root='./data', split=('train', 'test'))
# Materialize both iterators into lists: later cells index into train_dataset
# and iterate over it more than once.
train_dataset = list(train_iter)
test_dataset = list(test_iter)

In [None]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

from collections import Counter
# Import the factory under an alias so that binding its result to `vocab`
# below does not rebind the imported function's own name.
from torchtext.vocab import vocab as build_vocab

# Count token frequencies over the training texts only.
counter = Counter()
for (label, line) in train_dataset:
    counter.update(tokenizer(line))

# Tokens seen fewer than 10 times are dropped; the four special tokens are always kept.
vocab = build_vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
# Without a default index, looking up any token that is not in the vocabulary
# (everything below min_freq, plus unseen test tokens) raises. Map those to '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [None]:
def text_transform(x):
    """Encode a raw text string as a flat list of vocabulary indices.

    The text is split with the module-level `tokenizer`, each token is looked up
    in the module-level `vocab`, and the sequence is wrapped in the '<BOS>' and
    '<EOS>' marker indices.

    Args:
        x (str): raw text.

    Returns:
        list: [<BOS> index, one index per token..., <EOS> index].
    """
    # Each marker index is wrapped in its own list: adding a bare int to a list
    # raises TypeError. Tokens are looked up one at a time with vocab[token];
    # calling the vocab object is meant for a list of tokens, not a single string.
    return [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]


def label_transform(x):
    """Return the label shifted down by one (``x - 1``)."""
    shifted = x - 1
    return shifted

# Sanity check: show the encoded form of the first training example's text.
print(text_transform(train_dataset[0][1]))

In [None]:
# Hyperparameters; B is used as the DataLoader batch size.
embed_dim = 128
B = 64
epochs = 15

# Sizes and indices derived from the vocabulary and the training data.
VOCAB_SIZE = len(vocab)
padding_idx = vocab['<PAD>']
# Each dataset item is a (label, text) pair; count the distinct labels.
NUM_CLASS = len(np.unique([label for label, _ in train_dataset]))

print("Vocab:", VOCAB_SIZE)
print("Num Classes:", NUM_CLASS)

In [None]:
def pad_batch(batch):
    """Collate (label, text) pairs into a padded index tensor and a label tensor.

    Labels go through the module-level `label_transform` and texts through
    `text_transform`; shorter sequences are right-padded with `padding_idx`.

    Args:
        batch: sequence of (label, text) pairs.

    Returns:
        tuple: (x, y). x is an int64 tensor stacking the encoded texts, each padded
        to the longest sequence in this batch (shape (len(batch), max_len), assuming
        text_transform returns a flat list of indices). y is an int64 tensor of the
        transformed labels.
    """
    labels = [label_transform(z[0]) for z in batch]
    texts = [torch.tensor(text_transform(z[1]), dtype=torch.int64) for z in batch]

    # Pad on the right only, up to the longest sequence in this batch, so the
    # tensors share a length and can be stacked.
    max_len = max(text.size(0) for text in texts)
    texts = [F.pad(text, (0, max_len - text.size(0)), value=padding_idx) for text in texts]
    x = torch.stack(texts)
    # The keyword is `dtype`; the previous `stype` spelling made torch.tensor raise TypeError.
    y = torch.tensor(labels, dtype=torch.int64)
    return x, y

In [None]:
# Both loaders build batches of size B with pad_batch; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=B, shuffle=True, collate_fn=pad_batch)
test_loader = DataLoader(test_dataset, batch_size=B, collate_fn=pad_batch)

## Models