# MIE524 - Assignment 5
Please complete this notebook for Assignment 5.

## Imports

In [1]:
import pandas as pd
import numpy as np

## Q1 - Neural Collaborative Filtering
[[paper]](https://arxiv.org/pdf/1708.05031.pdf)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Data
The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies.

In [76]:
data = pd.read_csv("q1-ratings.csv")

### Encode Data

In [77]:
# split train and validation before encoding
np.random.seed(3)
torch.manual_seed(666)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [None]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continuous ids.
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [None]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids.
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [None]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

### a) Evaluate model parameters for NCF-GMF

In [None]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    # YOUR CODE HERE
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(df_train.userId.values) # .cuda()
        items = torch.LongTensor(df_train.movieId.values) #.cuda()
        ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # the MSE loss on the validation set of each epoch
        val_loss = test_loss(model)

        print(f"Epoch {i + 1}/{epochs}, Train Loss: {loss.item():.3f}, Validation Loss: {val_loss:.4f}")

    # Final validation MSE loss
    final_val_loss = test_loss(model)
    print(f"Final Validation Loss: {final_val_loss:.4f}")
    return final_val_loss

def test_loss(model):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)

    return loss.item()

In [None]:
num_users = len(df_train['userId'].unique())
num_items = len(df_train['movieId'].unique())
print(f"Number of users: {num_users}, Number of items: {num_items}")

Number of users: 610, Number of items: 8998


In [None]:
model_NCF_GMF_Benchmark = MF(num_users, num_items, emb_size=100)
model_NCF_GMF_1 = MF(num_users, num_items, emb_size=256)

In [None]:
print("Training model_NCF_GMF_Benchmark with epochs=10, lr=0.01, wd=0")
c1 = train_epocs(model_NCF_GMF_Benchmark, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_GMF_Benchmark with epochs=10, lr=0.01, wd=0.001")
c2 = train_epocs(model_NCF_GMF_Benchmark, epochs=10, lr=0.01, wd=0.001)
print("\n")
print("Training model_NCF_GMF_1 with epochs=10, lr=0.01, wd=0")
c3 = train_epocs(model_NCF_GMF_1, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_GMF_1 with epochs=10, lr=0.01, wd=0.001")
c4 = train_epocs(model_NCF_GMF_1, epochs=10, lr=0.01, wd=0.001)

Training model_NCF_GMF_Benchmark with epochs=10, lr=0.01, wd=0
Epoch 1/10, Train Loss: 12.912, Validation Loss: 12.5928
Epoch 2/10, Train Loss: 12.504, Validation Loss: 12.0590
Epoch 3/10, Train Loss: 11.973, Validation Loss: 11.4104
Epoch 4/10, Train Loss: 11.328, Validation Loss: 10.6565
Epoch 5/10, Train Loss: 10.578, Validation Loss: 9.8087
Epoch 6/10, Train Loss: 9.736, Validation Loss: 8.8813
Epoch 7/10, Train Loss: 8.814, Validation Loss: 7.8919
Epoch 8/10, Train Loss: 7.831, Validation Loss: 6.8621
Epoch 9/10, Train Loss: 6.808, Validation Loss: 5.8178
Epoch 10/10, Train Loss: 5.771, Validation Loss: 4.7893
Final Validation Loss: 4.7893


Training model_NCF_GMF_Benchmark with epochs=10, lr=0.01, wd=0.001
Epoch 1/10, Train Loss: 4.750, Validation Loss: 4.1866
Epoch 2/10, Train Loss: 4.162, Validation Loss: 3.6519
Epoch 3/10, Train Loss: 3.640, Validation Loss: 3.2050
Epoch 4/10, Train Loss: 3.205, Validation Loss: 2.8603
Epoch 5/10, Train Loss: 2.870, Validation Loss: 2.6238
Epo

### b) NCF-MLP Model

<img src=https://miro.medium.com/v2/resize:fit:1400/format:webp/1*aP-Mx266ExwoWZPSdHtYpA.png width="600">


In [None]:
class my_NCF_MLP(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, hidden_size=10):
        super(my_NCF_MLP, self).__init__()

        # YOUR CODE HERE
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

        # 2 linear layers
        self.fc1 = nn.Linear(emb_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        # relu activation function
        self.relu = nn.ReLU()

    def forward(self, u, v):

        # YOUR CODE HERE
        u = self.user_emb(u)
        v = self.item_emb(v)
        # concatenate embeddings
        x = torch.cat([u, v], dim=1)
        # pass through dense layers
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        # Output shape: (batch_size,)
        return x.squeeze()

Train model

In [None]:
model_NCF_MLP_Benchmark = my_NCF_MLP(num_users, num_items, emb_size=100, hidden_size=10)
model_NCF_MLP_1 = my_NCF_MLP(num_users, num_items, emb_size=100, hidden_size=5)
model_NCF_MLP_2 = my_NCF_MLP(num_users, num_items, emb_size=256, hidden_size=10)

### c) Evaluate model parameters for NCF-MLP

In [None]:
print("Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0")
c5 = train_epocs(model_NCF_MLP_Benchmark, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0.01")
c6 = train_epocs(model_NCF_MLP_Benchmark, epochs=10, lr=0.01, wd=0.01)
print("\n")
print("Training model_NCF_MLP_1 with epochs=10, lr=0.01, wd=0")
c7 = train_epocs(model_NCF_MLP_1, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_MLP_1 with epochs=10, lr=0.01, wd=0.01")
c8 = train_epocs(model_NCF_MLP_1, epochs=10, lr=0.01, wd=0.01)
print("\n")
print("Training model_NCF_MLP_2 with epochs=10, lr=0.01, wd=0")
c9 = train_epocs(model_NCF_MLP_2, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_MLP_2 with epochs=10, lr=0.01, wd=0.01")
c10 = train_epocs(model_NCF_MLP_2, epochs=10, lr=0.01, wd=0.01)

Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0
Epoch 1/10, Train Loss: 13.967, Validation Loss: 13.5913
Epoch 2/10, Train Loss: 13.497, Validation Loss: 13.2218
Epoch 3/10, Train Loss: 13.129, Validation Loss: 12.7547
Epoch 4/10, Train Loss: 12.664, Validation Loss: 12.1882
Epoch 5/10, Train Loss: 12.101, Validation Loss: 11.4885
Epoch 6/10, Train Loss: 11.405, Validation Loss: 10.6334
Epoch 7/10, Train Loss: 10.555, Validation Loss: 9.6170
Epoch 8/10, Train Loss: 9.544, Validation Loss: 8.4465
Epoch 9/10, Train Loss: 8.381, Validation Loss: 7.1434
Epoch 10/10, Train Loss: 7.087, Validation Loss: 5.7492
Final Validation Loss: 5.7492


Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0.01
Epoch 1/10, Train Loss: 5.703, Validation Loss: 5.1345
Epoch 2/10, Train Loss: 5.084, Validation Loss: 4.5645
Epoch 3/10, Train Loss: 4.509, Validation Loss: 4.0390
Epoch 4/10, Train Loss: 3.979, Validation Loss: 3.5417
Epoch 5/10, Train Loss: 3.478, Validation Loss: 3.1091


### d) Compare your best NCF-GMF and NCF-MLP

In [None]:
print("NCF-GMF validation MSE loss: ")
print(f"{c1:.2f}, {c2:.2f}, {c3:.2f}, {c4:.2f}")
print("NCF-MLP validation MSE loss: ")
print(f"{c5:.2f}, {c6:.2f}, {c7:.2f}, {c8:.2f}, {c9:.2f}, {c10:.2f}")

NCF-GMF validation MSE loss: 
4.79, 2.62, 1.05, 2.81
NCF-MLP validation MSE loss: 
5.75, 2.19, 9.21, 5.78, 2.34, 3.32


Written comments:

My best configuration for NCF-GMF is achieving a final validation loss of 1.05 when the model GMF with embedding size 256 and no regularization. My best configuration for NCF-MLP is achieving a final validation loss of 2.19 when model MLP with embedding size 100, hidden layer 10 and with regularization wd 0.01. My best NCF-GMF model achieves a lower validation loss compared to my best NCF-MLP model, offering both simplicity and better validation performance for current dataset.

Increasing embedding size from 100 to 256 improved performance for models NCF-GMF as well as NCF-MLP, allowing the models to capture more detailed latent features of users and items.
Weight decay helps control overfitting, especially for complex models like NCF_MLP. However, I noticed that when embedding size is 256, regularization hurts the performance for both models. It is probably because that models with larger embedding sizes have greater capacity and over-constraining the models with regularization restrict their ability to learn.
More hidden layers make MLP model more complex but have better performance. Smaller hidden layers may lead to underfitting with consistently high losses.

### e) Change ratings to 1 or 0.

In [None]:
# YOUR CODE HERE
# modify ratings in train and validation data such that any rating greater or equal to 3 is mapped to 1 and any rating less than 3 is mapped to 0
df_train['rating'] = np.where(df_train['rating'] >= 3, 1, 0)
df_val['rating'] = np.where(df_val['rating'] >= 3, 1, 0)

In [None]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return torch.sigmoid((u*v).sum(1))

In [None]:
class my_NCF_MLP(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, hidden_size=10):
        super(my_NCF_MLP, self).__init__()

        # YOUR CODE HERE
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

        # 2 linear layers
        self.fc1 = nn.Linear(emb_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        # relu activation function
        self.relu = nn.ReLU()

    def forward(self, u, v):

        # YOUR CODE HERE
        u = self.user_emb(u)
        v = self.item_emb(v)
        # concatenate embeddings
        x = torch.cat([u, v], dim=1)
        # pass through dense layers
        x = self.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # Sigmoid for binary classification
        # Output shape: (batch_size,)
        return x.squeeze()

In [None]:
model_NCF_GMF_1 = MF(num_users, num_items, emb_size=256)
model_NCF_MLP_Benchmark = my_NCF_MLP(num_users, num_items, emb_size=100, hidden_size=10)
print("Training model_NCF_GMF_1 with epochs=10, lr=0.01, wd=0")
c3 = train_epocs(model_NCF_GMF_1, epochs=10, lr=0.01, wd=0)
print("\n")
print("Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0.01")
c6 = train_epocs(model_NCF_MLP_Benchmark, epochs=10, lr=0.01, wd=0.01)
print("\n")

Training model_NCF_GMF_1 with epochs=10, lr=0.01, wd=0
Epoch 1/10, Train Loss: 0.227, Validation Loss: 0.2058
Epoch 2/10, Train Loss: 0.206, Validation Loss: 0.1833
Epoch 3/10, Train Loss: 0.182, Validation Loss: 0.1617
Epoch 4/10, Train Loss: 0.159, Validation Loss: 0.1444
Epoch 5/10, Train Loss: 0.139, Validation Loss: 0.1335
Epoch 6/10, Train Loss: 0.125, Validation Loss: 0.1285
Epoch 7/10, Train Loss: 0.117, Validation Loss: 0.1274
Epoch 8/10, Train Loss: 0.113, Validation Loss: 0.1281
Epoch 9/10, Train Loss: 0.111, Validation Loss: 0.1293
Epoch 10/10, Train Loss: 0.109, Validation Loss: 0.1303
Final Validation Loss: 0.1303


Training model_NCF_MLP_Benchmark with epochs=10, lr=0.01, wd=0.01
Epoch 1/10, Train Loss: 0.220, Validation Loss: 0.2117
Epoch 2/10, Train Loss: 0.213, Validation Loss: 0.2099
Epoch 3/10, Train Loss: 0.212, Validation Loss: 0.2090
Epoch 4/10, Train Loss: 0.211, Validation Loss: 0.2082
Epoch 5/10, Train Loss: 0.210, Validation Loss: 0.2067
Epoch 6/10, Train Los

Written answer:

I modified both class of the model using torch.sigmoid since it aligns the output with the new dataset requirements (binary) and makes the model suitable for generating outputs of binary probabilities.
The NCF-GMF model maintained a low and stable validation loss, better than MCF-MLP. The NCF-MLP model, while decreased steadily across all epochs, might need more epochs to converge. However, since while the binary ratings significantly reduce the complexity of the output space, make the binary classification task easier. Both models shows better performance than before.  

### f) Hyperparameter tuning with ALS

In [None]:
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import SparkContext, SparkConf
# create the session
conf = SparkConf()

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# convert Pandas to Spark DataFrames
spark_train = spark.createDataFrame(df_train)
spark_val = spark.createDataFrame(df_val)

In [None]:
# insprired by Martin Keenan https://github.com/mkeenan195/Movie-Recommender/blob/main/als-model.py
# parameter combinations
param_grid = [
    {"maxIter": 10, "rank": 10, "regParam": 0.001},
    {"maxIter": 10, "rank": 10, "regParam": 0.1},
    {"maxIter": 10, "rank": 5, "regParam": 0.001},
    {"maxIter": 10, "rank": 5, "regParam": 0.01},
    {"maxIter": 5, "rank": 10, "regParam": 0.001},
    {"maxIter": 5, "rank": 10, "regParam": 0.1}
]
results = []
for params in param_grid:
    als = ALS(
        maxIter=params["maxIter"],
        rank=params["rank"],
        regParam=params["regParam"],
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        coldStartStrategy="drop"
    )
    model = als.fit(spark_train)
    # Compute *training* RMSE
    predictions = model.transform(spark_train)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    results.append((params, rmse))
    print(f"Params: {params}, Training RMSE: {rmse}")
    # Compute *validation* RMSE
    predictions = model.transform(spark_val)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    results.append((params, rmse))
    print(f"Params: {params}, Validation RMSE: {rmse}")

Params: {'maxIter': 10, 'rank': 10, 'regParam': 0.001}, Training RMSE: 0.18202973453389293
Params: {'maxIter': 10, 'rank': 10, 'regParam': 0.001}, Validation RMSE: 0.5166449494225722
Params: {'maxIter': 10, 'rank': 10, 'regParam': 0.1}, Training RMSE: 0.30613510757570583
Params: {'maxIter': 10, 'rank': 10, 'regParam': 0.1}, Validation RMSE: 0.3619350995949266
Params: {'maxIter': 10, 'rank': 5, 'regParam': 0.001}, Training RMSE: 0.24110511048250025
Params: {'maxIter': 10, 'rank': 5, 'regParam': 0.001}, Validation RMSE: 0.49450322668700664
Params: {'maxIter': 10, 'rank': 5, 'regParam': 0.01}, Training RMSE: 0.24104094312897423
Params: {'maxIter': 10, 'rank': 5, 'regParam': 0.01}, Validation RMSE: 0.4029735238466867
Params: {'maxIter': 5, 'rank': 10, 'regParam': 0.001}, Training RMSE: 0.19888033600799354
Params: {'maxIter': 5, 'rank': 10, 'regParam': 0.001}, Validation RMSE: 0.5056575923152006
Params: {'maxIter': 5, 'rank': 10, 'regParam': 0.1}, Training RMSE: 0.3082351511317649
Params: {

In [None]:
sc.stop()

Written Answer:

The maxIter parameter controls the number of iterations the ALS optimization algorithm performs. Increasing it allows for better convergence, as shown in the result, where using maxIter 10 is better than 5. The rank parameter determines the number of latent factors used in matrix factorization. Higher ranks, such as 10,  capture more complex patterns but increase the risk of overfitting. The regParam is the parameter for regularization, which controls overfitting by penalizing the complexity of the model. A smaller regParam, such as 0.001, allows the model to fit the training data closely so that it achieved a lower training RMSE. However, this can lead to poorer generalization with higher validation RMSE. By trying at least 6 hyperparameter combinations, the combination of maxIter = 10, rank = 10, and regParam = 0.1 offers the best validation RMSE (0.3619), showing that the model have better generalization with balancing model complexity and regularization.

## Q2 - Zero-Shot Text Classification
https://huggingface.co/tasks/zero-shot-classification

In [54]:
!pip install transformers
!pip install datasets



In [55]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset

### Data

In [56]:
arxiv_data = pd.read_csv('q2-arxiv-metadata-oai-snapshot-small.csv', index_col = 0)
print(f'Size of data: {len(arxiv_data)}')
# find smallest category
min_samples_per_category = arxiv_data['first_category_english'].value_counts().min()
# make new df to store balanced data sample
balanced_sample = pd.DataFrame()
# go thru category and add random sample to new df
for category, group in arxiv_data.groupby('first_category_english'):
    balanced_sample = pd.concat([balanced_sample, group.sample(min_samples_per_category, random_state=42)])
print(f'Size of downsampled data: {len(balanced_sample)}')
# convert to huggingface dataset for faster processing
balanced_sample = Dataset.from_pandas(balanced_sample)

candidate_labels = list(set(balanced_sample['first_category_english']))

Size of data: 51268
Size of downsampled data: 999


### a) Precict categories and compute performance

In [57]:
pipe = pipeline(model="facebook/bart-large-mnli",device = 0)

In [61]:
def predict_category(batch):
  result = pipe(batch['title'], candidate_labels=candidate_labels)
  batch['predicted_category'] = result['labels'][0]
  return batch

# using map to predict category
balanced_sample = balanced_sample.map(predict_category, batched=False)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [63]:
true_categories = balanced_sample['first_category_english']
predicted_categories = balanced_sample['predicted_category']

accuracy = accuracy_score(true_categories, predicted_categories)
f1 = f1_score(true_categories, predicted_categories, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.1652
F1 Score: 0.1575


### b) Poor performing categories

In [64]:
from sklearn.metrics import classification_report

# classification report to analyze category-wise performance
report = classification_report(true_categories, predicted_categories, target_names=candidate_labels)
print(report)

                                                 precision    recall  f1-score   support

        Computer Vision and Pattern Recognition       0.03      0.03      0.03        37
                          Emerging Technologies       0.20      0.24      0.22        37
                          Programming Languages       0.09      0.24      0.13        37
                             Information Theory       0.00      0.00      0.00        37
                       Computation and Language       0.62      0.14      0.22        37
                Social and Information Networks       0.33      0.03      0.05        37
               Computer Science and Game Theory       0.00      0.00      0.00        37
                          Mathematical Software       0.39      0.19      0.25        37
                       Computational Complexity       0.18      0.19      0.19        37
Computational Engineering, Finance, and Science       0.86      0.16      0.27        37
                    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Written answer:

The facebook/bart-large-mnli model might perform poorly due to category overlap and unclear distinctions between some labels. For example, Information Theory category have that precision, recall, and F1-score are all 0.00. Information Theory is a board field, and the model might match the paper into a more specific field. The model also depends heavily on how labels are phrased, so the titles may be too short and not match the content well. Also, the bart model is pre-trained on natural language and might not handle technical terms in computer science effectively.

Using a hypothesis template can help the model align better with the text by providing a more structured prompt. For example, writing "hypothesis template = "This scholarly articles is about {}."" makes the task more explicit. Also, enabling multi-label classification allows the model to assign multiple categories to one text, which is useful when the text covers overlapping topics. These changes can make the predictions more accurate and better suited for technical classifications.

### c) Evaluate different language models

In [28]:
pipe = pipeline(model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",device = 0)
balanced_sample = balanced_sample.map(predict_category, batched=False)

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [29]:
true_categories = balanced_sample['first_category_english']
predicted_categories = balanced_sample['predicted_category']

accuracy = accuracy_score(true_categories, predicted_categories)
f1 = f1_score(true_categories, predicted_categories, average='weighted')

print(f"Accuracy MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli: {accuracy:.4f}")
print(f"F1 Score MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli: {f1:.4f}")

Accuracy MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli: 0.1902
F1 Score MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli: 0.1849


In [65]:
pipe = pipeline(model="MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli",device = 0)
balanced_sample = balanced_sample.map(predict_category, batched=False)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [66]:
true_categories = balanced_sample['first_category_english']
predicted_categories = balanced_sample['predicted_category']

accuracy = accuracy_score(true_categories, predicted_categories)
f1 = f1_score(true_categories, predicted_categories, average='weighted')

print(f"Accuracy MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli: {accuracy:.4f}")
print(f"F1 Score MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli: {f1:.4f}")

Accuracy MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli: 0.0981
F1 Score MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli: 0.0818


Written answer:

DeBERTa-v3-base-mnli-fever-anli is the best model with an accuracy of 0.1902 and an F1 score of 0.1849. It has stronger contextual understanding and pretrained on the MultiNLI, Fever-NLI and Adversarial-NLI (ANLI) datasets. The facebook/bart-large-mnli model does moderately well, with an accuracy of 0.1652 and an F1 score of 0.1575. The multilingual-MiniLMv2-L6-mnli-xnl model has highest inference speed so it is expected to have poor performance. Also, multilingual-MiniLMv2-L6-mnli-xnl focuses on multilingual tasks more than the other 2 models. However, these models performances are not well enough, possibly because the weakness of classification on similar technical terms.

### d) Repear parts a) and b) with the abstracts

In [33]:
pipe = pipeline(model="facebook/bart-large-mnli",device = 0)

def predict_category(batch):
  result = pipe(batch['abstract'], candidate_labels=candidate_labels)
  batch['predicted_category'] = result['labels'][0]
  return batch

# using map to predict category
balanced_sample = balanced_sample.map(predict_category, batched=False)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [34]:
true_categories = balanced_sample['first_category_english']
predicted_categories = balanced_sample['predicted_category']

accuracy = accuracy_score(true_categories, predicted_categories)
f1 = f1_score(true_categories, predicted_categories, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# classification report to analyze category-wise performance
report = classification_report(true_categories, predicted_categories, target_names=candidate_labels)
print(report)

Accuracy: 0.1582
F1 Score: 0.1648
                                                 precision    recall  f1-score   support

        Computer Vision and Pattern Recognition       0.10      0.08      0.09        37
                          Emerging Technologies       0.18      0.22      0.20        37
                          Programming Languages       0.05      0.32      0.08        37
                             Information Theory       0.00      0.00      0.00        37
                       Computation and Language       1.00      0.11      0.20        37
                Social and Information Networks       0.00      0.00      0.00        37
               Computer Science and Game Theory       0.00      0.00      0.00        37
                          Mathematical Software       0.89      0.22      0.35        37
                       Computational Complexity       0.50      0.08      0.14        37
Computational Engineering, Finance, and Science       0.58      0.19      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Written asnwer:

Using the abstract instead of the title with the facebook/bart-large-mnli model resulted in slightly lower performance, with an accuracy of 0.1582 and F1-score of 0.1648 compared to the previous results of 0.1652 and 0.1575 respectively. However, the differences are minor overall. The abstracts may add more context subject so that some subjects like "Formal Languages and Automata Theory" increase the F1 score. F1 scores for some subjects like " Cryptography and Security" drops, which indicates that the additional texts also introduce noise, depending on the clarity and specificity of the abstract's content. I think the additional context in the abstract might confuse the model for categories with overlapping semantics.

### e) Change labels to long descriptions

In [71]:
categories_dict = {
    'cs.AI': 'Covers all areas of AI except Vision, Robotics, Machine Learning, Multiagent Systems, and Computation and Language (Natural Language Processing), which have separate subject areas. In particular, includes Expert Systems, Theorem Proving (although this may overlap with Logic in Computer Science), Knowledge Representation, Planning, and Uncertainty in AI. Roughly includes material in ACM Subject Classes I.2.0, I.2.1, I.2.3, I.2.4, I.2.8, and I.2.11.',
    'cs.AR': 'Covers systems organization and hardware architecture. Roughly includes material in ACM Subject Classes C.0, C.1, and C.5.',
    'cs.CC': 'Covers models of computation, complexity classes, structural complexity, complexity tradeoffs, upper and lower bounds. Roughly includes material in ACM Subject Classes F.1 (computation by abstract devices), F.2.3 (tradeoffs among complexity measures), and F.4.3 (formal languages), although some material in formal languages may be more appropriate for Logic in Computer Science. Some material in F.2.1 and F.2.2, may also be appropriate here, but is more likely to have Data Structures and Algorithms as the primary subject area.',
    'cs.CE': 'Covers applications of computer science to the mathematical modeling of complex systems in the fields of science, engineering, and finance. Papers here are interdisciplinary and applications-oriented, focusing on techniques and tools that enable challenging computational simulations to be performed, for which the use of supercomputers or distributed computing platforms is often required. Includes material in ACM Subject Classes J.2, J.3, and J.4 (economics).',
    'cs.CG': 'Roughly includes material in ACM Subject Classes I.3.5 and F.2.2.',
    'cs.CL': 'Covers natural language processing. Roughly includes material in ACM Subject Class I.2.7. Note that work on artificial languages (programming languages, logics, formal systems) that does not explicitly address natural-language issues broadly construed (natural-language processing, computational linguistics, speech, text retrieval, etc.) is not appropriate for this area.',
    'cs.CR': 'Covers all areas of cryptography and security including authentication, public key cryptosytems, proof-carrying code, etc. Roughly includes material in ACM Subject Classes D.4.6 and E.3.',
    'cs.CV': 'Covers image processing, computer vision, pattern recognition, and scene understanding. Roughly includes material in ACM Subject Classes I.2.10, I.4, and I.5.',
    'cs.CY': 'Covers impact of computers on society, computer ethics, information technology and public policy, legal aspects of computing, computers and education. Roughly includes material in ACM Subject Classes K.0, K.2, K.3, K.4, K.5, and K.7.',
    'cs.DB': 'Covers database management, datamining, and data processing. Roughly includes material in ACM Subject Classes E.2, E.5, H.0, H.2, and J.1.',
    'cs.DC': 'Covers fault-tolerance, distributed algorithms, stabilility, parallel computation, and cluster computing. Roughly includes material in ACM Subject Classes C.1.2, C.1.4, C.2.4, D.1.3, D.4.5, D.4.7, E.1.',
    'cs.DL': 'Covers all aspects of the digital library design and document and text creation. Note that there will be some overlap with Information Retrieval (which is a separate subject area). Roughly includes material in ACM Subject Classes H.3.5, H.3.6, H.3.7, I.7.',
    'cs.DM': 'Covers combinatorics, graph theory, applications of probability. Roughly includes material in ACM Subject Classes G.2 and G.3.',
    'cs.DS': 'Covers data structures and analysis of algorithms. Roughly includes material in ACM Subject Classes E.1, E.2, F.2.1, and F.2.2.',
    'cs.ET': 'Covers approaches to information processing (computing, communication, sensing) and bio-chemical analysis based on alternatives to silicon CMOS-based technologies, such as nanoscale electronic, photonic, spin-based, superconducting, mechanical, bio-chemical and quantum technologies (this list is not exclusive). Topics of interest include (1) building blocks for emerging technologies, their scalability and adoption in larger systems, including integration with traditional technologies, (2) modeling, design and optimization of novel devices and systems, (3) models of computation, algorithm design and programming for emerging technologies.',
    'cs.FL': 'Covers automata theory, formal language theory, grammars, and combinatorics on words. This roughly corresponds to ACM Subject Classes F.1.1, and F.4.3. Papers dealing with computational complexity should go to cs.CC; papers dealing with logic should go to cs.LO.',
    'cs.GL': 'Covers introductory material, survey material, predictions of future trends, biographies, and miscellaneous computer-science related material. Roughly includes all of ACM Subject Class A, except it does not include conference proceedings (which will be listed in the appropriate subject area).',
    'cs.GR': 'Covers all aspects of computer graphics. Roughly includes material in all of ACM Subject Class I.3, except that I.3.5 is is likely to have Computational Geometry as the primary subject area.',
    'cs.GT': 'Covers all theoretical and applied aspects at the intersection of computer science and game theory, including work in mechanism design, learning in games (which may overlap with Learning), foundations of agent modeling in games (which may overlap with Multiagent systems), coordination, specification and formal methods for non-cooperative computational environments. The area also deals with applications of game theory to areas such as electronic commerce.',
    'cs.HC': 'Covers human factors, user interfaces, and collaborative computing. Roughly includes material in ACM Subject Classes H.1.2 and all of H.5, except for H.5.1, which is more likely to have Multimedia as the primary subject area.',
    'cs.IR': 'Covers indexing, dictionaries, retrieval, content and analysis. Roughly includes material in ACM Subject Classes H.3.0, H.3.1, H.3.2, H.3.3, and H.3.4.',
    'cs.IT': 'Covers theoretical and experimental aspects of information theory and coding. Includes material in ACM Subject Class E.4 and intersects with H.1.1.',
    'cs.LG': 'Papers on all aspects of machine learning research (supervised, unsupervised, reinforcement learning, bandit problems, and so on) including also robustness, explanation, fairness, and methodology. cs.LG is also an appropriate primary category for applications of machine learning methods.',
    'cs.LO': 'Covers all aspects of logic in computer science, including finite model theory, logics of programs, modal logic, and program verification. Programming language semantics should have Programming Languages as the primary subject area. Roughly includes material in ACM Subject Classes D.2.4, F.3.1, F.4.0, F.4.1, and F.4.2; some material in F.4.3 (formal languages) may also be appropriate here, although Computational Complexity is typically the more appropriate subject area.',
    'cs.MA': 'Covers multiagent systems, distributed artificial intelligence, intelligent agents, coordinated interactions. and practical applications. Roughly covers ACM Subject Class I.2.11.',
    'cs.MM': 'Roughly includes material in ACM Subject Class H.5.1.',
    'cs.MS': 'Roughly includes material in ACM Subject Class G.4.',
    'cs.NA': 'cs.NA is an alias for math.NA. Roughly includes material in ACM Subject Class G.1.',
    'cs.NE': 'Covers neural networks, connectionism, genetic algorithms, artificial life, adaptive behavior. Roughly includes some material in ACM Subject Class C.1.3, I.2.6, I.5.',
    'cs.NI': 'Covers all aspects of computer communication networks, including network architecture and design, network protocols, and internetwork standards (like TCP/IP). Also includes topics, such as web caching, that are directly relevant to Internet architecture and performance. Roughly includes all of ACM Subject Class C.2 except C.2.4, which is more likely to have Distributed, Parallel, and Cluster Computing as the primary subject area.',
    'cs.OH': 'This is the classification to use for documents that do not fit anywhere else.',
    'cs.OS': 'Roughly includes material in ACM Subject Classes D.4.1, D.4.2., D.4.3, D.4.4, D.4.5, D.4.7, and D.4.9.',
    'cs.PF': 'Covers performance measurement and evaluation, queueing, and simulation. Roughly includes material in ACM Subject Classes D.4.8 and K.6.2.',
    'cs.PL': 'Covers programming language semantics, language features, programming approaches (such as object-oriented programming, functional programming, logic programming). Also includes material on compilers oriented towards programming languages; other material on compilers may be more appropriate in Architecture (AR). Roughly includes material in ACM Subject Classes D.1 and D.3.',
    'cs.RO': 'Roughly includes material in ACM Subject Class I.2.9.',
    'cs.SC': 'Roughly includes material in ACM Subject Class I.1.',
    'cs.SD': 'Covers all aspects of computing with sound, and sound as an information channel. Includes models of sound, analysis and synthesis, audio user interfaces, sonification of data, computer music, and sound signal processing. Includes ACM Subject Class H.5.5, and intersects with H.1.2, H.5.1, H.5.2, I.2.7, I.5.4, I.6.3, J.5, K.4.2.',
    'cs.SE': 'Covers design tools, software metrics, testing and debugging, programming environments, etc. Roughly includes material in all of ACM Subject Classes D.2, except that D.2.4 (program verification) should probably have Logics in Computer Science as the primary subject area.',
    'cs.SI': 'Covers the design, analysis, and modeling of social and information networks, including their applications for on-line information access, communication, and interaction, and their roles as datasets in the exploration of questions in these and other domains, including connections to the social and biological sciences. Analysis and modeling of such networks includes topics in ACM Subject classes F.2, G.2, G.3, H.2, and I.2; applications in computing include topics in H.3, H.4, and H.5; and applications at the interface of computing and other disciplines include topics in J.1--J.7. Papers on computer communication systems and network protocols (e.g. TCP/IP) are generally a closer fit to the Networking and Internet Architecture (cs.NI) category.',
    'cs.SY': 'cs.SY is an alias for eess.SY. This section includes theoretical and experimental research covering all facets of automatic control systems. The section is focused on methods of control system analysis and design using tools of modeling, simulation and optimization. Specific areas of research include nonlinear, distributed, adaptive, stochastic and robust control in addition to hybrid and discrete event systems. Application areas include automotive and aerospace control systems, network control, biological systems, multiagent and cooperative control, robotics, reinforcement learning, sensor networks, control of cyber-physical and energy-related systems, and control of computing systems.'
}

# helper function to match the first_category to first_category_english and the long description
def match_description(dataset, categories_dict):
    mapping = {}
    for example in dataset:
        first_category = example['first_category']
        first_category_english = example['first_category_english']
        long_description = categories_dict.get(first_category, 'Unknown Description')
        mapping[first_category_english] = long_description
    return mapping

categories_dict = match_description(balanced_sample, categories_dict)


In [72]:
# extract long descriptions
long_labels = list(categories_dict.values())

# reverse mapping to category
reverse_mapping = {value: key for key, value in categories_dict.items()}

# modified predict_category function
def predict_category(batch):
  result = pipe(batch['title'], candidate_labels=long_labels)
  predicted_long_label = result['labels'][0]
  # Map to the English description
  batch['predicted_category'] = reverse_mapping[predicted_long_label]
  return batch

pipe = pipeline(model="facebook/bart-large-mnli",device = 0)
# using map to predict category
balanced_sample = balanced_sample.map(predict_category, batched=False)

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [75]:
true_categories = balanced_sample['first_category_english']
predicted_categories = balanced_sample['predicted_category']

accuracy = accuracy_score(true_categories, predicted_categories)
f1 = f1_score(true_categories, predicted_categories, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.0380
F1 Score: 0.0048


Written answers:

NO. The titles are short and lack of enough context to match the long descriptions. The long descriptions are not concise labels but instead detailed explanations, which are not ideal for ZeroShot classification. The model performs better when labels are shorter and capture the essence of the category. It results allocated number of papers into only a few categories that are more general. The model cannot easily disambiguate between similar concepts without explicit training.