In [None]:
# Connect Google Drive so the dataset CSVs stored under /content/drive/MyDrive
# can be read by the cells below. Requires the `google.colab` package.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import all libraries

import numpy as np
import pandas as pd
import ast
import torch

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler

! pip install gensim
import gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Load the test set and the genre lookup table from Drive

test = pd.read_csv('/content/drive/MyDrive/5. Fall 2025/CSC 149/Text Classification/data/test.csv')
genres = pd.read_csv('/content/drive/MyDrive/5. Fall 2025/CSC 149/Text Classification/data/movies_genres.csv')


# Read the labelled data and hold out 10% of it for validation
# (fixed random_state so the split is the same on every run)

train, val = train_test_split(
    pd.read_csv('/content/drive/MyDrive/5. Fall 2025/CSC 149/Text Classification/data/train.csv'),
    test_size = 0.1,
    random_state = 42,
)

# Features are the raw overview texts; labels are the genre_ids column
X_train, y_train = train['overview'], train['genre_ids']
X_val, y_val = val['overview'], val['genre_ids']

In [None]:
# Preview the first rows of the training split (after the train/validation split)

train.head()

Unnamed: 0,movie_id,title,overview,genre_ids
7935,5321,Pet Sematary,After the Creed family's cat is accidentally k...,[27]
4608,1926,Don't Torture a Duckling,A reporter and a promiscuous young woman try t...,[27]
2939,4386,The Women on the 6th Floor,"Paris, in the early 1960s. Jean-Louis Joubert ...",[35]
4336,3082,Contempt,"A philistine in the art film business, Jeremy ...","[18, 10749]"
3658,9464,Ogni maledetto Natale,Massimo Marinelli Lops and Giulia Colardo meet...,[35]


In [None]:
# Tokenize every overview into a list of word tokens with gensim's
# simple_preprocess; these token lists are what Word2Vec is trained on.

X_train, X_val = (
    texts.apply(gensim.utils.simple_preprocess)
    for texts in (X_train, X_val)
)

In [None]:
# Train Word2Vec on the tokenized training overviews only (X_val is not passed in).
# `seed` is pinned and training is restricted to one worker thread: with several
# worker threads the update order is not deterministic, so the embeddings (and
# every downstream score) would change from run to run.

print('Beginning Word2Vec...')
word2vec_model = gensim.models.Word2Vec(
    X_train,
    vector_size = 100,   # dimensionality of each word vector
    window = 5,
    min_count = 2,       # words seen fewer than 2 times are dropped from the vocabulary
    seed = 42,
    workers = 1,
)
print('Done!')

Beginning Word2Vec...
Done!


In [None]:
# Inspect the tokenized training overviews (each entry is now a list of tokens)
X_train

Unnamed: 0,overview
7935,"[after, the, creed, family, cat, is, accidenta..."
4608,"[reporter, and, promiscuous, young, woman, try..."
2939,"[paris, in, the, early, jean, louis, joubert, ..."
4336,"[philistine, in, the, art, film, business, jer..."
3658,"[massimo, marinelli, lops, and, giulia, colard..."
...,...
5226,"[while, trying, to, make, his, sister, wedding..."
5390,"[eva, an, ex, dancer, is, now, living, in, whe..."
860,"[tang, lung, arrives, in, rome, to, help, his,..."
7603,"[in, group, of, astronauts, are, sent, to, inv..."


In [None]:
#  Vectorize the data

# Vocabulary of the trained model as a set; not referenced again in the cells shown here
words = set(word2vec_model.wv.index_to_key)
# Word -> vector lookup of the trained model; read as a global by document_vector below
word_vectors = word2vec_model.wv

def document_vector(doc):
    """
    Represent a tokenized document as the average of its Word2Vec vectors.

    Tokens that are missing from the global `word_vectors` vocabulary are
    skipped. When none of the tokens is known, an all-zero vector of length
    `word_vectors.vector_size` is returned instead.
    """
    known_vectors = [word_vectors[token] for token in doc if token in word_vectors]

    if known_vectors:
        # Element-wise mean across the word vectors found for this document
        return np.mean(known_vectors, axis=0)

    # No token of this document is in the vocabulary
    return np.zeros(word_vectors.vector_size)

# Turn every tokenized overview into one averaged feature vector.
# Each split becomes a 2-D NumPy array with one row per document.
X_train = np.array(list(map(document_vector, X_train)))
X_val = np.array(list(map(document_vector, X_val)))

for message in (
    f"X_train shape after averaging: {X_train.shape}",
    f"X_val shape after averaging: {X_val.shape}",
):
    print(message)

X_train shape after averaging: (7200, 100)
X_val shape after averaging: (800, 100)


In [None]:
# Multi-hot encode the genre labels

# genre_ids is stored as text such as "[18, 10749]"; parse each entry into a list
y_train, y_val = (
    labels.apply(ast.literal_eval)
    for labels in (y_train, y_val)
)

mlb = MultiLabelBinarizer()

# Learn the set of genre ids from the training labels and encode them ...
y_train_binarized = mlb.fit_transform(y_train)

# ... then encode the validation labels with the same, already fitted, binarizer
y_val_binarized = mlb.transform(y_val)

In [None]:
class CustomGaussianNB(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier with a scikit-learn style interface.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training rows
    of that class.

    Attributes set by ``fit``:
        classes_: sorted unique values of ``y``.
        priors_: dict mapping class -> fraction of training rows in that class.
        likelihood_params_: dict mapping class -> {feature index: (mean, variance)}.
    """

    def fit(self, X, y):
        """Estimate class priors and per-class, per-feature mean and variance.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like with one class label per row of ``X``.

        Returns:
            self
        """
        # Work in float64 so the statistics do not inherit float32 rounding
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        self.priors_ = {}
        self.likelihood_params_ = {}

        for k in self.classes_:
            X_k = X[y == k]
            self.priors_[k] = len(X_k) / len(X)

            # Column-wise statistics for all features at once
            means = X_k.mean(axis=0)
            # Small constant keeps the variance strictly positive for constant features
            variances = X_k.var(axis=0) + 1e-9
            self.likelihood_params_[k] = {
                i: (means[i], variances[i]) for i in range(X.shape[1])
            }

        return self

    def _gaussian_log_likelihood(self, x_i, mean, variance):
        """Log-density of ``x_i`` under a normal distribution N(mean, variance).

        Works element-wise, so scalars and broadcastable arrays are accepted.
        """
        log_prob = -0.5 * np.log(2. * np.pi * variance) - \
                   (x_i - mean)**2 / (2. * variance)
        return log_prob

    def predict_log_proba(self, X):
        """Return the joint log-likelihood ``log P(class) + sum_i log p(x_i | class)``.

        Note: as in the original implementation the values are NOT normalized
        over classes; ``predict_proba`` performs the normalization.

        Returns:
            Array of shape (n_samples, n_classes), columns ordered like ``classes_``.
        """
        X = np.asarray(X, dtype=np.float64)
        n_features = X.shape[1]
        per_class = []

        for k in self.classes_:
            params = self.likelihood_params_[k]
            means = np.array([params[i][0] for i in range(n_features)])
            variances = np.array([params[i][1] for i in range(n_features)])

            # (n_samples, n_features) log-densities, summed over the features
            log_likelihood = self._gaussian_log_likelihood(X, means, variances).sum(axis=1)
            per_class.append(np.log(self.priors_[k]) + log_likelihood)

        return np.array(per_class).T

    def predict_proba(self, X):
        """Return class posterior probabilities; each row sums to 1.

        The joint log-likelihoods are sums over all features and can be so
        negative that ``exp`` underflows to 0 for every class, which would turn
        the normalization into 0/0 = NaN. Subtracting the row maximum before
        exponentiating leaves the normalized result unchanged but guarantees
        that the largest term is exp(0) = 1.
        """
        log_joint = self.predict_log_proba(X)
        shifted = log_joint - log_joint.max(axis=1, keepdims=True)
        proba_matrix = np.exp(shifted)

        return proba_matrix / proba_matrix.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return, for every row of ``X``, the class with the highest posterior.

        The argmax is taken in log space: it selects the same class as the
        argmax of the probabilities but cannot be affected by underflow.
        """
        max_class_indices = np.argmax(self.predict_log_proba(X), axis=1)

        return self.classes_[max_class_indices]

In [None]:
# Baseline: standardized Word2Vec features -> one Gaussian Naive Bayes per genre

# Standardize with statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_val_scaled = scaler.transform(X_val)

# One-vs-rest trains an independent binary classifier for every label column
base_model = CustomGaussianNB()
model = OneVsRestClassifier(base_model)
model.fit(X_train_scaled, y_train_binarized)

probability_predictions = model.predict_proba(X_val_scaled)
class_label_predictions = model.predict(X_val_scaled)

# Support-weighted F1 over all genres on the validation split
f1 = f1_score(
    y_val_binarized,
    class_label_predictions,
    average = 'weighted',
)
f1

0.3596989059402609

In [None]:
# Transformer fine-tuning section: install the Hugging Face packages used by
# the cells below (datasets, transformers, evaluate) plus sentencepiece and accelerate

! pip install datasets transformers evaluate sentencepiece accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import evaluate

# Bundle four metrics so a single .compute() call returns all of them;
# used by compute_metrics below
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   """Numerically stable logistic function 1 / (1 + exp(-x)).

   The direct formula evaluates exp(-x), which overflows (RuntimeWarning) for
   large negative x. Here exp is only ever applied to a non-positive number:
   with z = exp(-|x|), the result is 1 / (1 + z) for x >= 0 and z / (1 + z)
   for x < 0, which is the same function.

   Accepts a scalar or an array; an array input returns an array of the same
   shape, a scalar input returns a NumPy scalar.
   """
   x = np.asarray(x)
   z = np.exp(-np.abs(x))
   result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
   # [()] turns a 0-d result back into a scalar and leaves real arrays as they are
   return result[()]

def compute_metrics(eval_pred):
   """Compute accuracy, F1, precision and recall for a (logits, labels) pair.

   Logits are passed through the sigmoid and thresholded at 0.5. Predictions
   and labels are then both flattened to 1-D, so every (sample, label) cell is
   scored as one binary decision.
   """
   logits, references = eval_pred
   hard_predictions = (sigmoid(logits) > 0.5).astype(int)

   return clf_metrics.compute(
      predictions=hard_predictions.reshape(-1),
      references=references.astype(int).reshape(-1),
   )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Load a pretrained DeBERTa-v3 (BERT-style) checkpoint with a multi-label classification head

from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

model_path = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Collator built from the tokenizer; handed to the Trainer to assemble batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Genre id (as a plain int) <-> column index in the binarized label matrix
classes = list(mlb.classes_)
class2id = {int(genre_id): column for column, genre_id in enumerate(classes)}
id2class = {column: int(genre_id) for genre_id, column in class2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels = len(classes),
    id2label = id2class,
    label2id = class2id,
    problem_type = "multi_label_classification",
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize texts for BERT model

from datasets import Dataset

def tokenize_function(examples, max_length=512):
    """Tokenize the "text" column of a batch with the global `tokenizer`.

    `truncation=True` alone had no effect here: the tokenizer reported that no
    maximum length was available and fell back to no truncation. Passing an
    explicit `max_length` makes the truncation actually happen.

    Args:
        examples: batch mapping with a "text" entry (as passed by `Dataset.map`).
        max_length: maximum number of tokens kept per text. The default of 512
            is presumably the model's position limit; confirm against
            `model.config.max_position_embeddings`.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Raw overview text plus the multi-hot labels (cast to float32) for each split
train_dict = {
    'text': train['overview'].tolist(),
    'labels': y_train_binarized.astype(np.float32).tolist(),
}
val_dict = {
    'text': val['overview'].tolist(),
    'labels': y_val_binarized.astype(np.float32).tolist(),
}

hf_train_dataset = Dataset.from_dict(train_dict)
hf_val_dataset = Dataset.from_dict(val_dict)

# Tokenize in batches, then drop the raw text column once it has been tokenized
tokenized_train_dataset = hf_train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
tokenized_val_dataset = hf_val_dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Return PyTorch tensors when the datasets are indexed
for tokenized_split in (tokenized_train_dataset, tokenized_val_dataset):
    tokenized_split.set_format("torch")

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
# Train the model

training_args = TrainingArguments(
    learning_rate = 2e-5,
    per_device_train_batch_size = 3,
    per_device_eval_batch_size = 3,
    num_train_epochs = 10,
    weight_decay = 0.01,

    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best validation F1 when training finishes
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model='f1',

    # Disable external experiment loggers: otherwise Weights & Biases asks for
    # interactive input, which blocks an unattended "Run all"
    report_to = 'none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2727,0.259437,0.890903,0.581402,0.682728,0.506265
2,0.2336,0.246682,0.89875,0.626537,0.699257,0.567517
3,0.1998,0.247668,0.902986,0.65701,0.697602,0.620882
4,0.1759,0.249107,0.904097,0.658422,0.704979,0.617633
5,0.149,0.267564,0.901944,0.654091,0.692787,0.61949
6,0.1302,0.281794,0.901319,0.655181,0.686673,0.62645
7,0.1135,0.297132,0.901944,0.656448,0.690026,0.625986
8,0.1007,0.306373,0.905,0.671154,0.696259,0.647796
9,0.0893,0.319647,0.902639,0.661353,0.689673,0.635267
10,0.08,0.320597,0.903472,0.664575,0.692308,0.638979


TrainOutput(global_step=24000, training_loss=0.15766977755228678, metrics={'train_runtime': 2385.8592, 'train_samples_per_second': 30.178, 'train_steps_per_second': 10.059, 'total_flos': 1485728602198200.0, 'train_loss': 0.15766977755228678, 'epoch': 10.0})

In [None]:
# Make predictions on test data

def tokenize_unlabeled(examples, max_length=512):
    """Tokenize the 'text' column of an unlabeled batch with the global `tokenizer`.

    An explicit `max_length` is passed because `truncation=True` on its own
    does not truncate when the tokenizer has no predefined maximum length.

    Args:
        examples: batch mapping with a 'text' entry (as passed by `Dataset.map`).
        max_length: maximum number of tokens kept per text. The default of 512
            is presumably the model's position limit; confirm against
            `model.config.max_position_embeddings`.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

def get_predictions(logits, threshold=0.5):
    """Convert raw logits into 0/1 label indicators.

    The sigmoid is evaluated in a numerically stable way: exp is only applied
    to non-positive values, so very negative logits no longer trigger an
    overflow warning.

    Args:
        logits: array of raw model outputs.
        threshold: a label is predicted when its probability is strictly
            greater than this value.

    Returns:
        Integer array of the same shape as `logits` containing 0s and 1s.
    """
    logits = np.asarray(logits)
    z = np.exp(-np.abs(logits))
    # Same function as 1 / (1 + exp(-logits)), written separately per sign
    probs = np.where(logits >= 0, 1 / (1 + z), z / (1 + z))
    return (probs > threshold).astype(int)

# Tokenize the unlabeled test overviews and run the fine-tuned model on them
unlabeled_texts = test['overview'].tolist()
unlabeled_dataset = Dataset.from_dict({'text': unlabeled_texts})
tokenized_unlabeled = unlabeled_dataset.map(tokenize_unlabeled, batched=True)

raw_pred = trainer.predict(tokenized_unlabeled)

# Logits -> 0/1 indicators -> tuples of the original genre ids
binary_predictions = get_predictions(raw_pred.predictions)
predicted_labels = mlb.inverse_transform(binary_predictions)

test['genre_ids'] = [list(labels) for labels in predicted_labels]

# .copy() makes `submission` an independent DataFrame; assigning into a bare
# column selection of `test` raised pandas' SettingWithCopyWarning
submission = test[['movie_id', 'genre_ids']].copy()
# Submission format: genre ids of one movie joined by single spaces
submission['genre_ids'] = submission['genre_ids'].apply(lambda x: ' '.join(map(str, x)))

submission.to_csv('/content/drive/MyDrive/5. Fall 2025/CSC 149/Text Classification/submission.csv', index = False)
submission.head()

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['genre_ids'] = submission['genre_ids'].apply(lambda x: ' '.join(map(str, x)))


Unnamed: 0,movie_id,genre_ids
0,529,18 36 10749
1,3549,18
2,7536,27
3,5086,18 28 53 10752
4,3452,18 10749
