# Udemy DS Algorithm Exercise

## PART A : Course Classification

I'm going to approach this problem in two ways:

1. Traditional Machine Learning Approach
2. Transformer Based Model

## Transformer Based Model

In [None]:
import inspect
import os

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Load the raw exercise data into a DataFrame.
# The CSV location can be overridden with the UDEMY_DS_CSV environment variable so the
# notebook is not tied to a single machine; when the variable is unset, the original
# local path is used.
DATA_PATH = os.environ.get(
    "UDEMY_DS_CSV",
    "/Users/sharukh/Documents/Code_Udemy/udemy_ds_algos_exercise (1).csv",
)
data = pd.read_csv(DATA_PATH)

In [None]:
# Preprocess the data: standardise the text column name and encode labels as integer ids.
data = data.rename(columns={"course_section_lecture_title": "text"})

# Keep the original label values in their own column and always derive the ids from it.
# Without this, re-running the cell would rebuild the mappings from the already-encoded
# integer ids and lose the real label names in `id2label`.
if "label_name" not in data.columns:
    data["label_name"] = data["label"]

# Ids are assigned in order of first appearance in the data.
labels = data["label_name"].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
data["label"] = data["label_name"].map(label2id)

In [None]:
# Split the data: 20% held out for testing, fixed seed so the split is reproducible.
# NOTE(review): the split is not stratified; consider stratify=data['label'] if the
# classes turn out to be imbalanced.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face Datasets.
# preserve_index=False: after the shuffled split the frames carry a non-default index,
# which from_pandas would otherwise store as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels),
    # Record the label names in the model config so saved checkpoints map class ids
    # back to real labels. Names are coerced to str to keep the config
    # JSON-serialisable even if the labels are not plain Python strings.
    id2label={idx: str(label) for idx, label in id2label.items()},
    label2id={str(label): idx for label, idx in label2id.items()},
)

# Tokenize the text
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Passes ``examples['text']`` to the module-level ``tokenizer`` with
    ``padding='max_length'`` and ``truncation=True`` and returns the
    tokenizer's output unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

In [None]:
# Tokenize both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Set format for PyTorch: expose only the columns listed below, as torch tensors
model_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type='torch', columns=model_columns)


In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`; older
# ones only know the original name. Probe the installed signature so this cell runs on
# either, instead of failing with an unexpected-keyword error.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate on the eval dataset at the end of every epoch.
    **{_eval_strategy_key: 'epoch'},
)

# Same idea for Trainer: newer releases take the tokenizer as `processing_class`,
# older ones as `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Fine-tune the model
trainer.train()

# Run the trained model on the held-out split and take the highest-scoring class per row
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
preds = logits.argmax(-1)

# Translate integer class ids back to label names for predictions and ground truth
pred_labels = [id2label[class_id] for class_id in preds]
true_labels = [id2label[class_id] for class_id in test_df['label']]