In [None]:
# from tqdm import tqdm
from tqdm.notebook import tqdm

import json 
import pandas as pd
import polars as pl
import unicodedata

import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt

In [None]:
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing embedding vectors with one label each.

    Args:
        embeddings: a tensor, a numpy array, a (nested) Python sequence of
            numbers, or a list/tuple of equally shaped tensors / numpy arrays.
            Stored as a float32 tensor in ``self.X``.
        labels: one label per embedding, in any of the forms accepted for
            ``embeddings``. Stored in ``self.y``.
        label_dtype: dtype used for ``self.y``. Defaults to ``torch.float16``
            to stay compatible with existing callers; pass ``torch.long`` for
            class-index targets.

    Raises:
        ValueError: if ``embeddings`` and ``labels`` do not contain the same
            number of items.
    """

    def __init__(self, embeddings, labels, label_dtype=torch.float16):
        self.X = self._to_tensor(embeddings, torch.float32)
        self.y = self._to_tensor(labels, label_dtype)
        if (
            self.X.dim() == 0
            or self.y.dim() == 0
            or self.X.shape[0] != self.y.shape[0]
        ):
            raise ValueError(
                "embeddings and labels must contain the same number of items, "
                f"got shapes {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    @staticmethod
    def _to_tensor(data, dtype):
        """Return a new tensor of ``dtype`` holding a copy of ``data``."""
        if isinstance(data, torch.Tensor):
            # torch.tensor(tensor) warns; an explicit detached copy does not.
            return data.detach().to(dtype=dtype, copy=True)
        if isinstance(data, (list, tuple)) and len(data) > 0:
            if all(isinstance(item, torch.Tensor) for item in data):
                # torch.tensor() rejects lists of multi-element tensors.
                return torch.stack([item.detach() for item in data]).to(dtype)
            if all(isinstance(item, np.ndarray) for item in data):
                # Stack first: torch.tensor() on a list of ndarrays is very slow.
                return torch.tensor(np.stack(data), dtype=dtype)
        return torch.tensor(data, dtype=dtype)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
# Validation split. The file is used below as a dataset object with a `.y`
# attribute, so it is presumably a fully pickled object rather than a bare
# tensor / state dict. PyTorch >= 2.6 defaults `torch.load` to
# `weights_only=True`, which refuses such objects, hence the explicit flag.
# SECURITY: `weights_only=False` unpickles arbitrary code; only load trusted files.
val = torch.load("../data/2_training_ready/mymethod/take00/val.pt", weights_only=False)

In [None]:
# Test split. The file is wrapped in a DataLoader and its `.y` attribute is
# read below, so it is presumably a fully pickled dataset object. PyTorch >= 2.6
# defaults `torch.load` to `weights_only=True`, which refuses such objects,
# hence the explicit flag.
# SECURITY: `weights_only=False` unpickles arbitrary code; only load trusted files.
testing = torch.load("../data/2_training_ready/mymethod/take00/testing.pt", weights_only=False)


In [None]:
# Training shards. Their `.X` / `.y` attributes are read when the shards are
# combined, so each file is presumably a fully pickled dataset object.
# PyTorch >= 2.6 defaults `torch.load` to `weights_only=True`, which refuses
# such objects, hence the explicit flag.
# SECURITY: `weights_only=False` unpickles arbitrary code; only load trusted files.
training1 = torch.load("../data/2_training_ready/mymethod/take00/training1.pt", weights_only=False)
training2 = torch.load("../data/2_training_ready/mymethod/take00/training2.pt", weights_only=False)
training3 = torch.load("../data/2_training_ready/mymethod/take00/training3.pt", weights_only=False)
# training4 = torch.load("../data/2_training_ready/mymethod/take00/training4.pt", weights_only=False)

In [None]:
# # Combine the embeddings and labels
# combined_embeddings = torch.cat([training1.X, training2.X, training3.X,training4.X], dim=0)
# combined_labels = torch.cat([training1.y, training2.y, training3.y,training4.y], dim=0)
# 
# # Create a new EmbeddingDataset with the combined data
# training = EmbeddingDataset(combined_embeddings.numpy(), combined_labels.numpy())

In [None]:
# Concatenate the loaded shards along the sample axis (dim 0)
training_shards = (training1, training2, training3)
combined_embeddings = torch.cat([shard.X for shard in training_shards], dim=0)
combined_labels = torch.cat([shard.y for shard in training_shards], dim=0)

# Wrap the merged arrays in a single EmbeddingDataset used for training
training = EmbeddingDataset(
    combined_embeddings.numpy(),
    combined_labels.numpy(),
)

In [None]:
# Original (first) version of SimpleNN, kept for reference
# class SimpleNN(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(SimpleNN, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(input_dim, hidden_dim),
#             # nn.ReLU(),
#             # nn.Linear(hidden_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Linear(hidden_dim, output_dim)
#         )
# 
#     def forward(self, x):
#         return self.model(x)

In [None]:
# class SimpleNN(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(SimpleNN, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(input_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Dropout(0.1),
#             nn.Linear(hidden_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Dropout(0.1),
#             nn.Linear(hidden_dim, output_dim)
#         )
# 
#     def forward(self, x):
#         return self.model(x)

In [None]:
class SimpleNN(nn.Module):
    """Feed-forward classifier: two normalised hidden stages and a linear head.

    Each hidden stage is Linear -> LayerNorm -> LeakyReLU -> Dropout. The
    final Linear maps ``hidden_dim`` features to ``output_dim`` raw scores;
    no softmax is applied inside the module.
    """

    def __init__(self, input_dim=4096, hidden_dim=2048, output_dim=5, dropout=0.1):
        super().__init__()

        stages = []
        fan_in = input_dim
        # Two identical hidden stages; only the first one sees `input_dim`.
        for _ in range(2):
            stages.extend([
                nn.Linear(fan_in, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.LeakyReLU(),
                nn.Dropout(dropout),
            ])
            fan_in = hidden_dim
        stages.append(nn.Linear(hidden_dim, output_dim))

        # Same layer order (and therefore same state_dict keys) as a literal
        # nine-element nn.Sequential.
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        logits = self.model(x)
        return logits


In [None]:
# Mini-batch loaders: the training split is reshuffled each epoch, the test
# split is read in a fixed order.
batch_size = 256
train_loader = DataLoader(dataset=training, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=testing, batch_size=batch_size, shuffle=False)


In [None]:
# Cast every split's labels to int64. Values are left untouched, so the star
# ratings stay 1-based; the shift to 0-based class indices happens at the
# point of use (loss / accuracy computation).
training.y = training.y.to(torch.long)
testing.y = testing.y.to(torch.long)
val.y = val.y.to(torch.long)

In [None]:
# Define model, loss function, and optimizer
input_dim = training.X.shape[1]  # Number of features in the embeddings
hidden_dim = 2048
# Labels are 1-based class ids (the training loop feeds `y - 1` to the loss),
# so the head needs one logit per id up to the largest label. Counting unique
# values would under-size the head whenever an id is absent from the training
# split, making the loss index out of range.
output_dim = int(training.y.max().item())  # Number of classes

# Use the GPU when present, otherwise fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNN(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()

# Earlier optimiser / scheduler settings, kept for reference:
# optimizer = optim.Adam(model.parameters(), lr=1e-5)
# optimizer = optim.AdamW(model.parameters(), lr=1e-5)
# optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-5)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
# The learning rate is multiplied by `gamma` on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.631)

In [None]:
# Training and evaluation loops
def evaluate(model, data_loader, device=None):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    The predicted class is the argmax over the model outputs, shifted by +1
    before comparison because the labels yielded by the loader are 1-based.

    Args:
        model: ``nn.Module`` producing one score per class for each sample.
        data_loader: iterable yielding ``(X, y)`` batches.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (or CUDA if available, else CPU, for
            a parameter-free model).

    Returns:
        Fraction of correctly classified samples as a float in [0, 1], or
        ``nan`` when ``data_loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Remember the caller's mode: evaluating in the middle of an epoch must
    # not leave Dropout disabled for the training steps that follow.
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for X, y in data_loader:
                X = X.to(device)
                y = y.to(device)
                outputs = model(X)
                _, predicted = torch.max(outputs, 1)
                correct += (predicted + 1 == y).sum().item()  # Adjust prediction for 1-based indexing
                total += y.size(0)
    finally:
        model.train(was_training)

    if total == 0:
        # Accuracy is undefined without samples; avoid a ZeroDivisionError.
        return float("nan")
    return correct / total


In [None]:
# Training schedule
num_epochs = 10      # number of passes over train_loader
log_interval = 1000  # evaluate on test_loader every this many optimisation steps

In [None]:
# Global optimisation-step counter; it keeps counting across epochs.
step = 0

train_losses = []     # mean training loss, one entry per epoch
eval_accuracies = []  # (step, accuracy) pairs, one every `log_interval` steps
learning_rates = []   # (epoch, lr) pairs, recorded at the start of each epoch

# Move batches to wherever the model lives rather than hard-coding "cuda".
device = next(model.parameters()).device

for epoch in range(num_epochs):
    current_lr = optimizer.param_groups[0]['lr']
    learning_rates.append((epoch, current_lr))

    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    running_loss = 0.0

    for X, y in pbar:
        X = X.to(device)
        y = y.to(device)

        # Forward pass
        outputs = model(X)
        loss = criterion(outputs, y - 1)  # Shift labels for 0-based indexing during training

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()  # read once; each .item() forces a device sync
        running_loss += batch_loss
        step += 1

        # Log progress every log_interval steps
        if step % log_interval == 0:
            eval_accuracy = evaluate(model, test_loader)
            # evaluate() puts the model in eval mode. Switch back, otherwise
            # the rest of the epoch would train with Dropout disabled.
            model.train()
            eval_accuracies.append((step, eval_accuracy))
            print(f"Step {step}, Loss: {batch_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}")

    # max(..., 1) guards against an empty loader (no batches -> loss 0.0).
    train_losses.append(running_loss / max(len(train_loader), 1))
    scheduler.step()



In [None]:
# Plot training loss (per epoch) and evaluation accuracy (per logged step)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

loss_ax.plot(train_losses, label="Training Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Training Loss")

acc_ax.set_xlabel("Step")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Evaluation Accuracy")
if eval_accuracies:
    steps, accuracies = zip(*eval_accuracies)
    acc_ax.plot(steps, accuracies, label="Evaluation Accuracy")
    acc_ax.legend()
else:
    # Fewer than `log_interval` steps were run, so nothing was logged;
    # unpacking zip(*[]) would raise ValueError.
    acc_ax.text(
        0.5, 0.5, "No evaluation points logged",
        ha="center", va="center", transform=acc_ax.transAxes,
    )

fig.tight_layout()
plt.show()


In [None]:
# Learning rate recorded at the start of each epoch, on a log-scaled y axis
epochs, lrs = zip(*learning_rates)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs, lrs, 'b-')
ax.set_yscale('log')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.set_title('Learning Rate Schedule')
ax.grid(True)
plt.show()

In [None]:
# Score the trained model once on the validation split (fixed batch order)
val_loader = DataLoader(dataset=val, shuffle=False, batch_size=batch_size)
val_accuracy = evaluate(model=model, data_loader=val_loader)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

In [None]:
# import torch
# from torch.utils.data import DataLoader
# import matplotlib.pyplot as plt
# 
# # Gather all embeddings from the testing dataset into a single tensor
# all_embeddings = []
# for i in range(len(testing)):
#     X, y = testing[i]  # X is embeddings, y is target
#     # Ensure X is a tensor of shape [embedding_dim]
#     # If not, you might need to reshape or extract the embedding part
#     all_embeddings.append(X)
# 
# test_embeddings = torch.stack(all_embeddings, dim=0)  # Shape: (N, D)
# 
# # Compute per-dimension means and variances
# dimension_means = test_embeddings.mean(dim=0)
# dimension_vars = test_embeddings.var(dim=0, unbiased=False)
# 
# print("Mean of means:", dimension_means.mean().item())
# print("Mean of variances:", dimension_vars.mean().item())
# 
# plt.figure(figsize=(10,4))
# plt.subplot(1,2,1)
# plt.hist(dimension_means.cpu().numpy(), bins=50, color='skyblue', edgecolor='black')
# plt.title("Distribution of Per-Dimension Means")
# plt.xlabel("Mean Value")
# plt.ylabel("Frequency")
# 
# plt.subplot(1,2,2)
# plt.hist(dimension_vars.cpu().numpy(), bins=50, color='lightgreen', edgecolor='black')
# plt.title("Distribution of Per-Dimension Variances")
# plt.xlabel("Variance Value")
# plt.ylabel("Frequency")
# 
# plt.tight_layout()
# plt.show()
