# 2025 GENAI-ML-HW5
# Regression
In this assignment, you will use linear regression and a multilayer perceptron (MLP) to predict the Metacritic scores of games on Steam, the well-known game platform. For more information, please check the homework slides.
HW5 Slide Link :

https://docs.google.com/presentation/d/1ysys__L1HKLPV2LX0u-KMP0LD1XamhNCY_hq29k-I0A/edit?usp=sharing



## Check GPU Status

In [None]:
# Shell escape: print the NVIDIA driver/GPU status table for this runtime.
!nvidia-smi

Mon Nov 10 16:02:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   75C    P0             30W /   70W |    2184MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Import Modules


In [None]:
# ===== Import libraries =====
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset, TensorDataset



# Set Random Seed

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed NumPy, PyTorch (CPU), Python's `random`, and all CUDA devices, in that order.
for seed_fn in (np.random.seed, torch.manual_seed, random.seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("‚úÖ Device:", device)

‚úÖ Device: cuda


# Download Dataset

In [None]:
!git clone https://github.com/murphy-cthsu/GENAI-ML-2025-HW5-Data.git
!mv GENAI-ML-2025-HW5-Data/*.csv .



fatal: destination path 'GENAI-ML-2025-HW5-Data' already exists and is not an empty directory.
mv: cannot stat 'GENAI-ML-2025-HW5-Data/*.csv': No such file or directory


# Preview Training Data

In [None]:
# ===== ËºâÂÖ•Ë®ìÁ∑¥Ë≥áÊñôÈõÜ =====
train_df = pd.read_csv("train.csv")

# Preview the dataset: print every column of one row, truncating long text.
row_index = 0
row = train_df.iloc[row_index]
for col, val in row.items():
    is_long_text = isinstance(val, str) and len(val) > 100
    shown = f"{val[:100]}..." if is_long_text else val  # keep at most 100 characters
    print(f"{col:25}: {shown}")

# Numeric and text features: list the numeric columns (excluding the target)
# and the object-typed (text) columns.
numeric_columns = train_df.select_dtypes(include=['number']).columns
numeric_features = [c for c in numeric_columns if c != 'metacritic_score']
print("All available numeric features :", numeric_features)
text_columns_available = train_df.select_dtypes(include=['object']).columns.tolist()
print("All available text features :", text_columns_available)

appid                    : 249130
name                     : LEGO¬Æ Marvel‚Ñ¢ Super Heroes
release_date             : 1382400000
required_age             : 0
price                    : 19.99
dlc_count                : 2
detailed_description     : LEGO¬Æ Marvel‚Ñ¢ Super Heroes features an original story crossing the entire Marvel Universe. Players t...
about_the_game           : LEGO¬Æ Marvel‚Ñ¢ Super Heroes features an original story crossing the entire Marvel Universe. Players t...
short_description        : LEGO¬Æ Marvel‚Ñ¢ Super Heroes features an original story crossing the entire Marvel Universe. Players t...
reviews                  : nan
windows                  : 1
mac                      : 0
linux                    : 0
achievements             : 45
recommendations          : 18110
positive                 : 24279
negative                 : 1196
estimated_owners         : 1500000
average_playtime_forever : 0
average_playtime_2weeks  : 0
median_playtime_forever  : 0
median_pla

# Load Data

In [None]:


# TODO(Boss baseline) - Feature Selection: choose suitable features for training; the candidate features are the columns of the CSV file.
# To check all available numeric features, uncomment the line below :
# print("All available numeric features :", train_df.select_dtypes(include=['number']).columns.tolist())
# ===== Load data =====
train_df = pd.read_csv("train.csv")

# Rows without a target cannot supervise a regression. Filling the label with 0
# would teach the model a fabricated score, so such rows are dropped instead.
n_rows_loaded = len(train_df)
train_df = train_df.dropna(subset=['metacritic_score']).reset_index(drop=True)
if len(train_df) < n_rows_loaded:
    print("Dropped rows with missing metacritic_score:", n_rows_loaded - len(train_df))

# ===== Feature Selection =====
requested_features = [
    'recommendations', 'positive', 'negative', 'price', 'median_playtime_2weeks',
    'required_age', 'linux', 'mac',
    'owners', 'average_playtime',  # <--- assumed to be the two that were missing
    'is_top_20', 'has_website'
]

# Build derived features.
# NOTE: these columns are not listed in `requested_features`, so they are not fed
# to the model as-is. To use them, add their names to the list above AND compute
# them the same way on the test frame before inference.
train_df['positive_ratio'] = train_df['positive'] / (train_df['positive'] + train_df['negative'] + 1)
train_df['price_per_hour'] = train_df['price'] / (train_df['average_playtime_forever'] + 1)

# Keep only the requested features that exist in the frame, and report the rest
# so a typo in a feature name does not go unnoticed.
numeric_features = [f for f in requested_features if f in train_df.columns]
skipped_features = [f for f in requested_features if f not in train_df.columns]
if skipped_features:
    print("Requested features not found in train.csv (skipped):", skipped_features)

print("‚úÖ Numeric features used:", numeric_features)

X_numeric = train_df[numeric_features].fillna(0).values
y = train_df['metacritic_score'].values.reshape(-1, 1)  # column vector, as StandardScaler expects 2-D input

# ===== Use Text Embeddings =====
text_columns = [c for c in ['reviews', 'short_description'] if c in train_df.columns]
print("‚úÖ Text columns used:", text_columns)

embedder = SentenceTransformer('all-mpnet-base-v2')

def extract_embeddings(df, col):
    """Encode the text column `col` of `df` with the module-level `embedder`.

    Missing values are replaced by empty strings and every value is cast to
    `str` before encoding. Returns the embeddings as a NumPy array.
    """
    texts = df[col].fillna("").astype(str).tolist()
    emb = embedder.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    return emb

def reduce_dim_with_pca(embeddings, n_components=100):
    """Fit a PCA on `embeddings` and project them onto its components.

    `n_components` is capped at min(n_samples, n_features) because PCA cannot
    produce more components than that. The PCA is seeded with `SEED` so the
    projection does not depend on the global RNG state.

    Returns:
        (reduced, pca): the projected embeddings and the fitted PCA object,
        which is needed later to transform the test embeddings.
    """
    n_components = min(n_components, embeddings.shape[0], embeddings.shape[1])
    pca = PCA(n_components=n_components, random_state=SEED)
    reduced = pca.fit_transform(embeddings)
    return reduced, pca

# One PCA per text column; the fitted models are kept (keyed by column name)
# for reuse at inference time.
embeddings_reduced = []
pca_models = {}
for col in text_columns:
    emb = extract_embeddings(train_df, col)
    emb_reduced, pca_model = reduce_dim_with_pca(emb, n_components=100)
    embeddings_reduced.append(emb_reduced)
    pca_models[col] = pca_model

# Final design matrix layout: [numeric features | reduced text embeddings per column].
X_text_reduced = np.concatenate(embeddings_reduced, axis=1)
X = np.concatenate([X_numeric, X_text_reduced], axis=1)

# ===== Normalization =====
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(y)

# ===== Train / Dev Split =====
X_train, X_dev, y_train, y_dev = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=SEED)


‚úÖ Numeric features used: ['recommendations', 'positive', 'negative', 'price', 'median_playtime_2weeks', 'required_age', 'linux', 'mac', 'has_website']
‚úÖ Text columns used: ['reviews', 'short_description']


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

# Dataset Class


In [None]:
# ===== Ë≥áÊñôÈõÜ =====
class SteamDataset(Dataset):
    """Map-style dataset pairing feature rows with regression targets.

    Both inputs are converted to float32 tensors once, at construction time.

    Args:
        features: array-like of feature rows.
        targets: array-like of targets, one entry per feature row.

    Raises:
        ValueError: if `features` and `targets` have different lengths.
    """

    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        # Fail early: a length mismatch would otherwise surface only as an
        # IndexError (or silently ignored rows) during iteration.
        if len(self.features) != len(self.targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(self.features)} and {len(self.targets)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        return self.features[idx], self.targets[idx]

# Model Class

In [None]:
# ===== Strong baseline - Model Architecture =====
class LinearModel(nn.Module):
    """Linear regression: a single affine layer mapping `input_dim` features to one output."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the affine layer to `x` and return the result."""
        prediction = self.fc(x)
        return prediction

class MLPModel(nn.Module):
    """Three-hidden-layer MLP (512 -> 256 -> 128 -> 1) with a projected skip connection.

    Args:
        input_dim: number of input features.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        # Extra projection (512 -> 256) so the skip path matches fc2's output width.
        self.shortcut = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.out = nn.Linear(128, 1)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Run the forward pass and return the output of the final linear layer."""
        hidden1 = self.dropout(self.act(self.fc1(x)))
        hidden2 = self.dropout(self.act(self.fc2(hidden1)))
        # Skip connection: second hidden output plus half of the projected first hidden output.
        merged = hidden2 + 0.5 * self.shortcut(hidden1)
        hidden3 = self.dropout(self.act(self.fc3(merged)))
        return self.out(hidden3)


# Hyperparameter


In [None]:
# ===== Medium/Strong baseline - Training Hyperparameters =====
n_epochs = 7000             # upper bound on the number of training epochs
learning_rate = 1e-3        # initial learning rate handed to the optimizer
batch_size = 64             # mini-batch size used by the train and dev DataLoaders
weight_decay = 1e-4         # weight decay handed to the optimizer
shuffle_data = True         # whether the training DataLoader shuffles its data
early_stop_patience = 500   # epochs without dev-loss improvement before stopping

# NOTE: the optimizer and LR scheduler are intentionally not built in this cell.
# They need `model.parameters()`, and the model is only instantiated in the
# training cell; building them here raised NameError on a fresh kernel, and the
# objects were overwritten by the training cell anyway.



# Train

In [None]:

# ===== Training =====
# Wrap the train/dev arrays as float32 tensor datasets.
train_dataset = TensorDataset(*(torch.tensor(arr, dtype=torch.float32) for arr in (X_train, y_train)))
dev_dataset = TensorDataset(*(torch.tensor(arr, dtype=torch.float32) for arr in (X_dev, y_dev)))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_data)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# Model, loss, optimizer, and a step scheduler that halves the LR every 1000 epochs.
model = MLPModel(input_dim=X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.5)

best_loss = float('inf')  # lowest dev loss seen so far
no_improve = 0            # consecutive epochs without a new best dev loss

for epoch in range(n_epochs):
    # ---- one optimisation pass over the training set ----
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch average is per-sample.
        train_loss += loss.item() * batch_x.size(0)
    train_loss /= len(train_loader.dataset)

    # ---- evaluation on the dev set (no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in dev_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            val_loss += criterion(model(batch_x), batch_y).item() * batch_x.size(0)
    val_loss /= len(dev_loader.dataset)

    scheduler.step()

    # ---- checkpoint on improvement, otherwise count towards early stopping ----
    improved = val_loss < best_loss
    if improved:
        best_loss = val_loss
        no_improve = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        no_improve += 1

    if not improved and no_improve >= early_stop_patience:
        print(f"Early stopped at epoch {epoch}")
        break

    # Progress line every 200 epochs.
    if epoch % 200 == 0:
        print(f"Epoch {epoch:4d} | Train loss: {train_loss:.5f} | Val loss: {val_loss:.5f}")


Epoch    0 | Train loss: 0.80551 | Val loss: 0.73111
Epoch  200 | Train loss: 0.03499 | Val loss: 0.55001
Epoch  400 | Train loss: 0.02790 | Val loss: 0.55317
Early stopped at epoch 557


# Inference on Test Dataset

In [None]:
# ===== Inference =====
# Restore the best checkpoint written by the training loop. `map_location`
# lets a checkpoint saved on a GPU runtime load on a CPU-only runtime too.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

test_df = pd.read_csv("test.csv")

# The numeric block must have exactly the columns (and order) used at training
# time. Re-filtering `numeric_features` by the test columns silently narrows the
# matrix and makes `scaler.transform` fail with a feature-count mismatch, so the
# training feature list is left untouched and the test frame is aligned to it.

# Engineered columns that exist only after feature engineering: rebuild them
# here when the model was trained on them.
derived_feature_builders = {
    'positive_ratio': lambda df: df['positive'] / (df['positive'] + df['negative'] + 1),
    'price_per_hour': lambda df: df['price'] / (df['average_playtime_forever'] + 1),
}
for feature_name, build in derived_feature_builders.items():
    if feature_name in numeric_features and feature_name not in test_df.columns:
        test_df[feature_name] = build(test_df)

missing_features = [f for f in numeric_features if f not in test_df.columns]
if missing_features:
    print("WARNING: training features missing from test.csv, filled with 0:", missing_features)
# `reindex` keeps the training column order; absent columns become NaN and are
# then filled with 0, the same fill value used for missing entries in training.
X_test_num = test_df.reindex(columns=numeric_features).fillna(0).values

# Embedding part: encode each text column and project it with the PCA that was
# fitted on the same column of the training data, in the training column order.
text_blocks = []
for col in text_columns:
    col_embed = embedder.encode(
        test_df[col].fillna("").astype(str).tolist(),
        batch_size=64, show_progress_bar=True, convert_to_numpy=True
    )
    text_blocks.append(pca_models[col].transform(col_embed))
X_test_combined = np.hstack([X_test_num, *text_blocks])

# Fail with an actionable message if the layout still differs from training.
if X_test_combined.shape[1] != scaler.n_features_in_:
    raise ValueError(
        f"Test matrix has {X_test_combined.shape[1]} features but the scaler was "
        f"fitted on {scaler.n_features_in_}; re-run the data-loading cell so that "
        f"numeric_features, pca_models and scaler come from the same run."
    )

# Scaling
X_test_scaled = scaler.transform(X_test_combined)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Prediction: the model outputs standardized scores, so map them back to the
# original scale with the target scaler.
with torch.no_grad():
    preds_scaled = model(X_test_tensor).cpu().numpy()
# reshape(-1) always yields a 1-D array, even for a single-row test set.
preds = target_scaler.inverse_transform(preds_scaled).reshape(-1)

# Round to integers and clamp to the 0-100 range.
preds = np.clip(np.round(preds), 0, 100).astype(int)


submission = pd.DataFrame({
    "appid": test_df["appid"],
    "metacritic_score": preds
})
submission.to_csv("submission.csv", index=False)
print("‚úÖ Output saved to submission.csv")

# Trigger a browser download when running inside Google Colab; elsewhere the
# file is simply left in the working directory.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of submission.csv")
else:
    files.download("submission.csv")

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

ValueError: X has 210 features, but StandardScaler is expecting 212 features as input.