# Exploratory Data Analysis & Preprocessing
## Data Preprocessing
### Importing the Dataset

In [None]:
import pandas as pd
# Let the notebook render up to 100 rows / 100 columns before truncating.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


# Read one OraclesElixir export per year (2020 through 2024) and stack them
# into a single frame with a fresh 0..n-1 index. The `url` column is read as
# a string (presumably to avoid mixed-type inference — confirm).
yearly_frames = []
for year in range(2020, 2025):
    season = pd.read_csv(
        f"data/OraclesElixir/{year}_LoL_esports_match_data_from_OraclesElixir.csv",
        dtype={"url": "str"},
    )
    yearly_frames.append(season)

df = pd.concat(yearly_frames, ignore_index=True)

rows = df.shape[0]
cols = df.shape[1]
print(f"The CSV file has {rows} rows and {cols} columns.")

print(df.columns.tolist())


### Filter for Complete Matches Only

In [None]:
# Boolean mask of rows flagged as complete; built once and reused for both
# the report and the filter.
is_complete = df["datacompleteness"] == 'complete'

total_rows = len(df)
num_complete_rows = int(is_complete.sum())
ratio = num_complete_rows / total_rows
print(f"Number of rows where datacompleteness is 'complete': {num_complete_rows}")
print(f"Ratio of 'complete' rows to total rows: {ratio:.4f}")

# Keep only the complete rows from here on.
df = df[is_complete]


### Aggregate Individual Stats to Team-Level Rows

In [None]:
# Separate the per-team summary rows from the per-player rows.
is_team_row = df['position'] == 'team'
team_rows = df[is_team_row].copy()
player_rows = df[~is_team_row]

positions = ['top', 'jng', 'mid', 'bot', 'sup']

# For each role, attach that role's champion to the matching team row
# (matched on game and side) as a new `<role>_champ` column.
for pos in positions:
    role_picks = player_rows.loc[
        player_rows['position'] == pos, ['gameid', 'side', 'champion']
    ]
    role_picks = role_picks.rename(columns={'champion': f'{pos}_champ'})
    team_rows = pd.merge(team_rows, role_picks, how='left', on=['gameid', 'side'])

df = team_rows


### Inspect Feature Correlations with the Target

In [None]:
# Correlation of selected features with the match outcome column.
outcome = df['result']

correlation_void = df['void_grubs'].corr(outcome)
print("Correlation with void:", correlation_void)

correlation_monsterkillsownjungle = df['monsterkillsownjungle'].corr(outcome)
print("Correlation with monsterkillsownjungle:", correlation_monsterkillsownjungle)

correlation_turretplates = df['turretplates'].corr(outcome)
print("Correlation with turretplates:", correlation_turretplates)

correlation_heralds = df['heralds'].corr(outcome)
print("Correlation with heralds:", correlation_heralds)

correlation_visionscore = df['visionscore'].corr(outcome)
print("Correlation with visionscore:", correlation_visionscore)

correlation_vspm = df['vspm'].corr(outcome)
print("Correlation with vspm:", correlation_vspm)

correlation_minionkills = df['minionkills'].corr(outcome)
print("Correlation with minionkills:", correlation_minionkills)

correlation_cspm = df['cspm'].corr(outcome)
print("Correlation with cspm:", correlation_cspm)

### Remove Unnecessary or Redundant Features

In [None]:
# Build the set of columns to discard, selected purely by POSITION in
# `df.columns`. The group labels below are the original author's.
# NOTE(review): positional slices silently select different columns if the
# column order of the input ever changes — confirm against the current export.
# `Index.union` de-duplicates, so overlapping slices are harmless.
columns_to_drop = (
    df.columns[1:11]  # Metadata columns
    .union(df.columns[12:18])  # Additional metadata columns
    .union(df.columns[18:28])  # BP data
    .union(df.columns[30:43])  # End game data columns
    .union(df.columns[48:57])  # Drake-related columns
    .union(df.columns[40:43])  # Individual data columns; NOTE(review): already inside the 30:43 slice above, so this adds nothing
    .union(pd.Index([df.columns[78]]))  # Specific column (damageshare)
    .union(pd.Index([df.columns[91]]))  # Specific column (earnedgoldshare)
    .union(pd.Index([df.columns[95]]))  # Specific column (total cs)
    .union(pd.Index([df.columns[28]]))  # Specific column (gamelength)
    .union(df.columns[131:161])  # Data after 20 minutes
)

# Drop in place; `columns=` already targets columns, so `axis=1` is redundant here.
df.drop(columns=columns_to_drop, axis=1, inplace=True)



### Identify Null Values

In [None]:
# Count missing values per column and show only the columns that have any.
print("Null values in each column:")
null_counts = df.isnull().sum()
null_columns = null_counts[null_counts > 0]
print(null_columns)
print("----------------------------------------------------")

# Ratio of missing values per column, relative to the CURRENT number of rows.
# `total_rows` was measured before the completeness filter and the team-level
# aggregation, so dividing by it understated every ratio.
null_ratio = null_counts / len(df)

# Keep only the columns whose null ratio is greater than 0.
null_columns_with_ratio = null_ratio[null_ratio > 0]
print("Columns with null values and their ratios:")
print(null_columns_with_ratio)


### Drop or fill null values

In [None]:
# Objective counters: a missing value is replaced with 0.
zero_fill_cols = [
    'void_grubs', 'opp_void_grubs',
    'turretplates', 'opp_turretplates',
    'heralds', 'opp_heralds',
]
for col in zero_fill_cols:
    df[col] = df[col].fillna(0)

# Rate / score features: a missing value is replaced with the column median.
median_fill_cols = ['cspm', 'vspm', 'visionscore']
for col in median_fill_cols:
    df[col] = df[col].fillna(df[col].median())

# Discard the two jungle-kill columns entirely, then drop every row that
# still has a missing value. The join keys are removed only afterwards, so
# they still take part in the null check.
df.drop(['monsterkillsownjungle', 'monsterkillsenemyjungle'], axis=1, inplace=True)
df.dropna(inplace=True)
df.drop(['gameid', 'side'], axis=1, inplace=True)


### Verify That All Missing Values Are Handled

In [None]:
# Count missing values per column and show only the columns that have any
# (expected to be empty at this point).
print("Null values in each column:")
null_counts = df.isnull().sum()
null_columns = null_counts[null_counts > 0]
print(null_columns)
print("")

# Ratio of missing values per column, relative to the CURRENT number of rows.
# `total_rows` dates from before filtering/aggregation and would understate
# any remaining ratio, so the live row count is used instead.
null_ratio = null_counts / len(df)

# Keep only the columns whose null ratio is greater than 0.
null_columns_with_ratio = null_ratio[null_ratio > 0]
print("Columns with null values and their ratios:")
print(null_columns_with_ratio)


### Convert Categorical Variables into Numerical Format

In [None]:
from sklearn.preprocessing import LabelEncoder

champ_cols = ['top_champ', 'jng_champ', 'mid_champ', 'bot_champ', 'sup_champ']

# Gather the champions seen in ANY role so that all five columns share one
# champion -> integer mapping.
champ_series = [df[name] for name in champ_cols]
all_champs = pd.concat(champ_series, axis=0).unique()

le = LabelEncoder().fit(all_champs)

# Replace each champion name with its integer code.
for col in champ_cols:
    encoded = le.transform(df[col])
    df[col] = encoded

### Preview the Dataset

In [None]:
# Concatenate the head and tail of the dataframe
head_and_tail = pd.concat([df.head(), df.tail()])

# Save to a CSV file
head_and_tail.to_csv("head_and_tail.csv", index=False)
display(pd.concat([df.head(), df.tail()]))

## EDA

### Class Imbalance

In [None]:
import matplotlib.pyplot as plt

# Absolute number of rows per outcome class.
class_counts = df['result'].value_counts()
display(class_counts)

# Same distribution as fractions of the total, drawn as a bar chart.
class_shares = df['result'].value_counts(normalize=True)
class_shares.plot.bar(title='Class Distribution')
plt.show()

### Feature Distributions 

In [None]:
display(df.describe())
print("Feature skewness: ")
skewness = df.skew()
display(skewness.sort_values(ascending=False))

# Features whose absolute skewness exceeds 1 are treated as skewed.
skewed = skewness[skewness.abs() > 1].index
axes = df.hist(bins=30, figsize=(25, 20))

# Re-title the histograms of skewed features in red so they stand out.
for ax in axes.ravel():
    title = ax.get_title()
    if title in skewed:
        ax.set_title(title, color='red')

plt.suptitle("Feature Distributions (Red = Skewed)", fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

#### Apply a Log Transform to Skewed Features

In [None]:
import numpy as np

# Skewness is computed once, and the label column is left out so the target
# can never be log-transformed (a transformed 0/1 label would no longer
# survive the later integer cast).
feature_skew = df.drop(columns=['result'], errors='ignore').skew()
skewed_features = feature_skew[feature_skew.abs() >= 1].index

# log1p is only defined for values > -1 and is applied here to columns that
# are entirely non-negative; anything else is reported and left untouched.
# NOTE(review): columns with strong NEGATIVE skew also pass the abs() filter;
# log1p does not reduce left skew — confirm this is intended.
for col in skewed_features:
    if (df[col] >= 0).all():
        df[col] = np.log1p(df[col])
    else:
        print(f"Feature {col} contains negative values, abort normalization")


In [None]:
from sklearn.preprocessing import MinMaxScaler

label_col = 'result'
binary_cols = [
    'firstdragon',
    'firstherald',
    'firstbaron',
    'firsttower',
    'firstmidtower',
    'firsttothreetowers',
]

# The label and the binary flags keep their raw values; every other column
# is rescaled by MinMaxScaler.
exclude_cols = [label_col, *binary_cols]
normalize_cols = df.columns.difference(exclude_cols)

# NOTE(review): the scaler is fit on the whole frame here, before the
# train/test split performed later in `create_dataloaders`.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[normalize_cols])
df[normalize_cols] = scaled_values

#### Feature Distribution after normalization

In [None]:
# Summary statistics and histograms of every column after the log transform
# and min-max scaling.
display(df.describe())

axes = df.hist(bins=30, figsize=(25, 20))

# This figure does not colour any subplot title, so the "(Red = Skewed)"
# legend copied from the earlier cell was misleading and has been replaced.
plt.suptitle("Feature Distributions After Normalization", fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

### Correlation with Target

In [None]:
# Mean of every feature within each outcome class.
grouped_means = df.groupby('result').mean()
display(grouped_means)

# Transpose so each feature becomes one group of bars (one bar per class).
grouped_means.T.plot.bar(figsize=(25, 20), title='Feature Mean by Result')


### Feature Correlation Matrix / Heatmap

In [None]:
import seaborn as sns

# Pairwise correlation between all columns, drawn as a diverging heatmap
# centred on zero, without per-cell annotations.
correlation_matrix = df.corr()

plt.figure(figsize=(25, 20))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, annot=False)
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

## Save the processed data to Parquet

In [None]:
# Write the preprocessed frame to Parquet; index=False leaves the row index out of the file.
df.to_parquet("data/processed_lol_data.parquet", index=False)


# Model Training


## Load Processed data

In [None]:
# Reload the frame written by the preprocessing section; presumably this lets the
# training cells run without re-executing preprocessing — confirm intended workflow.
df = pd.read_parquet("data/processed_lol_data.parquet")

## Prepare data loader

In [None]:
def create_dataloaders(df, batch_size=64, test_size=0.2, seed=42):
    """Split *df* into stratified train/test parts and wrap each in a DataLoader.

    Args:
        df: Frame holding the feature columns plus a ``result`` label column.
        batch_size: Batch size used by both loaders.
        test_size: Share of rows placed in the test split.
        seed: Random state for the split (the training loader's shuffling is
            not seeded here).

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    from sklearn.model_selection import train_test_split
    from torch.utils.data import Dataset, DataLoader
    import torch

    class LoLDataset(Dataset):
        """In-memory dataset: float32 features and int64 ``result`` labels."""

        def __init__(self, frame):
            features = frame.drop(columns=['result'])
            labels = frame['result']
            self.X = torch.tensor(features.values, dtype=torch.float32)
            self.y = torch.tensor(labels.values, dtype=torch.long)

        def __len__(self):
            return len(self.X)

        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    # Stratify on the label so both parts keep the same class proportions.
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df['result'],
    )

    train_loader = DataLoader(LoLDataset(train_df), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(LoLDataset(test_df), batch_size=batch_size, shuffle=False)
    return train_loader, test_loader


## Define DNN Model

In [None]:
import torch.nn as nn

class LoLNet(nn.Module):
    """Fully connected classifier: two ReLU hidden layers, then a linear output.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        # Layer order fixes the state-dict keys (model.0, model.2, model.4).
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits for a batch of feature vectors."""
        logits = self.model(x)
        return logits


## Define Training Loop

In [None]:
import torch
import optuna
from sklearn.metrics import accuracy_score

def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=10, device='cpu', trial=None, patience=3):
    """Train *model* with early stopping and restore its best-scoring weights.

    Each epoch runs one pass over ``train_loader``, then scores the model on
    ``test_loader``. Training stops early once ``patience`` epochs have passed
    without a new best test accuracy.

    Args:
        model: ``torch.nn.Module`` producing class logits.
        train_loader: Iterable of ``(features, labels)`` training batches.
        test_loader: Iterable of ``(features, labels)`` evaluation batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        num_epochs: Maximum number of epochs.
        device: Device the model and batches are moved to.
        trial: Accepted for interface compatibility; the Optuna pruning hook
            that used it is currently disabled.
        patience: Epochs without improvement tolerated before stopping.

    Returns:
        ``(best_acc, epoch_log)``: the best test accuracy seen, and a list of
        per-epoch dicts with keys ``epoch``, ``train_acc``, ``test_acc``,
        ``loss`` (``loss`` is the sum of the batch losses of that epoch).
    """
    model.to(device)
    best_acc = 0
    # -1 means "no improvement yet". The name used to be bound only on the
    # first improvement, so a first epoch scoring 0 raised UnboundLocalError.
    best_epoch = -1
    best_model_state = None
    epoch_log = []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        train_correct = train_seen = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            train_correct += (preds == y_batch).sum().item()
            train_seen += y_batch.size(0)

        train_acc = train_correct / train_seen if train_seen else 0.0

        # ---- evaluation pass ----
        model.eval()
        test_correct = test_seen = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                preds = torch.argmax(model(X_batch), dim=1)
                test_correct += (preds == y_batch).sum().item()
                test_seen += y_batch.size(0)

        test_acc = test_correct / test_seen if test_seen else 0.0
        print(f"üß™ Epoch {epoch+1:>2}/{num_epochs:<2} | "
            f"Loss: {total_loss:10.4f} | "
            f"Train Acc: {train_acc:7.4f} | "
            f"Test Acc: {test_acc:7.4f}")

        epoch_log.append({
            'epoch': epoch + 1,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'loss': total_loss
        })

        if test_acc > best_acc:
            best_acc = test_acc
            best_epoch = epoch
            # state_dict() returns references to the live parameters, which
            # the optimizer keeps updating in place; clone them so this
            # snapshot really is the best epoch's weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        elif epoch - best_epoch >= patience:
            print(f"‚èπÔ∏è Early stopping at epoch {epoch+1} (no improvement for {patience} epochs)")
            break

    # Leave the model holding its best weights rather than its last ones.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return best_acc, epoch_log




## Define parameter tuning function

In [None]:
import optuna
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: train one LoLNet configuration and return its accuracy.

    Samples batch size, hidden width, learning rate and epoch budget from
    *trial*, trains on the module-level ``df``, stores the per-epoch log and
    the checkpoint path as trial user attributes, and saves the trained
    weights under ``models/trials/``.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        The best test accuracy reported by ``train_model``.
    """
    import os

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256])
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    # Early-stopping patience is fixed rather than tuned.
    patience = 6
    num_epochs = trial.suggest_int("num_epochs", 10, 50)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, test_loader = create_dataloaders(df, batch_size=batch_size)
    input_dim = df.drop(columns=['result']).shape[1]

    model = LoLNet(input_dim=input_dim, hidden_dim=hidden_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # NOTE(review): the score optimised here is measured on the same test
    # split used for early stopping, so it doubles as a selection set.
    acc, epoch_log = train_model(
        model, train_loader, test_loader,
        criterion, optimizer,
        num_epochs=num_epochs, device=device,
        trial=trial, patience=patience
    )

    trial.set_user_attr("epoch_log", epoch_log)

    model_path = f"models/trials/model_trial_{trial.number}.pth"
    # Create the checkpoint directory on first use; saving used to fail after
    # a full training run whenever the folder did not already exist.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)
    trial.set_user_attr("saved_model_path", model_path)
    print(f"üíæ Model for Trial {trial.number} saved to: {model_path}")

    return acc

## Start Tuning

In [None]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import shutil

def callback(study, trial):
    """Print the finished trial's score and parameters next to the best trial so far."""
    leader = study.best_trial
    trial_part = f"‚úÖ Trial {trial.number} | Accuracy: {trial.value:.4f} | Params: {trial.params} | "
    best_part = f"üèÜ Best: Trial {leader.number} ({leader.value:.4f}) \n"
    print(trial_part + best_part)

# Search for the configuration with the highest returned accuracy, printing
# a summary line after every trial.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, callbacks=[callback])


# Copy the winning trial's checkpoint to a fixed, well-known path.
best_model_path_before = study.best_trial.user_attrs["saved_model_path"]
best_model_path_after = "models/best_model.pth"
shutil.copy(src=best_model_path_before, dst=best_model_path_after)
print(f"‚úÖ Best model copied to: {best_model_path_after}")



## Save training logs

In [None]:
import pandas as pd

import os

# One summary row per COMPLETED trial: its number, final accuracy, sampled
# hyperparameters and, when recorded, the epochs actually run and the
# checkpoint path.
trial_data = []

for trial in study.trials:
    if trial.state != optuna.trial.TrialState.COMPLETE:
        continue

    row = {
        "trial": trial.number,
        "accuracy": trial.value,
        **trial.params,
    }

    # Early stopping can end a trial before its epoch budget is used up, so
    # the length of the log is the number of epochs really trained.
    if "epoch_log" in trial.user_attrs:
        row["actual_epochs"] = len(trial.user_attrs["epoch_log"])

    if "saved_model_path" in trial.user_attrs:
        row["model_path"] = trial.user_attrs["saved_model_path"]

    trial_data.append(row)

df_trials = pd.DataFrame(trial_data)
# Make sure the output folder exists so a finished study cannot lose its
# results to a missing directory.
os.makedirs("training_log", exist_ok=True)
df_trials.to_csv("training_log/trial_results.csv", index=False)
print("‚úÖ Saved trial results to trial_results.csv")


In [None]:
import os

# One row per (completed trial, epoch): the trial number and its
# hyperparameters repeated next to that epoch's logged metrics.
epoch_logs = []

for trial in study.trials:
    if trial.state != optuna.trial.TrialState.COMPLETE:
        continue
    if "epoch_log" not in trial.user_attrs:
        continue
    for e in trial.user_attrs["epoch_log"]:
        log_row = {
            "trial": trial.number,
            **trial.params,
            **e,
        }
        epoch_logs.append(log_row)

df_epochs = pd.DataFrame(epoch_logs)
# Make sure the output folder exists before writing.
os.makedirs("training_log", exist_ok=True)
df_epochs.to_csv("training_log/epoch_logs.csv", index=False)
print("‚úÖ Saved per-epoch logs to epoch_logs.csv")


## Get Best Trial

In [None]:
# Report the number, score and sampled hyperparameters of the best trial.
best_trial = study.best_trial

summary_lines = (
    f"üèÜ Best Trial: {best_trial.number}",
    f"‚úÖ Accuracy: {best_trial.value:.4f}",
    f"üì¶ Hyperparameters: {best_trial.params}",
)
for summary_line in summary_lines:
    print(summary_line)


In [None]:
import matplotlib.pyplot as plt

# Per-epoch log of the best trial, as stored by the objective function.
best_trial = study.best_trial
epoch_log = best_trial.user_attrs["epoch_log"]

# Pull each logged metric out into its own list, in epoch order.
epochs, train_accs, test_accs, losses = (
    [entry[key] for entry in epoch_log]
    for key in ('epoch', 'train_acc', 'test_acc', 'loss')
)

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: train vs. test accuracy per epoch.
ax1.plot(epochs, train_accs, marker='o', label="Train Accuracy")
ax1.plot(epochs, test_accs, marker='o', label="Test Accuracy")
ax1.set(
    xlabel="Epoch",
    ylabel="Accuracy",
    title=f"Best Trial {best_trial.number} - Accuracy",
)
ax1.legend()
ax1.grid(True)

# Right panel: logged loss per epoch.
ax2.plot(epochs, losses, color='gray', linestyle='--', marker='x', label="Loss")
ax2.set(xlabel="Epoch", ylabel="Loss", title="Loss")
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.show()




# Local Robustness

## Checking for local robustness