In [25]:
import pandas as pd
import numpy as np


# Converts blue-red into win-lose
def process_data(data):
    """Re-express blue/red team columns as winner/loser columns.

    The winning side of each row is decided by ``data['blueWin'] == 1``:
    where that holds the blue columns feed the ``winner*`` outputs and the
    red columns feed the ``loser*`` outputs; otherwise the roles swap.

    Args:
        data: DataFrame holding ``matchID``, ``fullTimeMS``, ``timePercent``,
            ``blueWin``, lane and jungle minion counts for both sides, and a
            ``blue<Metric>`` / ``red<Metric>`` pair for every core metric.

    Returns:
        A new DataFrame (the input is not modified) with the three identifying
        columns followed by ``winnerMinionsKilled``, ``loserMinionsKilled`` and
        a ``winner<Metric>`` / ``loser<Metric>`` pair per core metric.
    """
    core_stats = (
        'ChampionKill', 'DragonKill', 'DragonElderKill', 'RiftHeraldKill',
        'BaronKill', 'TowerKill', 'InhibitorKill', 'TotalGold',
    )

    is_blue_victory = data['blueWin'] == 1

    def _select(if_blue_won, if_red_won):
        # Row-wise choice between the two candidate columns.
        return np.where(is_blue_victory, if_blue_won, if_red_won)

    # Start from a copy of the identifying columns shared by both views
    new_df = data[['matchID', 'fullTimeMS', 'timePercent']].copy()

    # Minions are reported as lane + jungle, so total them per side first
    blue_minions = data['blueMinionsKilled'] + data['blueJungleMinionsKilled']
    red_minions = data['redMinionsKilled'] + data['redJungleMinionsKilled']
    new_df['winnerMinionsKilled'] = _select(blue_minions, red_minions)
    new_df['loserMinionsKilled'] = _select(red_minions, blue_minions)

    # Every remaining metric maps one-to-one from blue/red to winner/loser
    for stat in core_stats:
        blue_col = data[f'blue{stat}']
        red_col = data[f'red{stat}']
        new_df[f'winner{stat}'] = _select(blue_col, red_col)
        new_df[f'loser{stat}'] = _select(red_col, blue_col)

    return new_df

# Main processing
# Subset label -> CSV to load. The labels are numeric strings; later cells
# sort them with int() to walk the subsets in ascending order.
data_subsets = {
    '20': 'full_data_20.csv',
    '40': 'full_data_40.csv',
    '60': 'full_data_60.csv',
    '80': 'full_data_80.csv',
    '100': 'full_data_100.csv'
}

# Winner/loser frames produced by process_data, keyed by subset label
processed_datasets = {}

for subset_label, file_path in data_subsets.items():
    print(f"Processing {subset_label}% subset")
    raw_data = pd.read_csv(file_path)
    processed_data = process_data(raw_data)

    # Tag every row with its subset so rows stay traceable after concatenation
    processed_data['subset'] = subset_label

    processed_datasets[subset_label] = processed_data

# Quick look at what was loaded
print(f"Available subsets: {list(processed_datasets.keys())}")
print("\n20% subset head:")
print(processed_datasets['20'].head())
# This banner used to say "info" although the call below prints head()
print("\n100% subset head:")
print(processed_datasets['100'].head())

# Single frame with all subsets stacked; the 'subset' column tells them apart
combined_df = pd.concat(processed_datasets.values(), ignore_index=True)

Processing 20% subset
Processing 40% subset
Processing 60% subset
Processing 80% subset
Processing 100% subset
Available subsets: ['20', '40', '60', '80', '100']

20% subset head:
          matchID  fullTimeMS  timePercent  winnerMinionsKilled  \
0  BR1_2720891721     2092233            6                  109   
1  BR1_2720337066     1867984            6                  131   
2  BR1_2720218416     2396948            7                  162   
3  BR1_2720199652     1656791            5                  117   
4  BR1_2720058177     1875549            6                  119   

   loserMinionsKilled  winnerChampionKill  loserChampionKill  \
0                  87                   5                  3   
1                 114                   3                  1   
2                 130                   3                  1   
3                 111                   2                  1   
4                 120                   5                  2   

   winnerDragonKill  loserDragon

In [26]:
from sklearn.preprocessing import MinMaxScaler

def create_gold_labels(data_dict):
    """Label each row with the winner metric that grew most within a subset.

    Subsets are visited in ascending numeric order of their keys. The first
    subset's metric values are used as-is (change relative to zero); for every
    later subset the previous subset's values are subtracted row by row. The
    resulting change matrix is passed through ``MinMaxScaler`` and each row is
    labelled with the display name of the column holding its largest scaled
    value (first column wins ties, per ``np.argmax``).

    Args:
        data_dict: Mapping of numeric-string subset labels (e.g. ``'20'``) to
            DataFrames containing the ``winner*`` metric columns listed in
            ``label_map`` below. Rows must be in the same match order in every
            subset because the subtraction is positional.

    Returns:
        Dict with the same keys as ``data_dict``; each value is a DataFrame
        sharing the input frame's index with a single ``gold_label`` column.

    Raises:
        ValueError: If two consecutive subsets both carry a ``matchID`` column
            and those columns do not match row for row.
    """
    # Column name -> display label. Insertion order also fixes the column order
    # of the matrix handed to argmax, so the metric list is derived from it.
    label_map = {
        'winnerChampionKill': 'Champion Kills',
        'winnerDragonKill': 'Dragons',
        'winnerDragonElderKill': 'Elder Dragon',
        'winnerRiftHeraldKill': 'Herald',
        'winnerBaronKill': 'Baron',
        'winnerTowerKill': 'Towers',
        'winnerInhibitorKill': 'Inhibitors',
        'winnerTotalGold': 'Gold',
        'winnerMinionsKilled': 'Minions'
    }
    metrics = list(label_map)

    subsets = sorted(data_dict.keys(), key=lambda x: int(x))
    gold_labels = {}

    for i, subset in enumerate(subsets):
        current_data = data_dict[subset]
        current_values = current_data[metrics].values

        if i == 0:
            # Smallest subset: nothing earlier to subtract. Testing the position
            # rather than the literal key '20' keeps subsets[i - 1] from wrapping
            # around to the last subset when the first key is something else.
            differences = current_values
        else:
            prev_subset = subsets[i - 1]
            prev_data = data_dict[prev_subset]

            # The subtraction below is positional, so confirm both frames list
            # the same matches in the same order whenever that can be checked.
            if 'matchID' in current_data.columns and 'matchID' in prev_data.columns:
                same_order = np.array_equal(
                    current_data['matchID'].to_numpy(),
                    prev_data['matchID'].to_numpy(),
                )
                if not same_order:
                    raise ValueError(
                        f"matchID rows of subset {subset!r} do not line up with "
                        f"subset {prev_subset!r}; align the frames before labelling"
                    )

            differences = current_values - prev_data[metrics].values

        # Scale each metric column independently, then pick the top column per row
        normalized = MinMaxScaler().fit_transform(differences)
        top_columns = np.argmax(normalized, axis=1)

        # Only the index of the input frame is carried over
        result_df = pd.DataFrame(index=current_data.index)
        result_df['gold_label'] = [label_map[metrics[col]] for col in top_columns]

        gold_labels[subset] = result_df

    return gold_labels

In [27]:
# Usage Example:
gold_labels = create_gold_labels(processed_datasets)

# Preview the first rows of the labels computed for the 40% subset
labels_40 = gold_labels['40']
labels_40_preview = labels_40.head()
print(labels_40_preview)

# Same preview for the 80% subset
labels_80 = gold_labels['80']
labels_80_preview = labels_80.head()
print(labels_80_preview)

  gold_label
0    Dragons
1    Dragons
2    Minions
3     Herald
4    Dragons
       gold_label
0          Towers
1  Champion Kills
2         Dragons
3         Dragons
4  Champion Kills


In [28]:
from sklearn.preprocessing import MinMaxScaler
# Replace every frame in processed_datasets with min/max-scaled
# winner-minus-loser differences, one '<metric>diff' column per metric.

metrics = [
    'ChampionKill',
    'DragonKill',
    'DragonElderKill',
    'RiftHeraldKill',
    'BaronKill',
    'TowerKill',
    'InhibitorKill',
    'TotalGold',
    'MinionsKilled'
]

subsets = sorted(processed_datasets.keys(), key=lambda x: int(x))
for subset in subsets:
    current_data = processed_datasets[subset]

    # Re-run guard: this cell overwrites processed_datasets[subset], so on a
    # second execution the winner*/loser* columns are gone and only the
    # '<metric>diff' columns remain. Such frames are already converted; skip
    # them instead of failing with a KeyError.
    if all(f'{metric}diff' in current_data.columns for metric in metrics):
        continue

    # Winner-minus-loser difference per metric. Only the row index of the
    # source frame is carried over; matchID and the other columns are dropped.
    diff_df = pd.DataFrame(
        {
            f'{metric}diff': current_data[f'winner{metric}'] - current_data[f'loser{metric}']
            for metric in metrics
        },
        index=current_data.index,
    )

    # Scale each difference column on its own, with a fresh scaler per subset
    scaler = MinMaxScaler()
    result_df = pd.DataFrame(
        scaler.fit_transform(diff_df),
        index=diff_df.index,
        columns=diff_df.columns,
    )

    # Set new dataframe in place of old
    processed_datasets[subset] = result_df







In [None]:
# # add gold label data into the dataframe
# for i, subset in enumerate(subsets):
#     current_data = processed_datasets[subset]
#     # print(current_data.columns)
#     gold_label = gold_labels[subset]

#     new_df = pd.concat([current_data, gold_label], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
# Split every subset's feature frame and gold labels into train/test parts.
# The splits are collected in their own dict, keyed by subset label, so that
# processed_datasets keeps its frames and this cell can be re-run safely.
train_test_splits = {}
for subset in subsets:
    current_data = processed_datasets[subset]
    current_gold = gold_labels[subset]

    # Plain arrays for scikit-learn: a 2-D feature matrix and a 1-D label vector
    data_array = np.asarray(current_data)
    gold_array = np.asarray(current_gold['gold_label'])

    # Fixed random_state keeps the 80/20 split reproducible between runs
    X_train, X_test, y_train, y_test = train_test_split(
        data_array, gold_array, test_size=.2, random_state=42
    )

    # Key by the current label; the list `subsets` itself is unhashable
    train_test_splits[subset] = [X_train, X_test, y_train, y_test]






