In [18]:
import numpy as np
import pandas as pd

def process_data(data):
    """Re-express a blue/red match frame as winner/loser columns.

    Rows where ``data['blueWin'] == 1`` take their ``winner*`` values from the
    ``blue*`` columns and their ``loser*`` values from the ``red*`` columns;
    every other row is treated the opposite way round.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``matchID``, ``fullTimeMS``, ``timePercent``, ``blueWin``
        and, for both the ``blue`` and ``red`` prefixes, ``MinionsKilled``,
        ``JungleMinionsKilled`` and each stat listed in the loop below.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the three identifying
        columns, the combined lane + jungle minion counts, and one
        ``winner``/``loser`` column pair per stat.
    """
    blue_won = data['blueWin'] == 1

    def split_by_outcome(blue_values, red_values):
        # Row-wise selection: (winner side values, loser side values).
        winner_side = np.where(blue_won, blue_values, red_values)
        loser_side = np.where(blue_won, red_values, blue_values)
        return winner_side, loser_side

    # Start from a copy of the identifying columns so the caller's frame is untouched.
    result = data[['matchID', 'fullTimeMS', 'timePercent']].copy()

    # Minions are reported as lane + jungle combined per team.
    blue_cs = data['blueMinionsKilled'] + data['blueJungleMinionsKilled']
    red_cs = data['redMinionsKilled'] + data['redJungleMinionsKilled']
    result['winnerMinionsKilled'], result['loserMinionsKilled'] = split_by_outcome(blue_cs, red_cs)

    # Remaining stats map one-to-one from blue/red onto winner/loser.
    for stat in (
        'ChampionKill',
        'DragonKill',
        'DragonElderKill',
        'RiftHeraldKill',
        'BaronKill',
        'TowerKill',
        'InhibitorKill',
        'TotalGold',
    ):
        result[f'winner{stat}'], result[f'loser{stat}'] = split_by_outcome(
            data[f'blue{stat}'], data[f'red{stat}']
        )

    return result

# Main processing: read every percentage-labelled CSV, convert it to the
# winner/loser layout with process_data(), and keep the results by label.
data_subsets = {
    '20': 'full_data_20.csv',
    '40': 'full_data_40.csv',
    '60': 'full_data_60.csv',
    '80': 'full_data_80.csv',
    '100': 'full_data_100.csv'
}

# Dictionary to store processed DataFrames, keyed by subset label
processed_datasets = {}

for subset_label, file_path in data_subsets.items():
    print(f"Processing {subset_label}% subset...")
    raw_data = pd.read_csv(file_path)
    processed_data = process_data(raw_data)

    # Add subset label as a column for reference
    processed_data['subset'] = subset_label

    # Store in dictionary
    processed_datasets[subset_label] = processed_data

# Example usage:
print(f"Available subsets: {list(processed_datasets.keys())}")
print("\n20% subset head:")
print(processed_datasets['20'].head())
print("\n100% subset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
processed_datasets['100'].info()

# If you need one combined DataFrame (rows are re-numbered 0..n-1; the
# 'subset' column records which file each row came from):
combined_df = pd.concat(processed_datasets.values(), ignore_index=True)

Processing 20% subset...
Processing 40% subset...
Processing 60% subset...
Processing 80% subset...
Processing 100% subset...
Available subsets: ['20', '40', '60', '80', '100']

20% subset head:
          matchID  fullTimeMS  timePercent  winnerMinionsKilled  \
0  BR1_2720891721     2092233            6                  109   
1  BR1_2720337066     1867984            6                  131   
2  BR1_2720218416     2396948            7                  162   
3  BR1_2720199652     1656791            5                  117   
4  BR1_2720058177     1875549            6                  119   

   loserMinionsKilled  winnerChampionKill  loserChampionKill  \
0                  87                   5                  3   
1                 114                   3                  1   
2                 130                   3                  1   
3                 111                   2                  1   
4                 120                   5                  2   

   winnerDragonKi

In [None]:
from sklearn.preprocessing import MinMaxScaler

def create_gold_labels(data_dict):
    """Label every match with the winner metric that stands out per subset.

    Subsets are visited in ascending order of ``int(key)``. The lowest-keyed
    subset is used as-is; every later subset is differenced against the one
    immediately before it, restricted to the ``matchID`` values present in
    both. The resulting values are min-max scaled column by column and each
    row is labelled with the short name of the column holding the largest
    scaled value (ties go to the metric listed first in ``label_map``).

    Parameters
    ----------
    data_dict : dict
        Maps a subset key accepted by ``int()`` (e.g. ``'20'``) to a DataFrame
        containing ``matchID`` and every ``winner*`` column named in
        ``label_map``.

    Returns
    -------
    dict
        Maps each subset key to a ``pd.Series`` named ``'gold_label'``.
        NOTE(review): the baseline series keeps the source frame's own row
        index, while later series are indexed by ``matchID`` — confirm whether
        callers rely on that difference before unifying it.
    """
    # Single source of truth: column name -> short label, in the column order
    # that np.argmax indices refer to.
    label_map = {
        'winnerChampionKill': 'Kills',
        'winnerDragonKill': 'Dragons',
        'winnerDragonElderKill': 'Elder',
        'winnerRiftHeraldKill': 'Herald',
        'winnerBaronKill': 'Baron',
        'winnerTowerKill': 'Towers',
        'winnerInhibitorKill': 'Inhibitors',
        'winnerTotalGold': 'Gold',
        'winnerMinionsKilled': 'CS'
    }
    metrics = list(label_map)
    metric_names = list(label_map.values())

    subsets = sorted(data_dict, key=int)
    gold_labels = {}

    for position, subset in enumerate(subsets):
        current_data = data_dict[subset]

        if position == 0:
            # Baseline subset: nothing precedes it, so the difference is the
            # value itself. Decided by position rather than by the literal
            # key '20', so other key sets never diff against subsets[-1].
            differences = current_data[metrics].copy()
        else:
            prev_data = data_dict[subsets[position - 1]]
            # Sorted, de-duplicated matchIDs present in both subsets.
            common_matches = np.intersect1d(current_data['matchID'], prev_data['matchID'])

            # Align both frames on the shared matches before subtracting.
            # presumably matchID is unique within each frame — TODO confirm;
            # duplicates would make .loc return extra rows.
            current_matched = current_data.set_index('matchID').loc[common_matches]
            prev_matched = prev_data.set_index('matchID').loc[common_matches]

            differences = current_matched[metrics] - prev_matched[metrics]

        # Scale each metric independently so columns with large magnitudes
        # (e.g. gold) are comparable with small counts, then pick the
        # per-row maximum.
        normalized_diffs = MinMaxScaler().fit_transform(differences)
        max_metric_indices = np.argmax(normalized_diffs, axis=1)

        gold_labels[subset] = pd.Series(
            [metric_names[idx] for idx in max_metric_indices],
            index=differences.index,
            name='gold_label'
        )

    return gold_labels

In [26]:
# Usage example: build one 'gold_label' Series per subset from the processed
# frames (requires the cell that fills `processed_datasets` to have run).
gold_labels = create_gold_labels(processed_datasets)

# Access labels for the 40% subset and show its first rows:
labels_40 = gold_labels['40']
print(labels_40.head())

matchID
BR1_2660730273      CS
BR1_2660921375      CS
BR1_2660924744    Gold
BR1_2661030804      CS
BR1_2661040994      CS
Name: gold_label, dtype: object
