# Phase 2 & 3: Feature Engineering and Model Training

In this notebook, we will:
1. Load the raw match data we collected.
2. Convert champion names into numerical features (One-Hot Encoding).
3. Train a Machine Learning model to predict the winner.
4. Test the accuracy of our model.

In [1]:
%pip install pandas scikit-learn numpy seaborn matplotlib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 1.1 MB/s eta 0:00:01
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting matplotlib
  Downloading matplotlib-3.7.5-cp38-cp38-macosx_10_12_x86_64.whl (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 12.3 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.5.0
  Downloading scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl (35.0 MB)
[K     |████████████████████████████████| 35.0 MB 31.0 MB/s eta 0:00:01
[?25hCollecting joblib>=1.1.1
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 32.3 MB/s eta 0:00:01
[?25hCollecting pillow>=6.2.0
  Downloading pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 76.2 MB/s eta 0:00:

In [2]:
# 1. Load the dataset
# Read the raw match export: one row per match, holding the ten picks and the winner.
df = pd.read_csv("league_matches_raw.csv")

# Report how many rows were read, then preview the first few of them.
print(f"Loaded {df.shape[0]} matches.")
display(df.head())

Loaded 171 matches.


Unnamed: 0,blue_player_1,blue_player_2,blue_player_3,blue_player_4,blue_player_5,red_player_1,red_player_2,red_player_3,red_player_4,red_player_5,winner
0,Jax,Elise,Akshan,AurelionSol,Maokai,Heimerdinger,Naafiri,Irelia,Ezreal,Fiora,red
1,Teemo,Naafiri,Ryze,Yunara,Alistar,Kennen,Malphite,Zed,Kaisa,Leona,red
2,Kennen,Talon,Quinn,Kaisa,Pyke,Vayne,Qiyana,Ekko,Lucian,Rakan,blue
3,Ambessa,Nidalee,Irelia,Mel,Rell,KSante,Elise,Hwei,Smolder,Bard,blue
4,Shen,Viego,Syndra,Yunara,Nautilus,Ambessa,Zed,Zoe,Ashe,Bard,blue


In [3]:
# 2. Get a list of all unique champions in our dataset
# We need to know every possible champion name to create columns for them.
# Only the pick columns are scanned, so values from any other column (the
# 'winner' labels, a saved index column, ...) can never be mistaken for a champion.
player_cols = [
    col for col in df.columns
    if str(col).startswith(("blue_player_", "red_player_"))
]

# .ravel() flattens the picks into one long array. dropna() discards missing
# picks, which would otherwise make sorted() fail on a str/float comparison.
all_champs = sorted(
    pd.Series(df[player_cols].to_numpy().ravel()).dropna().unique()
)  # Sorted alphabetically

print(f"Total unique champions found: {len(all_champs)}")

# 3. Create the One-Hot Encoded Features
# We will create a new DataFrame where each row is a match,
# and columns are like "blue_Aatrox", "red_Ahri", etc.

def encode_match(row):
    """
    One-hot encode a single match.

    row: one record of the raw data, indexable by "blue_player_1".."blue_player_5"
         and "red_player_1".."red_player_5".

    Returns a pd.Series with a "blue_<champ>" and a "red_<champ>" entry for every
    name in the notebook-level `all_champs`; an entry is 1 when that champion was
    picked on that side in this match and 0 otherwise.
    """
    # Start with every (side, champion) flag switched off.
    # Key order: blue_<c>, red_<c> for each champion in all_champs order.
    features = {
        f"{side}_{champ}": 0
        for champ in all_champs
        for side in ("blue", "red")
    }

    # Switch on the flag of each pick: blue slots 1-5 first, then red slots 1-5.
    for side in ("blue", "red"):
        for slot in range(1, 6):
            picked = row[f"{side}_player_{slot}"]
            # Picks that are not in all_champs are skipped rather than added.
            if picked in all_champs:
                features[f"{side}_{picked}"] = 1

    return pd.Series(features)

print("Encoding data... this might take a moment.")
# Run the encoder once per row (axis=1) to build the one-hot feature matrix
X = df.apply(encode_match, axis=1)

# Target vector y: 1 where the blue side won, 0 for anything else
y = df['winner'].eq('blue').astype('int64')

print("Done!")
print(f"Input shape: {X.shape} (Rows, Features)")
display(X.head())

Total unique champions found: 159
Encoding data... this might take a moment.
Done!
Input shape: (171, 318) (Rows, Features)


Unnamed: 0,blue_Aatrox,red_Aatrox,blue_Ahri,red_Ahri,blue_Akali,red_Akali,blue_Akshan,red_Akshan,blue_Alistar,red_Alistar,...,blue_Zeri,red_Zeri,blue_Ziggs,red_Ziggs,blue_Zilean,red_Zilean,blue_Zoe,red_Zoe,blue_Zyra,red_Zyra
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# 1. Hold out 20% of the matches for testing; the remaining 80% are used for training.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training on {len(X_train)} matches.")
print(f"Testing on {len(X_test)} matches.")

# 2 + 3. Build a 100-tree random forest and fit it on the training split
# (fit() returns the fitted estimator itself, so it can be assigned directly).
print("Training model...")
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# 4. Score the model on the held-out matches
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("--- Results ---")
print(f"Model Accuracy: {accuracy:.2%}")
print("\nDetailed Report:")
# Label order follows the target encoding used for y: 0 = red win, 1 = blue win.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Red Win', 'Blue Win'],
    )
)

Training on 136 matches.
Testing on 35 matches.
Training model...
--- Results ---
Model Accuracy: 54.29%

Detailed Report:
              precision    recall  f1-score   support

     Red Win       0.41      0.54      0.47        13
    Blue Win       0.67      0.55      0.60        22

    accuracy                           0.54        35
   macro avg       0.54      0.54      0.53        35
weighted avg       0.57      0.54      0.55        35



In [16]:
def predict_game(blue_team, red_team, clf=None, feature_columns=None):
    """
    Predicts the winner for a custom draft.

    blue_team: List of 5 champion names
    red_team: List of 5 champion names
    clf: Fitted classifier exposing predict(), predict_proba() and classes_.
         Defaults to the notebook-level `model`.
    feature_columns: Ordered one-hot column labels ("blue_<champ>" / "red_<champ>")
         the classifier was trained on. Defaults to `X.columns`.

    Prints the predicted winner and the probability the classifier assigns to
    that outcome, then returns "Blue" or "Red". Champions without a matching
    column only trigger a printed warning and are otherwise ignored.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if clf is None:
        clf = model
    if feature_columns is None:
        feature_columns = X.columns

    # 1. Create a feature vector of all 0s
    input_vector = pd.Series(0, index=feature_columns)

    # 2. Set the picked champions to 1 (blue picks first, then red picks)
    for side, team in (("blue", blue_team), ("red", red_team)):
        for champ in team:
            column = f"{side}_{champ}"
            if column in input_vector.index:
                input_vector[column] = 1
            else:
                print(f"Warning: Model has never seen {champ} before!")

    # 3. Predict
    # The model expects a table of rows, so wrap the single vector in a DataFrame
    input_df = pd.DataFrame([input_vector])

    prediction = clf.predict(input_df)[0]
    probability = clf.predict_proba(input_df)[0]

    # predict_proba columns are ordered like clf.classes_, so look the predicted
    # label up there instead of assuming position 0 is red and position 1 is blue
    # (a model fitted on a single class has only one probability column).
    class_position = list(clf.classes_).index(prediction)
    confidence = probability[class_position]

    winner = "Blue" if prediction == 1 else "Red"

    print(f"Prediction: {winner} Team Wins!")
    print(f"Confidence: {confidence:.1%}")
    return winner

# --- TRY IT OUT! ---
# Swap in any draft you like. Names must be spelled exactly as in the dataset (Case Sensitive!)
my_blue = [
    "Aatrox",
    "LeeSin",
    "Sylas",
    "Vi",
    "Rell",
]
my_red = [
    "Malphite",
    "Viego",
    "Akali",
    "Sivir",
    "Mel",
]

# Last expression of the cell, so the returned winner is also shown as the cell output.
predict_game(blue_team=my_blue, red_team=my_red)

Prediction: Blue Team Wins!
Confidence: 55.0%


'Blue'

In [5]:
# List every champion name found in the dataset. These are the exact
# (case-sensitive) spellings that predict_game() can match to a feature column.
print(all_champs)

['Aatrox', 'Ahri', 'Akali', 'Akshan', 'Alistar', 'Ambessa', 'Anivia', 'Annie', 'Aphelios', 'Ashe', 'AurelionSol', 'Aurora', 'Azir', 'Bard', 'Belveth', 'Blitzcrank', 'Brand', 'Braum', 'Caitlyn', 'Camille', 'Cassiopeia', 'Chogath', 'Corki', 'Darius', 'Diana', 'DrMundo', 'Draven', 'Ekko', 'Elise', 'Evelynn', 'Ezreal', 'FiddleSticks', 'Fiora', 'Fizz', 'Galio', 'Gangplank', 'Garen', 'Gnar', 'Gragas', 'Graves', 'Gwen', 'Hecarim', 'Heimerdinger', 'Hwei', 'Illaoi', 'Irelia', 'Ivern', 'Janna', 'JarvanIV', 'Jax', 'Jayce', 'Jhin', 'KSante', 'Kaisa', 'Kalista', 'Karma', 'Karthus', 'Kassadin', 'Katarina', 'Kayle', 'Kayn', 'Kennen', 'Khazix', 'Kindred', 'KogMaw', 'Leblanc', 'LeeSin', 'Leona', 'Lissandra', 'Lucian', 'Lulu', 'Lux', 'Malphite', 'Malzahar', 'Maokai', 'Mel', 'Milio', 'MissFortune', 'MonkeyKing', 'Morgana', 'Naafiri', 'Nami', 'Nautilus', 'Neeko', 'Nidalee', 'Nilah', 'Nocturne', 'Nunu', 'Olaf', 'Orianna', 'Ornn', 'Pantheon', 'Poppy', 'Pyke', 'Qiyana', 'Quinn', 'Rakan', 'RekSai', 'Rell', 'R