In [1]:
# The goal of this feature engineering notebook is to turn the features explored during EDA into a clean, model-ready dataset
# by selecting the meaningful features that can be used to predict match outcomes

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed per-match statistics and preview the first rows
matches = pd.read_csv(filepath_or_buffer="../data/processed/matches_stats.csv")
matches.head()

Unnamed: 0,id,gameid,platformid,queueid,seasonid,duration,creation,version,blue_teamid,blue_firstblood,...,red_towerkills,red_inhibkills,red_baronkills,red_dragonkills,red_harrykills,blueWins,tower_diff,dragon_diff,baron_diff,inhib_diff
0,10,3187427022,EUW1,420,8,1909,1495068946860,7.10.187.9675,100,0,...,10,3,1,3,1,0,-5,-3,-1,-3
1,11,3187425281,EUW1,420,8,1693,1495066760778,7.10.187.9675,100,1,...,10,3,0,2,0,0,-8,-2,0,-3
2,12,3187269801,EUW1,420,8,1482,1495053375889,7.10.187.9675,100,1,...,11,3,0,2,0,0,-10,-2,0,-3
3,13,3187252065,EUW1,420,8,1954,1495050993613,7.10.187.9675,100,1,...,9,2,1,2,0,0,-3,0,-1,-2
4,14,3187201038,EUW1,420,8,2067,1495047893400,7.10.187.9675,100,1,...,7,1,1,3,0,0,-2,-3,-1,-1


In [3]:
# `blueWins` is the label column: 1 when the blue team won the match, 0 when it lost
target = 'blueWins'
y = matches.loc[:, target]

In [4]:
# Features retained after EDA because of their correlation with the match outcome.
# NOTE(review): the *_diff columns look like end-of-match totals -- confirm they are
# available at prediction time, otherwise they leak the label.
features = [
    # objective differentials (presumably blue minus red -- TODO confirm)
    'tower_diff', 'dragon_diff', 'baron_diff', 'inhib_diff',
    # 0/1 flags for the blue team taking each "first" objective
    'blue_firstblood', 'blue_firsttower', 'blue_firstdragon', 'blue_firstbaron',
]

# Feature matrix: every row of `matches`, restricted to the selected columns
X = matches.loc[:, features]
X.head()


Unnamed: 0,tower_diff,dragon_diff,baron_diff,inhib_diff,blue_firstblood,blue_firsttower,blue_firstdragon,blue_firstbaron
0,-5,-3,-1,-3,0,1,0,0
1,-8,-2,0,-3,1,0,0,0
2,-10,-2,0,-3,1,0,0,0
3,-3,0,-1,-2,1,1,1,0
4,-2,-3,-1,-1,1,0,0,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each selected feature;
# a quick sanity check of value ranges before the dataset is exported
X.describe()

Unnamed: 0,tower_diff,dragon_diff,baron_diff,inhib_diff,blue_firstblood,blue_firsttower,blue_firstdragon,blue_firstbaron
count,184069.0,184069.0,184069.0,184069.0,184069.0,184069.0,184069.0,184069.0
mean,0.255502,-0.010436,-0.036736,0.057875,0.507277,0.502638,0.476354,0.304712
std,6.907783,2.134854,0.946655,2.156802,0.499948,0.499994,0.499442,0.460287
min,-12.0,-7.0,-5.0,-12.0,0.0,0.0,0.0,0.0
25%,-6.0,-2.0,-1.0,-2.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,7.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0
max,13.0,7.0,4.0,12.0,1.0,1.0,1.0,1.0


In [6]:
# Keep only the selected feature columns followed by the label column, then write
# them to the model-ready CSV (index=False leaves the row index out of the file)
processed = matches.loc[:, [*features, target]]
processed.to_csv(
    "../data/processed/matches_ml_ready.csv",
    index=False,
)