In [1]:
import os
import pandas as pd
import numpy as np 
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import cross_val_score , GridSearchCV
from sklearn.linear_model import LogisticRegression
import json
from tqdm import tqdm_notebook
import lightgbm as lgb

PATH_TO_DATA = '../input/mlcourse-dota2-win-prediction'

#Train and test set
df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_features.csv'), index_col='match_id_hash')
df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_features.csv'), index_col='match_id_hash')

#Target variable 
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_targets.csv'), 
                                   index_col='match_id_hash')

Using TensorFlow backend.


In [2]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [3]:
import collections

MATCH_FEATURES = [
    ('game_time', lambda m: m['game_time']),
    ('game_mode', lambda m: m['game_mode']),
    ('lobby_type', lambda m: m['lobby_type']),
    ('objectives_len', lambda m: len(m['objectives'])),
    ('chat_len', lambda m: len(m['chat'])),
    ('number_of_teamfights', lambda m: len(m['teamfights'])),
]

PLAYER_FIELDS = [
    'hero_id',
    
    'kills',
    'deaths',
    'assists',
    'denies',
    
    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',
    
    'stuns',
    'creeps_stacked',
    'camps_stacked',
    'rune_pickups',
    'firstblood_claimed',
    'teamfight_participation',
    'towers_killed',
    'roshans_killed',
    'obs_placed',
    'sen_placed',
]



def extract_features_csv(match):
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]
    
    for field, f in MATCH_FEATURES:
        row.append((field, f(match)))
        
    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        for field in PLAYER_FIELDS:
            column_name = '%s_%s' % (player_name, field)
            row.append((column_name, player[field]))
            
        row.append((player_name +'_ability_level', len(player['ability_upgrades'])))
        row.append((player_name +'_max_hero_hit_val', player['max_hero_hit']['value']))
        #row.append((player_name +'_max_hero_hit_time', player['max_hero_hit']['time']))
        row.append((player_name +'_purchase_count', len(player['purchase_log'])))
        row.append((player_name +'_killslog', len(player['kills_log'])))
        row.append((player_name +'_buyback', len(player['buyback_log'])))
        row.append((player_name +'_actions', len(player['actions'])))
        row.append((player_name +'_killedlen', len(player['killed'])))
        
        row.append((f'{player_name}_damage_dealt', sum(player['damage'].values())))
        row.append((f'{player_name}_damage_received', sum(player['damage_taken'].values())))
        
        row.append((player_name +'_nearby_creep_death_count', player['nearby_creep_death_count']))
    
        
        type_of_damage = player['damage'].keys()
        hero_damage_keys = [key for key in type_of_damage if "hero" in key] 
        hero_damage_values = [player['damage'][key] for key in hero_damage_keys]
        
        nonhero_damage_keys = [key for key in type_of_damage if "creep" in key] 
        nonhero_damage_values = [player['damage'][key] for key in nonhero_damage_keys]
        
        neutral_damage_keys = [key for key in type_of_damage if "neutral" in key] 
        neutral_damage_values = [player['damage'][key] for key in nonhero_damage_keys]
        
        #Damage dealed 
        row.append((player_name +'_damage_deal_toheroes', sum(hero_damage_values)))
        row.append((player_name +'_damage_deal_tocreeps', sum(nonhero_damage_values)))
        row.append((player_name +'_damage_deal_toneutral', sum(neutral_damage_values)))
        
        #Items
        row.append((player_name +'_unique_items', len(player['item_uses'])))
        row.append((player_name +'_number_of_itmes_used', sum(player['item_uses'].values())))
        
        #Ability uses 
        row.append((player_name +'_unique_ability', len(player['ability_uses'])))
        row.append((player_name +'_number_of_ability_used', sum(player['ability_uses'].values())))
        
            
    return collections.OrderedDict(row)

def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [4]:
%%time
df_new_features = []
df_new_targets = []

for match in read_matches(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    targets = extract_targets_csv(match, match['targets'])
    
    df_new_features.append(features)
    df_new_targets.append(targets)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


CPU times: user 2min 7s, sys: 7.75 s, total: 2min 14s
Wall time: 2min 13s


In [5]:
df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
df_new_targets = pd.DataFrame.from_records(df_new_targets).set_index('match_id_hash')

In [6]:
def add_new_featuress(df_features, matches_file):
    
    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        # Counting ruined towers for both teams
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1

        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills
        
        # ... here you can add more features ...

In [7]:
add_new_featuress(df_new_features, 
                 os.path.join(PATH_TO_DATA, 
                              'train_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [8]:
test_new_features = []
for match in read_matches(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    
    test_new_features.append(features)
test_new_features = pd.DataFrame.from_records(test_new_features).set_index('match_id_hash')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [9]:
add_new_featuress(test_new_features, 
                 os.path.join(PATH_TO_DATA, 
                              'test_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [10]:
for c in ['kills', 'deaths', 'assists', 'denies', 'gold', 'lh', 'xp', 'health', 'max_health', 'max_mana', 'level', 'x', 'y', 'stuns', 'creeps_stacked', 'camps_stacked', 'rune_pickups',
          'firstblood_claimed', 'teamfight_participation', 'towers_killed', 'roshans_killed', 'obs_placed', 'sen_placed', 'ability_level', 'purchase_count',
           'damage_dealt', 'damage_received']:
    r_columns = [f'r{i}_{c}' for i in range(1, 6)]
    d_columns = [f'd{i}_{c}' for i in range(1, 6)]
    
    df_new_features['r_total_' + c] = df_new_features[r_columns].sum(1)
    df_new_features['d_total_' + c] = df_new_features[d_columns].sum(1)
    df_new_features['total_' + c + '_ratio'] = df_new_features['r_total_' + c] / df_new_features['d_total_' + c]
    
    test_new_features['r_total_' + c] = test_new_features[r_columns].sum(1)
    test_new_features['d_total_' + c] = test_new_features[d_columns].sum(1)
    test_new_features['total_' + c + '_ratio'] = test_new_features['r_total_' + c] / test_new_features['d_total_' + c]
    
    df_new_features['r_std_' + c] = df_new_features[r_columns].std(1)
    df_new_features['d_std_' + c] = df_new_features[d_columns].std(1)
    df_new_features['std_' + c + '_ratio'] = df_new_features['r_std_' + c] / df_new_features['d_std_' + c]
    
    test_new_features['r_std_' + c] = test_new_features[r_columns].std(1)
    test_new_features['d_std_' + c] = test_new_features[d_columns].std(1)
    test_new_features['std_' + c + '_ratio'] = test_new_features['r_std_' + c] / test_new_features['d_std_' + c]
    
    df_new_features['r_mean_' + c] = df_new_features[r_columns].mean(1)
    df_new_features['d_mean_' + c] = df_new_features[d_columns].mean(1)
    df_new_features['mean_' + c + '_ratio'] = df_new_features['r_mean_' + c] / df_new_features['d_mean_' + c]
    
    test_new_features['r_mean_' + c] = test_new_features[r_columns].mean(1)
    test_new_features['d_mean_' + c] = test_new_features[d_columns].mean(1)
    test_new_features['mean_' + c + '_ratio'] = test_new_features['r_mean_' + c] / test_new_features['d_mean_' + c]

In [11]:
TEAMFIGHT_FIELDS = [
    
    'ability_uses',
    'item_uses',
    'killed',
    'deaths',
    'buybacks',
    'damage',
    'healing',
    'gold_delta',
    'xp_delta'
]

def extract_teamfight_features_csv(match):
    
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]
    
    #Teamfights 
    
    for number,teamfight in enumerate(match["teamfights"]) :
        
        #row.append(("number_of_teamfights", len(match["teamfights"])))
        
        for slot,player in enumerate(teamfight["players"]) :
            
            if slot < 5:
                player_name = 'r%d' % (slot + 1) +"_teamfight_" + str(number)
            else:
                player_name = 'd%d' % (slot - 4) +"_teamfight_" + str(number)
                

            

            row.append((player_name +'_damage', player['damage']))
            row.append((player_name +'_gold_delta', player['gold_delta']))
            row.append((player_name +'_xp_delta', player['xp_delta']))

                
                
    return collections.OrderedDict(row)

In [12]:
%%time
df_train_teamfight = []


for match in read_matches(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_teamfight_features_csv(match)
    
    df_train_teamfight.append(features)

df_train_teamfight_features = pd.DataFrame.from_records(df_train_teamfight).set_index('match_id_hash')
df_train_teamfight_features.head(5)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


CPU times: user 1min 51s, sys: 2.78 s, total: 1min 54s
Wall time: 1min 53s


Unnamed: 0_level_0,r1_teamfight_0_damage,r1_teamfight_0_gold_delta,r1_teamfight_0_xp_delta,r2_teamfight_0_damage,r2_teamfight_0_gold_delta,r2_teamfight_0_xp_delta,r3_teamfight_0_damage,r3_teamfight_0_gold_delta,r3_teamfight_0_xp_delta,r4_teamfight_0_damage,...,d2_teamfight_24_xp_delta,d3_teamfight_24_damage,d3_teamfight_24_gold_delta,d3_teamfight_24_xp_delta,d4_teamfight_24_damage,d4_teamfight_24_gold_delta,d4_teamfight_24_xp_delta,d5_teamfight_24_damage,d5_teamfight_24_gold_delta,d5_teamfight_24_xp_delta
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,,,,,,,,,,,...,,,,,,,,,,
b9c57c450ce74a2af79c9ce96fac144d,1079.0,570.0,110.0,0.0,40.0,0.0,236.0,-21.0,19.0,122.0,...,,,,,,,,,,
6db558535151ea18ca70a6892197db41,,,,,,,,,,,...,,,,,,,,,,
46a0ddce8f7ed2a8d9bd5edcbb925682,,,,,,,,,,,...,,,,,,,,,,
b1b35ff97723d9b7ade1c9c3cf48f770,,,,,,,,,,,...,,,,,,,,,,


In [13]:
df_train_teamfight_features.fillna(0, inplace=True)

In [14]:
test_teamfight_new_features = []
for match in read_matches(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_teamfight_features_csv(match)
    
    test_teamfight_new_features.append(features)
    
test_teamfight_new_features = pd.DataFrame.from_records(test_teamfight_new_features).set_index('match_id_hash')

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [15]:
test_teamfight_new_features.fillna(value = 0 , inplace = True)
test_teamfight_new_features.head()

Unnamed: 0_level_0,r1_teamfight_0_damage,r1_teamfight_0_gold_delta,r1_teamfight_0_xp_delta,r2_teamfight_0_damage,r2_teamfight_0_gold_delta,r2_teamfight_0_xp_delta,r3_teamfight_0_damage,r3_teamfight_0_gold_delta,r3_teamfight_0_xp_delta,r4_teamfight_0_damage,...,d2_teamfight_22_xp_delta,d3_teamfight_22_damage,d3_teamfight_22_gold_delta,d3_teamfight_22_xp_delta,d4_teamfight_22_damage,d4_teamfight_22_gold_delta,d4_teamfight_22_xp_delta,d5_teamfight_22_damage,d5_teamfight_22_gold_delta,d5_teamfight_22_xp_delta
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30cc2d778dca82f2edb568ce9b585caa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70e5ba30f367cea48793b9003fab9d38,739.0,458.0,107.0,526.0,128.0,50.0,0.0,0.0,0.0,505.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4d9ef74d3a2025d79e9423105fd73d41,276.0,194.0,95.0,190.0,101.0,238.0,99.0,134.0,268.0,532.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2bb79e0c1eaac1608e5a09c8e0c6a555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bec17f099b01d67edc82dfb5ce735a43,1201.0,259.0,511.0,762.0,41.0,572.0,190.0,287.0,362.0,531.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def sum_for_teamfight(data,stats,maxim ):
    df = data.copy()
    for feat_suff in stats:
            for team in 'r', 'd':
                players = [team +str(i) + "_teamfight_" + str(number_of_fight)  for i in range(1, 6) for number_of_fight in range(0,maxim)] # r1, r2...
                player_col_names = [player + "_" + feat_suff for player in players] # e.g. r1_gold, r2_gold
                
                df[team  + "_teamfight_" + feat_suff + "_sum"] = df[player_col_names].sum(axis=1) # e.g. r_gold_mean
                
                df.drop(columns=player_col_names, inplace=True) # remove raw features from the dataset
    
    return(df)

In [17]:
df_train_teamfight_features_average = sum_for_teamfight(df_train_teamfight_features,["damage","gold_delta","xp_delta"],maxim = 25)
test_teamfight_new_features_average = sum_for_teamfight(test_teamfight_new_features,["damage","gold_delta","xp_delta"],maxim = 23)

In [18]:
target_variable = pd.DataFrame(df_new_targets["radiant_win"].astype("int"))

In [19]:
numeric_features = ['deaths', 'assists', 'denies', 'health','gold','kills','xp' ,'x','y',
                    'max_health', 'max_mana', 'level', 'towers_killed', 'stuns', 'creeps_stacked', 
                    'camps_stacked', 'lh', 'rune_pickups',
                    'teamfight_participation', 'roshans_killed', 'obs_placed', 'sen_placed',
                    
                     'gold_per_minute','kills_per_minute','xp_per_minute','level_per_minute','damage_deal_tocreeps_per_minute',
                     "deaths_per_minute","number_of_ability_used_per_minute",
                    
                     "damage_deal_toheroes_per_minute","damage_deal_toneutral_per_minute",
                    
                    'ability_level','max_hero_hit_val','purchase_count','damage_deal_toheroes','damage_deal_tocreeps',
                    'unique_items','number_of_itmes_used','unique_ability','number_of_ability_used',
                   
                    "nearby_creep_death_count","buyback","damage_deal_toneutral"]

In [20]:
def combine_numeric_features (data, feature_suffixes):
    
    df = data.copy()
    for feat_suff in feature_suffixes:
        for team in 'r', 'd':
            players = [team +str(i) for i in range(1, 6)] # r1, r2...
            player_col_names = [player + "_" +feat_suff for player in players] # e.g. r1_gold, r2_gold
            
           
            df[team + "_" + feat_suff + "_"+ "sum"] = df[player_col_names].sum(axis=1) # e.g. r_gold_mean
            #df[team + "_" + feat_suff + "_"+ "min"] =  df[player_col_names].min(axis=1) # e.g. r_gold_min
            
            df.drop(columns=player_col_names, inplace=True) # remove raw features from the dataset
    return df

In [21]:
def hero_approach(df):
    for team in 'r', 'd':
        players = [team + str(i)  for i in range(1, 6)]
        hero_columns = [player + '_hero_id' for player in players]

        d = pd.get_dummies(df[hero_columns[0]])
        for c in hero_columns[1:]:
            d += pd.get_dummies(df[c])
        df = pd.concat([df, d.add_prefix(team  + '_hero_')], axis=1)
        df.drop(columns=hero_columns, inplace=True)
    return df

df_train_features_heroes = hero_approach(df_new_features)
df_test_features_heroes = hero_approach(test_new_features)

In [22]:
def firstblood(data) :
    df = data.copy()
    for team in 'r','d' :
        columns = [team + str(i) + '_firstblood_claimed' for i in range(1,6) ]
        print(columns)
        if team == 'r' :
            df["r_team_firstblood"] = df[columns].sum(axis = 1)
        df.drop(columns = columns,inplace = True)
    
    return(df)

In [23]:
df_train_features_firstblood = firstblood(df_train_features_heroes)
df_test_features_firstblood =  firstblood(df_test_features_heroes)

['r1_firstblood_claimed', 'r2_firstblood_claimed', 'r3_firstblood_claimed', 'r4_firstblood_claimed', 'r5_firstblood_claimed']
['d1_firstblood_claimed', 'd2_firstblood_claimed', 'd3_firstblood_claimed', 'd4_firstblood_claimed', 'd5_firstblood_claimed']
['r1_firstblood_claimed', 'r2_firstblood_claimed', 'r3_firstblood_claimed', 'r4_firstblood_claimed', 'r5_firstblood_claimed']
['d1_firstblood_claimed', 'd2_firstblood_claimed', 'd3_firstblood_claimed', 'd4_firstblood_claimed', 'd5_firstblood_claimed']


In [24]:
df_train_features_firstblood["game_minutes"] = (df_train_features_firstblood["game_time"]/60).astype("int")
df_test_features_firstblood["game_minutes"] = (df_test_features_firstblood["game_time"]/60).astype("int")

In [25]:
def add_new_features(data,original_features):
    
    df = data.copy()
    
    #Gold/min, Exp/min, and Kills/min
    
    for  feature in original_features :
        for team in "r","d" :
            players = [team +str(i) for i in range(1, 6)] # r1, r2...
            player_col_names = [player + "_" + feature for player in players] # e.g. r1_gold, r2_gold
            
            for player_feature in  player_col_names:
               # df[player_feature + '_per_minute'] = df[player_feature]/df["game_minutes"]
                 df.loc[df["game_minutes"] != 0 ,player_feature + '_per_minute'] = df[player_feature]/df["game_minutes"]
            
    return df

#df_train_features_firstblood.head()

In [26]:
def differences(data , feature_suff , stats) :
    
    df=data.copy()
    
    for col in feature_suff :
        for stat in stats :
            df[col +"_" + stat + "_ratio"] = df["r"  + "_" + col +"_" + stat] / df["d"  + "_" + col +"_" + stat]
                
            original_col = ["r"  + "_" + col +"_" + stat,"d"  + "_" + col +"_" + stat]
    
            df.drop(columns = original_col , inplace = True)
    return df

In [27]:
df_train_features_addtional_features = add_new_features(df_train_features_firstblood,
                                                        ['kills','gold','xp','level',"damage_deal_tocreeps",
                                                         "deaths","number_of_ability_used","damage_deal_toheroes","damage_deal_toneutral"])
df_test_features_addtional_features = add_new_features(df_test_features_firstblood,
                                                       ['kills','gold','xp','level',"damage_deal_tocreeps",
                                                        "deaths","number_of_ability_used","damage_deal_toheroes","damage_deal_toneutral"])

In [28]:
df_train_features_averages = combine_numeric_features(df_train_features_addtional_features, numeric_features)
df_test_features_averages =  combine_numeric_features(df_test_features_addtional_features, numeric_features)

In [29]:
df_train_features_averages["r_kda"] = (df_train_features_averages["r_kills_sum"] + df_train_features_averages['r_assists_sum']) / (df_train_features_averages['r_deaths_sum'] + 1)
df_train_features_averages["d_kda"] = (df_train_features_averages["d_kills_sum"] + df_train_features_averages['d_assists_sum']) / (df_train_features_averages['d_deaths_sum'] + 1)

In [30]:
df_test_features_averages["r_kda"] = (df_test_features_averages["r_kills_sum"] + df_test_features_averages['r_assists_sum']) / (df_test_features_averages['r_deaths_sum'] + 1)
df_test_features_averages["d_kda"] = (df_test_features_averages["d_kills_sum"] + df_test_features_averages['d_assists_sum']) / (df_test_features_averages['d_deaths_sum'] + 1)

In [31]:
params = {'num_leaves': 9,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.0123,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.728910519108444,
         'reg_lambda': 4.9847051755586085,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'subsample': 0.81,
         'min_gain_to_split': 0.01077313523861969,
         'min_child_weight': 19.428902804238373,
         'num_threads': 4}
from sklearn.model_selection import KFold, StratifiedKFold
skf = KFold(n_splits=5, random_state=13)
from sklearn.metrics import roc_auc_score
predict = [0]*df_test_features_averages.shape[0]
cross_val_result = 0
for ind_trn, ind_test in skf.split(df_train_features_averages, df_train_targets.radiant_win): 
    
   
    
    X_train_lgbm = lgb.Dataset(df_train_features_averages.iloc[ind_trn], df_train_targets.radiant_win.iloc[ind_trn] )
    X_test_lgbm = lgb.Dataset(df_train_features_averages.iloc[ind_test], df_train_targets.radiant_win.iloc[ind_test])
    
    
    clf = lgb.train(params, X_train_lgbm, num_boost_round = 10000, valid_sets=X_test_lgbm, early_stopping_rounds=100, verbose_eval=200 )
    
    cross_val_predict = clf.predict(df_train_features_averages.iloc[ind_test])
    cross_val_auc = roc_auc_score(df_train_targets.radiant_win.iloc[ind_test] , cross_val_predict) / 5
    
    cross_val_result +=cross_val_auc
    
    pred = clf.predict(df_test_features_averages)/5
    predict += pred  
    
print(cross_val_result)

Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.825864
[400]	valid_0's auc: 0.833416
[600]	valid_0's auc: 0.837725
[800]	valid_0's auc: 0.840147
[1000]	valid_0's auc: 0.841557
[1200]	valid_0's auc: 0.842555
[1400]	valid_0's auc: 0.843325
[1600]	valid_0's auc: 0.844197
[1800]	valid_0's auc: 0.844275
Early stopping, best iteration is:
[1733]	valid_0's auc: 0.84435
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.82272
[400]	valid_0's auc: 0.831795
[600]	valid_0's auc: 0.836552
[800]	valid_0's auc: 0.839092
[1000]	valid_0's auc: 0.84095
[1200]	valid_0's auc: 0.841888
[1400]	valid_0's auc: 0.842624
[1600]	valid_0's auc: 0.842979
[1800]	valid_0's auc: 0.843335
[2000]	valid_0's auc: 0.843626
[2200]	valid_0's auc: 0.843989
[2400]	valid_0's auc: 0.844309
[2600]	valid_0's auc: 0.844656
Early stopping, best iteration is:
[2649]	valid_0's auc: 0.844759
Training until validation scores don't improve for 100 rounds
[200]	valid

In [32]:
df_submission = pd.DataFrame({'radiant_win_prob': predict}, 
                                 index=df_test_features.index)
df_submission.to_csv("final.csv")