# HKJC Processor

### Notes

- all data ends 1/1/21

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import data
runs = pd.read_csv('runs.csv').iloc[:,1:]
races = pd.read_csv('races.csv').iloc[:,1:]
horse_runs = pd.read_csv('horse_runs.csv').iloc[:,1:]
horse_profiles = pd.read_csv('horse_profiles.csv').iloc[:,1:]

## Data

### Pre-process Data

In [2]:
# open runs data & convert dates/times to datetime
# horse, horse_id 11805,11852/11951 non-null
runs_data = runs.copy()
runs_data['date'] = pd.to_datetime(runs_data['date'], format = '%d/%m/%Y') # date to datetime object

# drop redundant features & sort
#runs_data = runs_data.drop(columns=['run_position'])
runs_data = runs_data.sort_values(by=['date','race'], ascending=True).reset_index(drop=True)

# output size and final df
print('Features:',runs_data.shape[1])
print('Entries :',runs_data.shape[0])
runs_data.head()

Features: 20
Entries : 11951


Unnamed: 0,date,race,horse,horse_id,jockey,trainer,place,horse_no,weight_adj,weight_horse_declared,draw,length_behind_winner,pos_1,pos_2,pos_3,pos_4,pos_5,pos_6,position_finish_time,win_odds
0,2019-09-01,1,GOOD RUNNERS WAY,C127,Z Purton,C S Shum,1,3,131,1072,6,0.0,3,3,2,1.0,,,95.48,2.2
1,2019-09-01,1,REGENCY GEM,C035,J Moreira,W Y So,2,4,129,1179,7,0.0,9,8,8,2.0,,,95.5,4.9
2,2019-09-01,1,PROUD SKY,V406,K Teetan,L Ho,3,7,121,1053,4,0.0,5,4,3,3.0,,,95.57,18.0
3,2019-09-01,1,LE PANACHE,V369,A Sanna,D J Whyte,4,2,132,1076,3,1.25,4,6,6,4.0,,,95.68,5.7
4,2019-09-01,1,GOLD VELVET,V400,L Hewitson,A T Millard,5,1,133,1155,8,1.5,8,7,7,5.0,,,95.71,7.0


In [3]:
# open race data
races_data = races.copy()
races_data['date'] = pd.to_datetime(races_data['date'], format = '%d/%m/%Y') # date to datetime object

# drop redundant features & sort
races_data = races_data.drop(columns=['handicap','number'])
races_data = races_data.sort_values(by=['date','race'], ascending=True)
races_data = races_data.reset_index(drop=True).reset_index().rename(columns={'index':'race_id'})

# output size and final df
print('Features:',races_data.shape[1])
print('Entries :',races_data.shape[0])
races_data.head()

Features: 64
Entries : 1004


Unnamed: 0,race_id,date,race,class,distance,ratings,going,surface,surface_type,pool,...,first4_2,first4_3,first4_4,first4_div,quartet_1,quartet_2,quartet_3,quartet_4,quartet_div,race_id.1
0,0,2019-09-01,1,Class 5,1600,(40-0),GOOD,TURF,B,725000,...,3,4,7,195.0,3,4,7,2,3065.0,994
1,1,2019-09-01,2,Class 5,1200,(40-0),GOOD,TURF,B,725000,...,8,13,14,3525.0,13,2,8,14,86477.0,995
2,2,2019-09-01,3,Class 1,1200,(90+),GOOD,TURF,B,2800000,...,5,6,7,143.0,5,2,6,7,2032.0,996
3,3,2019-09-01,4,Class 4,1200,(60-40),GOOD,TURF,B,967000,...,3,6,10,2837.0,3,10,2,6,81776.0,997
4,4,2019-09-01,5,Class 4,1400,(60-40),GOOD,TURF,B,967000,...,3,6,11,2017.0,6,11,3,2,40492.0,998


In [4]:
# open horse run data
# jockey 20584/20911 non-null
hr_data = horse_runs.copy()

# date to datetime object
def horse_to_date(date):
    try: return pd.to_datetime(date, format = '%d/%m/%y')
    except: return pd.to_datetime(date, format = '%d/%m/%Y')
hr_data['date'] = hr_data['date'].apply(horse_to_date)

# make relational DB for jockey & trainer features
jockey_ids = pd.DataFrame(hr_data['jockey'].unique()).sort_values(by=0,ascending=True).reset_index().rename(columns={'index':'jockey_id',0:'jockey'})
trainer_ids = pd.DataFrame(hr_data['trainer'].unique()).sort_values(by=0,ascending=True).reset_index().rename(columns={'index':'trainer_id',0:'trainer'})
hr_data = hr_data.merge(jockey_ids, right_on='jockey', left_on='jockey')
hr_data = hr_data.merge(trainer_ids, right_on='trainer', left_on='trainer')

# drop redundant features & sort
hr_data = hr_data.sort_values(by=['horse_id','date'], ascending=True).reset_index(drop=True)

# output size and final df
print('Features:',hr_data.shape[1])
print('Entries :',hr_data.shape[0])
hr_data.head()

Features: 28
Entries : 20911


Unnamed: 0,horse_id,race_idx,place,date,race_track,surface,surface_type,distance,g_race,class,...,pos_2,pos_3,pos_4,pos_5,pos_6,position_finish_time,weight_horse_declared,gear,jockey_id,trainer_id
0,A005,25,5,2018-09-09,ST,Turf,C,1600,GF,4,...,5.0,4.0,5.0,,,95.06,1141.0,B,8,15
1,A005,56,2,2018-10-01,ST,Turf,A+3,1800,G,4,...,8.0,9.0,8.0,2.0,,107.96,1141.0,B,8,15
2,A005,168,3,2018-11-10,ST,Turf,A,1800,G,4,...,5.0,4.0,3.0,3.0,,108.95,1148.0,B,8,15
3,A005,250,7,2018-12-12,HV,Turf,C,1800,G,4,...,6.0,7.0,5.0,7.0,,110.41,1146.0,B,8,15
4,A005,319,4,2019-01-06,ST,Turf,C+3,1800,G,4,...,8.0,7.0,6.0,4.0,,108.42,1143.0,B,8,15


In [5]:
# open horse profile data
# age 1029/1534 non-null
hp_data = horse_profiles.copy()

# output size and final df
print('Features:',hp_data.shape[1])
print('Entries :',hp_data.shape[0])
hp_data.head()

Features: 9
Entries : 1590


Unnamed: 0,name,horse_id,age,country,colour,sex,sire,dam,dams_sire
0,TRAVEL GLORY,E170,3.0,NZ,Gelding,Brown,Savabeel,Iguazu's Girl,Redoute's Choice
1,BRILLIANT,C005,,AUS,Gelding,Bay,Encosta de Lago,True Roman,Is It True
2,JOYFUL WIN,D451,3.0,AUS,Gelding,Bay,Shooting To Win,Miss Kistler,Darci Brahma
3,VIGOR CHAMP,C498,5.0,AUS,Gelding,Bay,Mossman,Eve Angelene,Catbird
4,CHAMPION PRIDE,B049,7.0,GB,Gelding,Bay,Invincible Spirit,Loch Jipp,


### Merge Horse and Runs DataFrames Seperately

In [6]:
# merge the cleaned horse_runs and horse_profile datasets based on horse_id
horse_merge = pd.merge(hr_data, hp_data, on='horse_id')
print('Merged Horse Data Size:', horse_merge.shape)
#merge = merge.dropna().reset_index(drop=True) # 79447 entries -> 75710 entries

# merge the race and run datasets based on date and race #
runs_merge = pd.merge(runs_data, races_data, on=['date','race'])
print('Merged Runs Data Size:', runs_merge.shape)

Merged Horse Data Size: (20581, 36)
Merged Runs Data Size: (11975, 82)


### Feature Extraction

In [7]:
# define average calculation function
def get_average(col):
    values = []
    averages = [np.nan]
    for row in col[1:]:
        values += [row]
        averages += [np.mean(values)]
    return averages

'''horses = horse_merge['horse_id'].unique()
for horse in horses[0:1]:
    temp = horse_merge[horse_merge['horse_id'] == horse]
    temp['recent_result'] = temp['place'].shift(1)
temp'''

"horses = horse_merge['horse_id'].unique()\nfor horse in horses[0:1]:\n    temp = horse_merge[horse_merge['horse_id'] == horse]\n    temp['recent_result'] = temp['place'].shift(1)\ntemp"

In [8]:
# extract horse features
horses = horse_merge['horse_id'].unique() # isolate each unique horse

# set parameters
end = -1

# define feature Series
race_count = pd.Series(dtype='float64')
days_since_race = pd.Series(dtype='float64')

# define recent & average Series
recent_result = pd.Series(dtype='float64')
average_result = pd.Series(dtype='float64')
recent_lengths_behind = pd.Series(dtype='float64')
average_lengths_behind = pd.Series(dtype='float64')
recent_normal_speed = pd.Series(dtype='float64')
average_normal_speed = pd.Series(dtype='float64')
recent_draw = pd.Series(dtype='float64')
average_draw = pd.Series(dtype='float64')
recent_actual_weight = pd.Series(dtype='float64')
average_actual_weight = pd.Series(dtype='float64')

# perform for each horse
for horse in horses[0:end]:
    # isolate horses races
    temp = horse_merge[horse_merge['horse_id'] == horse]
    
    # extract # of race ran
    temp['race_count'] = list(range(0,temp.shape[0])) # attach to index
    race_count = race_count.append(temp['race_count']) # append list
    
    # extract days since last race
    temp['days_since_race'] = (temp['date'] - temp['date'].shift(1))
    temp['days_since_race'] = temp['days_since_race'].map(lambda x: x.days)
    days_since_race = days_since_race.append(temp['days_since_race']) 

    # extract most recent race result
    temp['recent_result'] = temp['place'].shift(1)
    recent_result = recent_result.append(temp['recent_result']) # append list
    
    # extract average race result
    temp['average_result'] = temp[['recent_result']].apply(get_average)
    average_result = average_result.append(temp['average_result'])
    
    # extract most recent races lengths behind
    temp['recent_lengths_behind'] = temp['length_behind_winner'].shift(1)
    recent_lengths_behind = recent_lengths_behind.append(temp['recent_lengths_behind']) 
    
    # extract average lengths behind
    temp['average_lengths_behind'] = temp[['recent_lengths_behind']].apply(get_average)
    average_lengths_behind = average_lengths_behind.append(temp['average_lengths_behind']) 
    
    # extract normalized finishing speed
    temp['recent_normal_speed'] = (temp['position_finish_time']/temp['distance']).shift(1)
    recent_normal_speed = recent_normal_speed.append(temp['recent_normal_speed']) 
    
    # extract average normalized finishing speed
    temp['average_normal_speed'] = temp[['recent_normal_speed']].apply(get_average)
    average_normal_speed = average_normal_speed.append(temp['average_normal_speed']) 
    
    # extract recent draw position
    temp['recent_draw'] = temp['draw'].shift(1)
    recent_draw = recent_draw.append(temp['recent_draw'])
    
    # extract average past draw position
    temp['average_draw'] = temp[['recent_draw']].apply(get_average)
    average_draw = average_draw.append(temp['average_draw'])
    
    # extract recent actual weight
    temp['recent_actual_weight'] = temp['weight_actual'].shift(1)
    recent_actual_weight = recent_actual_weight.append(temp['recent_actual_weight'])
    
    # extract average actual weight
    temp['average_actual_weight'] = temp[['recent_actual_weight']].apply(get_average)
    average_actual_weight = average_actual_weight.append(temp['average_actual_weight'])
    
# define new df w/ extracted features
extracted_df = horse_merge.copy()

# load extracted features
extracted_df['race_count'] = race_count
extracted_df['days_since_race'] = days_since_race

# load recent & average features
extracted_df['recent_result'] = recent_result
extracted_df['average_result'] = average_result
extracted_df['recent_lengths_behind'] = recent_lengths_behind
extracted_df['average_lengths_behind'] = average_lengths_behind
extracted_df['recent_normal_speed'] = recent_normal_speed
extracted_df['average_normal_speed'] = average_normal_speed
extracted_df['recent_draw'] = recent_draw
extracted_df['average_draw'] = average_draw
extracted_df['recent_actual_weight'] = recent_actual_weight
extracted_df['average_actual_weight'] = average_actual_weight

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [9]:
# define preference function
def get_preference(df,col_name,check=False):
    new_col_name = col_name + '_pref'
    df = df.reset_index(drop=True)
    preference = [np.nan]
    for i in range(1,df.shape[0]):
        # get col preferences
        temp_df = df.loc[:i-1,['place',col_name]]
        temp_group = temp_df.groupby(col_name)[['place']].mean().rename(columns={'place':new_col_name})
        temp = temp_df.join(temp_group, on=col_name)
        
        # check for most similar pairing
        current = df.loc[i,col_name]
        if current != temp.loc[i-1,col_name] and check == True:
            less = np.abs(current - temp[col_name])
            pref = temp.loc[less.idxmin,[new_col_name]]
        else:
            pref = temp.loc[i-1,[new_col_name]]
        preference += [pref.values[0]]
    return preference

In [10]:
# extract horse features
horses = horse_merge['horse_id'].unique() # isolate each unique horse

# define preference Series
distance_pref = pd.Series(dtype='float64')
surface_pref = pd.Series(dtype='float64')
venue_pref = pd.Series(dtype='float64')

# perform for each horse
for horse in horses[0:end]:
    # isolate horses races
    temp = horse_merge[horse_merge['horse_id'] == horse]
    
    # may need to normalize results for preference features
    # extract distance preferences (avg finish result at given distance)
    temp['distance_pref'] = get_preference(temp,'distance',True)
    distance_pref = distance_pref.append(temp['distance_pref'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [11]:
# perform for each horse
for horse in horses[0:end]:
    # extract surface preferences (avg finish result on given surface)
    temp = horse_merge[horse_merge['horse_id'] == horse]    
    temp['surface_pref'] = get_preference(temp,'surface')
    surface_pref = surface_pref.append(temp['surface_pref'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [12]:
# perform for each horse
for horse in horses[0:end]:
    # extract track preferences (avg finish result on given going)
    temp = horse_merge[horse_merge['horse_id'] == horse]    
    temp['venue_pref'] = get_preference(temp,'race_track')
    venue_pref = venue_pref.append(temp['venue_pref'])

# load preference features
extracted_df['distance_pref'] = distance_pref
extracted_df['surface_pref'] = surface_pref
extracted_df['venue_pref'] = venue_pref

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [13]:
# extract jockey features
jockies = horse_merge['jockey_id'].unique() # isolate each unique horse

# define jockey Series
recent_jockey_perf = pd.Series(dtype='float64')
average_jockey_perf = pd.Series(dtype='float64')

# perform for each jockey
for jockey in jockies[0:end]:
    # isolate jockey races
    temp = horse_merge[horse_merge['jockey_id'] == jockey]
    
    # extract recent jocky performance
    temp['recent_jockey_perf'] = temp['place'].shift(1)
    recent_jockey_perf = recent_jockey_perf.append(temp['recent_jockey_perf'])
    
    # extract average jocky performance
    temp['average_jockey_perf'] = temp[['recent_jockey_perf']].apply(get_average)
    average_jockey_perf = average_jockey_perf.append(temp['average_jockey_perf']) 
    
# load preference features
extracted_df['recent_jockey_perf'] = recent_jockey_perf
extracted_df['average_jockey_perf'] = average_jockey_perf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# extract trainer features
trainers = horse_merge['trainer_id'].unique() # isolate each unique horse

# define trainer Series
recent_trainer_perf = pd.Series(dtype='float64')
average_trainer_perf = pd.Series(dtype='float64')

# perform for each trainer
for trainer in trainers[0:end]:
    # isolate trainer races
    temp = horse_merge[horse_merge['trainer_id'] == trainer]
    
    # extract recent trainer performance
    temp['recent_trainer_perf'] = temp['place'].shift(1)
    recent_trainer_perf = recent_trainer_perf.append(temp['recent_trainer_perf'])
    
    # extract average trainer performance
    temp['average_trainer_perf'] = temp[['recent_trainer_perf']].apply(get_average)
    average_trainer_perf = average_trainer_perf.append(temp['average_trainer_perf']) 

# load preference features
extracted_df['recent_trainer_perf'] = recent_trainer_perf
extracted_df['average_trainer_perf'] = average_trainer_perf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
# extract trainer & jockey horse features
horses = extracted_df['horse_id'].unique() # isolate each unique horse

# define horse's past jocky performance Series
recent_jockey_skill = pd.Series(dtype='float64')
average_jockey_skill = pd.Series(dtype='float64')

# perform for each horse
for horse in horses[0:end]:
    # isolate horses races
    temp = extracted_df[extracted_df['horse_id'] == horse]

    # extract horse's most recent jokey skill
    temp['recent_jockey_skill'] = temp['average_jockey_perf'].shift(1)
    recent_jockey_skill = recent_jockey_skill.append(temp['recent_jockey_skill']) 
    
    # extract horse's average jockey skill
    temp['average_jockey_skill'] = temp[['average_jockey_perf']].apply(get_average)
    average_jockey_skill = average_jockey_skill.append(temp['average_jockey_skill']) 

# load recent & average features
extracted_df['recent_jockey_skill'] = recent_jockey_skill
extracted_df['average_jockey_skill'] = average_jockey_skill

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Merge Extracted and Merged Runs Dataframes

In [16]:
# merge the race and run datasets based on race_id column
cols = ['surface', 'surface_type', 'distance', 'class', 'draw', 'trainer', 'jockey', 'name',
        'length_behind_winner', 'place', 'weight_actual', 'win_odds', 'pos_1', 'pos_2', 
        'pos_3', 'pos_4', 'pos_5', 'pos_6', 'position_finish_time', 'weight_horse_declared']
horse_temp = extracted_df.drop(columns=cols)
extracted_df = pd.merge(runs_merge, horse_temp, on=['date','horse_id'])
print('Merged Horses-Runs Data Size:', extracted_df.shape)

# output quick stats & merged df
avg_race_count = round(extracted_df.horse_id.shape[0]/extracted_df.horse_id.unique().shape[0],3)
print('Avg Races per Horse:',avg_race_count)

Merged Horses-Runs Data Size: (9793, 117)
Avg Races per Horse: 6.666


In [17]:
# since no going in horse_merge -> run after horse-runs merge
# extract horse features
horses = extracted_df['horse_id'].unique() # isolate each unique horse

# define preference Series
going_pref = pd.Series(dtype='float64')

# perform for each horse
for horse in horses[0:end]:
    
    # extract going preferences (avg finish result on given going)
    temp = extracted_df[extracted_df['horse_id'] == horse]    
    temp['going_pref'] = get_preference(temp,'going')
    going_pref = going_pref.append(temp['going_pref']) 
    
# load preference features
extracted_df['going_pref'] = going_pref

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [18]:
# output extracted feature dataframe
extracted_df.to_csv('extracted.csv',index=False)
extracted_df.head()

Unnamed: 0,date,race,horse,horse_id,jockey,trainer,place,horse_no,weight_adj,weight_horse_declared,...,distance_pref,surface_pref,venue_pref,recent_jockey_perf,average_jockey_perf,recent_trainer_perf,average_trainer_perf,recent_jockey_skill,average_jockey_skill,going_pref
0,2019-09-01,1,GOOD RUNNERS WAY,C127,Z Purton,C S Shum,1,3,131,1072,...,7.0,8.0,6.857143,3.0,4.411837,3.0,5.960986,4.845238,6.541275,
1,2019-09-01,1,REGENCY GEM,C035,J Moreira,W Y So,2,4,129,1179,...,8.0,10.0,9.2,7.0,4.594679,8.0,6.718232,7.042885,7.274772,
2,2019-09-01,1,PROUD SKY,V406,K Teetan,L Ho,3,7,121,1053,...,6.0,5.583333,5.375,4.0,5.592417,6.0,7.538077,4.523358,6.227424,
3,2019-09-01,1,LE PANACHE,V369,A Sanna,D J Whyte,4,2,132,1076,...,4.166667,6.65625,9.571429,7.0,7.762857,1.0,6.557377,8.557312,6.867467,
4,2019-09-01,1,GOLD VELVET,V400,L Hewitson,A T Millard,5,1,133,1155,...,5.4,6.695652,6.4,6.0,8.539171,3.0,6.708618,5.59498,6.274691,


### Feature Processing

In [19]:
# load dataframe
extracted_df = pd.read_csv('extracted.csv')
print('Size Initial:',extracted_df.shape)

# drop columns no longer needed
cols = ['race_idx','race_id.1','horse_id','position_finish_time','length_behind_winner',
        'date','pos_1','pos_2','pos_3','pos_4','pos_5','pos_6','time1','time2','time3','time4',
        'time5','time6','sec1','sec2','sec3','sec4','sec5','sec6','horse','jockey','trainer',
        'win_combination','win_div','place_combination_1','place_combination_2','place_combination_3',
        'place_div_1','place_div_2','place_div_3']
extracted_df = extracted_df.drop(columns=cols)

# one-hot encode categorical features
extracted_df = pd.get_dummies(extracted_df, drop_first=True)
print('Size w/ Dummies:',extracted_df.shape)

# drop each horses first race
#extracted_df = extracted_df.dropna().reset_index(drop=True)
print('Size w/o Nan:',extracted_df.shape)
print('Size Final:',extracted_df.shape)

# save one-hotted dataset
extracted_df.to_csv('extracted_big.csv',index=False)
extracted_df.head()

Size Initial: (9793, 85)
Size w/ Dummies: (9793, 2558)
Size w/o Nan: (9793, 2558)
Size Final: (9793, 2558)


Unnamed: 0,race,place,horse_no,weight_adj,weight_horse_declared,draw,win_odds,race_id,distance,pool,...,dams_sire_ Western Winter,dams_sire_ Whipper,dams_sire_ Wild Again,dams_sire_ Woodman,dams_sire_ Written Tycoon,dams_sire_ Xaar,dams_sire_ Yachtie,dams_sire_ Zabeel,dams_sire_ Zamindar,dams_sire_ Zeditave
0,1,1,3,131,1072,6,2.2,0,1600,725000,...,0,0,0,0,0,0,0,0,0,0
1,1,2,4,129,1179,7,4.9,0,1600,725000,...,0,0,0,0,0,0,0,0,0,0
2,1,3,7,121,1053,4,18.0,0,1600,725000,...,0,0,0,0,0,0,0,0,0,0
3,1,4,2,132,1076,3,5.7,0,1600,725000,...,0,0,0,0,0,0,0,0,0,0
4,1,5,1,133,1155,8,7.0,0,1600,725000,...,0,0,0,0,0,0,0,0,0,0


### Exploration

In [20]:
def plot_relations(col1,col2):
    # grab relation data
    total_count = runs_data.groupby(col1)[col2].count()
    conditional_prob = runs_data[runs_data[col2] == 1].groupby(col1)[col2].count()/total_count
    
    # plot relations
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (18,4))

    ax1.bar(total_count.index, total_count.values)
    title = 'Horses Per ' + col1
    ax1.set_title(title)
    ax1.set_ylabel('Count')
    ax1.set_xlabel(col1)

    ax2.bar(conditional_prob.index, conditional_prob.values)
    ax2.set_title('Conditional Rate')
    ylabel = 'P( ' + col2 + ' | ' + col1 + ' )'
    ax2.set_ylabel(ylabel)
    ax2.set_xlabel(col1)

    plt.show()

In [21]:
'''plot_relations('horse_country','won')
plot_relations('horse_age','won')
plot_relations('actual_weight','won')
plot_relations('declared_weight','won')
plot_relations('draw','won')
plot_relations('horse_type','won')
plot_relations('win_odds','won')
plot_relations('place_odds','won')'''
plot = False