In [21]:
# Assert minimum versions
import re
import sys
assert sys.version_info >= (3, 5), "Python 3.5 or newer is required"
import sklearn
# Compare the (major, minor) numbers rather than the raw version strings:
# string comparison is lexicographic, so "0.3" >= "0.20" would wrongly pass
# and "0.100" >= "0.20" would wrongly fail.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version and tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20), \
    "scikit-learn 0.20 or newer is required, found " + sklearn.__version__
 
# Import packages and modules that will be used 
import numpy as np 
import pandas as pd
from sklearn import linear_model
from sklearn import metrics 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import accuracy_score 

# Import and configure matplotlib 
%matplotlib inline  
import matplotlib as mpl  
import matplotlib.pyplot as plt 
mpl.rc('figure', dpi=120) # set good resolution

# Set a seed for reproducibility. A single constant keeps the generators in
# sync and can be reused wherever a seed is needed later in the notebook.
import random
SEED = 42
random.seed(SEED)
# numpy keeps its own global generator, so it needs a seed, too
np.random.seed(SEED)

In [22]:
# Load the horse racing records from CSV into a DataFrame.
# NOTE(review): this reads from '.../research_paper/...' while the export
# cells further down write to '.../research_project/...' - confirm which
# directory is intended.
horse_data_csv = '/Users/phillipmonk/research_paper/horse_code/data/horse_data.csv'
df = pd.read_csv(horse_data_csv)

In [23]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,horse_name,foaled,colour,sire,dam,sex,group_1_wins,starts,firsts,seconds,...,track_date,dist,cond,weight,800m,400m,margin,rating,odds,odds_source
0,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,31/07/23,1100,9.0,61.5,2.0,1.0,10.2,61,20.0,SP
1,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,02/01/23,1125,3.0,54.0,7.0,5.0,6.7,61,11.0,SP
2,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,16/12/22,1109,3.0,61.5,6.0,6.0,0.8,61,6.5,SP
3,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,08/12/22,1100,3.0,58.5,4.0,5.0,1.0,61,9.5,SP
4,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,27/11/22,1000,4.0,56.5,11.0,10.0,5.6,63,15.0,SP


In [24]:
# Wider preview: the first ten rows
df.head(n=10)

Unnamed: 0,horse_name,foaled,colour,sire,dam,sex,group_1_wins,starts,firsts,seconds,...,track_date,dist,cond,weight,800m,400m,margin,rating,odds,odds_source
0,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,31/07/23,1100,9.0,61.5,2.0,1.0,10.2,61,20.0,SP
1,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,02/01/23,1125,3.0,54.0,7.0,5.0,6.7,61,11.0,SP
2,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,16/12/22,1109,3.0,61.5,6.0,6.0,0.8,61,6.5,SP
3,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,08/12/22,1100,3.0,58.5,4.0,5.0,1.0,61,9.5,SP
4,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,27/11/22,1000,4.0,56.5,11.0,10.0,5.6,63,15.0,SP
5,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,12/08/22,1130,8.0,58.0,6.0,4.0,5.1,64,7.5,SP
6,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,15/07/22,1100,7.0,55.5,8.0,8.0,3.0,65,12.0,SP
7,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,17/06/22,1125,10.0,58.5,3.0,3.0,0.8,65,11.0,SP
8,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,29/05/22,1100,5.0,57.5,12.0,11.0,3.6,65,20.0,SP
9,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,03/05/22,1100,6.0,58.5,12.0,12.0,0.1,65,41.0,SP


In [30]:
# Collect each distinct value of the 'jockey' column.
# Despite the `df_` prefix, the result is an array, not a DataFrame.
jockey_column = df['jockey']
df_jockey = jockey_column.unique()

df_jockey

array(['luke-williams', 'tom-madden', 'declan-bates', 'mikaela-lawrence',
       'tayla-childs', 'will-price', 'jarrod-fry', 'damian-lane',
       'luke-nolen', 'luke-campbell', 'matthew-chadwick', 'ryan-houston',
       'paul-gatt', 'lachlan-overall', 'matthew-cartwright',
       'dean-yendall', 'sheridan-clarke', 'daniel-stackhouse',
       'alana-kelly', 'beau-mertens', 'kiran-quilty', 'craig-newitt',
       'ben-melham', 'jye-mcneil', 'lachlan-king', 'sally-wynne',
       'liam-riordan', 'dylan-turner', 'carleen-hefel', 'chris-mccarthy',
       'sophie-logan', 'jake-toeroek', 'jake-duffy', 'winona-costin',
       'patrick-moloney', 'michael-poy', 'barend-vorster',
       'nash-rawiller', 'adam-hyeronimus', 'kerrin-mcevoy',
       'joao-moreira-brz', 'brenton-avdulla', 'jamie-kah',
       'craig-williams', 'jason-collett', 'hugh-bowman-hk',
       'ashley-morgan', 'ben-looker', 'ben-allen', 'robbie-downey',
       'jamie-mott', 'john-allen', 'blaike-mcdougall', 'thomas-stockdale',
 

In [None]:
# `df_jockey` is the array returned by `.unique()`, and arrays have no
# `.to_csv` method, so calling it directly raises AttributeError. Wrap the
# values in a named Series first. Missing names (NaN) are dropped and the
# positional index is left out, so the file holds a single 'jockey' column.
# NOTE(review): the source data was read from '.../research_paper/...' but
# this writes under '.../research_project/...' - confirm the directory.
pd.Series(df_jockey, name='jockey').dropna().to_csv(
    '/Users/phillipmonk/research_project/horse_code/data/jockeys.csv',
    index=False,
)

In [31]:
# Collect each distinct value of the 'trainer' column.
# Despite the `df_` prefix, the result is an array, not a DataFrame.
trainer_column = df['trainer']
df_trainer = trainer_column.unique()

df_trainer

array(['mervyn-mckenzie', 'dean-krongold', 'jason-warren',
       'robbie-griffiths', 'andrew-homann', 'chris-waller',
       'dwayne-schmidt', 'ben-hayes', 'tom-dabernig', 'john-manzelmann',
       'john-sprague', 'cody-morgan', 'les-bridge', 'james-cummings',
       'gregory-mcfarlane', 'clayton-douglas', 'mick-price',
       'matthew-dale', 'bjorn-baker', 'kerry-parker', 'kristen-buchanan',
       'aaron-purcell', 'ciaron-maher', nan, 'roger-james-nz',
       'paul-snowden', 'trent-busuttin', 'kris-lees', 'mitchell-freedman',
       'bruce-anderson', 'tony-mcevoy', 'david-payne', 'allan-denham',
       'gai-waterhouse-ao', 'mick-cerchi', 'leon-corstens',
       'andrew-forsman-nz', 'chris-bieg', 'annabel-neasham', 'bryce-heys',
       'marc-quinn', 'andrew-cameron', 'will-clarken', 'john-macmillan',
       'shane-fliedner', 'kacy-fogden', 'jason-coyle', 'sam-kavanagh',
       'matt-laurie', 'terry-robinson', 'john-hawkes', 'richard-jolly',
       'paul-messara', 'gary-portelli', 'jo

In [None]:
# `df_trainer` is the array returned by `.unique()`, and arrays have no
# `.to_csv` method, so calling it directly raises AttributeError. Wrap the
# values in a named Series first. The unique values include a missing entry
# (nan), which is dropped so the file lists only real trainer names; the
# positional index is left out, so the file holds a single 'trainer' column.
# NOTE(review): the source data was read from '.../research_paper/...' but
# this writes under '.../research_project/...' - confirm the directory.
pd.Series(df_trainer, name='trainer').dropna().to_csv(
    '/Users/phillipmonk/research_project/horse_code/data/trainers.csv',
    index=False,
)