In [104]:
# Assert minimum versions 
import sys 
assert sys.version_info >= (3, 5) 
import sklearn 
assert sklearn.__version__ >= "0.20" 
 
# Import packages and modules that will be used 
import numpy as np 
import pandas as pd
from sklearn import linear_model
from sklearn import metrics 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import accuracy_score 

# Import and configure matplotlib 
%matplotlib inline  
import matplotlib as mpl  
import matplotlib.pyplot as plt 
mpl.rc('figure', dpi=120) # set good resolution

# Set a seed for reproducability
import random
random.seed(42)
# numpy needs a random seed, too
np.random.seed(42)

import datetime
from datetime import date

In [105]:
# Read the horse data CSV into the working DataFrame `df`.
horse_data_csv = '/Users/phillipmonk/research_paper/horse_code/data/horse_data.csv'
df = pd.read_csv(horse_data_csv)

In [106]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,horse_name,foaled,colour,sire,dam,sex,group_1_wins,starts,firsts,seconds,...,date,dist,cond,weight,800m,400m,margin,rating,odds,odds_source
0,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,31/07/23,1100,9.0,61.5,2.0,1.0,10.2,61,20.0,SP
1,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,02/01/23,1125,3.0,54.0,7.0,5.0,6.7,61,11.0,SP
2,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,16/12/22,1109,3.0,61.5,6.0,6.0,0.8,61,6.5,SP
3,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,08/12/22,1100,3.0,58.5,4.0,5.0,1.0,61,9.5,SP
4,A Good Yarn,18/09/2013,Brown or Black,O'lonhro,Little Thread,Gelding,0,46,6,8,...,27/11/22,1000,4.0,56.5,11.0,10.0,5.6,63,15.0,SP


In [107]:
# Parse the day/month/two-digit-year strings in `date`, then store them back
# as plain `datetime.date` objects (the column is overwritten in place).
parsed_dates = pd.to_datetime(df['date'], format='%d/%m/%y')
df['date'] = parsed_dates.dt.date

In [108]:
# Collect the distinct jockeys that appear in rows for the selected track codes.
jockey_track_codes = ['CAUL', 'FLEM', 'RAND', 'RHIL']
jockey_track_mask = df['track'].isin(jockey_track_codes)
df_jockey = df.loc[jockey_track_mask, 'jockey'].unique()

df_jockey

array(['jarrod-fry', 'luke-nolen', 'luke-campbell', 'ben-melham',
       'patrick-moloney', 'craig-newitt', 'craig-williams',
       'jason-collett', 'kerrin-mcevoy', 'jamie-mott', 'thomas-stockdale',
       'cejay-graham', 'jamie-kah', 'william-pike', 'ethan-brown',
       'amy-mclucas', 'reece-jones', 'michael-dee', 'linda-meech',
       'brett-prebble', 'john-allen', 'regan-bayliss', 'hugh-bowman-hk',
       'jye-mcneil', 'andrew-gibbons', 'dylan-gibbons', 'tahlia-hope',
       'daniel-moor', 'carleen-hefel', 'joe-bowditch', 'damien-oliver',
       'rachel-king', 'blake-shinn', 'teo-nugent', 'luke-currie',
       'beau-mertens', 'chad-schofield', 'tyler-schiller',
       'ellen-hennessy', 'james-mcdonald', 'tim-clark', 'keagan-latham',
       'jenny-duggan', 'tom-sherry', 'luke-rolls', 'damien-thornton',
       'jess-taylor', 'nash-rawiller', 'alysha-collett', 'louise-day',
       'robbie-dolan', 'matthew-cartwright', 'tommy-berry', 'zac-lloyd',
       'brock-ryan', 'glen-boss', 'aa

In [109]:
# Write the unique jockey names to CSV (default index and header are kept).
jockey_table = pd.DataFrame(df_jockey)
jockey_table.to_csv('/Users/phillipmonk/research_paper/horse_code/data/jockeys.csv')

In [110]:
# Collect the distinct trainers from rows dated on or after 1 January 2022
# that also belong to the selected track codes.
trainer_start_date = datetime.date(2022, 1, 1)
trainer_track_codes = ['CAUL', 'FLEM', 'RAND', 'RHIL']
trainer_date_mask = df['date'] >= trainer_start_date
trainer_track_mask = df['track'].isin(trainer_track_codes)
df_trainer = df.loc[trainer_date_mask & trainer_track_mask, 'trainer'].unique()

df_trainer

array(['mervyn-mckenzie', 'dean-krongold', 'robbie-griffiths',
       'andrew-homann', 'chris-waller', 'ben-hayes', 'john-sprague',
       'james-cummings', 'gregory-mcfarlane', 'mick-price',
       'matthew-dale', 'kerry-parker', 'kristen-buchanan',
       'aaron-purcell', 'ciaron-maher', 'roger-james-nz', 'paul-snowden',
       'trent-busuttin', 'kris-lees', 'mitchell-freedman',
       'bruce-anderson', 'tony-mcevoy', 'david-payne',
       'gai-waterhouse-ao', 'mick-cerchi', 'leon-corstens',
       'andrew-forsman-nz', 'chris-bieg', 'annabel-neasham', 'bryce-heys',
       'marc-quinn', 'will-clarken', 'john-macmillan', 'shane-fliedner',
       'jason-coyle', 'sam-kavanagh', 'matt-laurie', 'terry-robinson',
       'john-hawkes', 'richard-jolly', 'bjorn-baker', 'paul-messara',
       'gary-portelli', 'robert-hickmott', 'tash-burleigh',
       'richard-freedman', 'michael-freedman', 'paul-koumis',
       'michael-moroney', 'anthony-cummings', 'robert-quinn',
       'mark-kavanagh', 'imo

In [111]:
# Write the unique trainer names to CSV (default index and header are kept).
trainer_table = pd.DataFrame(df_trainer)
trainer_table.to_csv('/Users/phillipmonk/research_paper/horse_code/data/trainers.csv')