# EDA for Big Data Derby Competition

### Creating paths to different folders

In [None]:
import os
from pathlib import Path

# Directory the notebook is executed from (the current working directory).
NOTEBOOK_DIR = Path.cwd()
# Repository root: two levels above the notebook directory.
WORK_DIR = NOTEBOOK_DIR.parent.parent
# Folder that holds the datasets.
DATA_DIR = WORK_DIR.joinpath('data')

# Sanity check: show every resolved location.
for _label, _location in (
    ("Notebook dir: ", NOTEBOOK_DIR),
    ("Repo dir: ", WORK_DIR),
    ("Data dir: ", DATA_DIR),
):
    print(_label, _location)

### Loading csv into dataframes

In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns

# Column names applied to the tracking file through `names=`.
# NOTE(review): passing `names=` assumes the CSV has no header row -- confirm.
_COMPLETE_DATA_COLUMNS = [
    'track_id',
    'race_date',
    'race_number',
    'program_number',
    'trakus_index',
    'latitude',
    'longitude',
    'distance_id',
    'course_type',
    'track_condition',
    'run_up_distance',
    'race_type',
    'purse',
    'post_time',
    'weight_carried',
    'jockey',
    'odds',
    'position_at_finish',
]
complete_data = pd.read_csv(
    DATA_DIR / 'nyra_2019_complete.csv',
    names=_COMPLETE_DATA_COLUMNS,
)
# Number of rows per race date (the result is not stored).
complete_data['race_date'].value_counts()
# Same rows, indexed by the keys that identify one horse in one race.
mix_cd = complete_data.set_index(
    ['track_id', 'race_date', 'race_number', 'program_number']
)


In [None]:
# One row per (track, date, race, jockey): keep the first recorded value of
# each result column within the group.
# The columns are selected with a *list*; selecting them with a bare tuple
# (`groupby(...)['a', 'b']`) was deprecated in pandas 1.0 and raises an error
# in pandas 2.0.
race_result_df = (
    complete_data
    .groupby(['track_id', 'race_date', 'race_number', 'jockey'])[
        ['race_type', 'track_condition', 'program_number', 'weight_carried', 'position_at_finish']
    ]
    .first()
)
# Turn the group keys back into regular columns for easier filtering.
rcd = race_result_df.reset_index()
rcd.head()

## Load Race table df, and check race location distribution


In [None]:
# Read the race table from the data folder.
race_table_path = DATA_DIR / 'nyra_race_table.csv'
race_data = pd.read_csv(race_table_path)

# Bar chart with the number of rows per track.
race_data['track_id'].value_counts().plot.bar()

## Managing data about races... Distance, Velocity and Acceleration

### For now, let's pick good jockeys for evaluation, i.e. who has the best and the worst "W/L" record over the most races

In [None]:
# Summarise the jockeys of all first-place finishes; the `top`/`freq`
# entries of the summary identify the jockey with the most wins.
rcd.loc[rcd['position_at_finish'] == 1, 'jockey'].describe()

In [None]:
# Same summary for the "loser jockey" case.
# The number of competitors differs from race to race, so a fixed finishing
# position has to stand in for "last place"; position 6 is used here.
# NOTE(review): confirm that 6 is the most common field size.

rcd.loc[rcd['position_at_finish'] == 6, 'jockey'].describe()


### Now that we know that 'Manuel Franco' and 'Dylan Davis' are respectively the "most 1st" and "most sixth", let's try to add distance, velocity and acceleration

#### Considerations: Trakus index starts at 1.

In [None]:
from math import radians, cos, sin, asin, sqrt

# Let's def function to apply Haversine formulae
# this snippet of code is extracted from https://www.geeksforgeeks.org/program-distance-two-points-earth/
# by Aarti_Rathi
def haversine_distance(row):
    """Return the great-circle distance between two points of ``row``.

    Args:
        row: Object exposing ``latitude``/``longitude`` (current point) and
            ``shift_latitude``/``shift_longitude`` (other point), all in
            degrees, e.g. a DataFrame row passed by ``DataFrame.apply``.

    Returns:
        The distance in kilometres, using an Earth radius of 6371 km.
    """
    # Convert from degrees to radians.
    lon1 = radians(row.longitude)
    lon2 = radians(row.shift_longitude)
    lat1 = radians(row.latitude)
    lat2 = radians(row.shift_latitude)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Floating-point rounding can push `a` a hair above 1 for (nearly)
    # antipodal points, which would make asin() raise a math domain error.
    # `min(a, 1.0)` (in this argument order) still lets NaN pass through.
    c = 2 * asin(sqrt(min(a, 1.0)))

    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371

    # calculate the result
    return c * r


So we now (at least I think) have the distance the horse has run. Now let's check how many readings Trakus made during the run-up (58 feet in this example).
We may want to replace iterrows with cumsum.

#### Now let's get some more interesting stats, like velocity and acceleration!

In [None]:
# Time between two consecutive tracking samples, in seconds. Shared by both
# helpers so that they cannot drift apart.
# NOTE(review): 0.25 s is the value both helpers originally hard-coded --
# confirm it against the dataset description.
TRAKUS_SAMPLE_PERIOD_S = 0.25


def velocity(row, sample_period_s=TRAKUS_SAMPLE_PERIOD_S):
    """Return the speed over one sampling interval, in metres per second.

    Args:
        row: Object exposing ``trakus_distance``, the distance covered since
            the previous sample, in kilometres.
        sample_period_s: Duration of one sampling interval in seconds.

    Returns:
        ``trakus_distance`` converted to metres, divided by the interval.
    """
    distance_m = row.trakus_distance * 1000
    return distance_m / sample_period_s


def acceleration(row, sample_period_s=TRAKUS_SAMPLE_PERIOD_S):
    """Return the acceleration over one sampling interval, in m/s^2.

    Args:
        row: Object exposing ``velocity`` (current sample) and
            ``shift_velocity`` (previous sample).
        sample_period_s: Duration of one sampling interval in seconds.

    Returns:
        The velocity change divided by the interval.
    """
    delta_v = row.velocity - row.shift_velocity
    return delta_v / sample_period_s


### Let's find a slower competitor

In [None]:
# Jockey of the horse that finished sixth in race 7 at AQU on 2019-04-19.
complete_data.loc[
    (complete_data['track_id'] == "AQU")
    & (complete_data['race_date'] == "2019-04-19")
    & (complete_data['race_number'] == 7)
    & (complete_data['position_at_finish'] == 6),
    'jockey',
]

Now, to get the same results as for the previous jockey, let's write a function that builds these isolated dataframes

In [None]:
def isolate_jockey_and_race(complete_data_df, jockey, track_id, race_date, race_number,
                            runup_km=0.0176784):
    """Extract one jockey's tracking samples for one race and add kinematics.

    The matching rows are sorted by ``trakus_index`` and extended with the
    columns ``shift_latitude``, ``shift_longitude``, ``trakus_distance``,
    ``distance_covered`` (both in kilometres), ``velocity``,
    ``shift_velocity`` and ``acceleration``. A short distance summary is
    printed as a side effect.

    Args:
        complete_data_df: DataFrame with at least the columns ``jockey``,
            ``track_id``, ``race_date``, ``race_number``, ``trakus_index``,
            ``latitude``, ``longitude`` and ``distance_id``.
        jockey: Jockey name to select.
        track_id: Track identifier to select.
        race_date: Race date to select (compared with ``==``).
        race_number: Race number to select.
        runup_km: Run-up distance in kilometres. The default, 0.0176784 km,
            is 58 feet.

    Returns:
        The extended DataFrame for the selected jockey and race.

    Raises:
        ValueError: If no row matches the requested jockey and race.
    """
    # Select the samples of the requested jockey in the requested race.
    selection = (
        (complete_data_df.jockey == jockey)
        & (complete_data_df.track_id == track_id)
        & (complete_data_df.race_date == race_date)
        & (complete_data_df.race_number == race_number)
    )
    single_jockey_race = complete_data_df[selection].sort_values(by='trakus_index')
    if single_jockey_race.empty:
        # Fail early with a readable message instead of an obscure error
        # further down.
        raise ValueError(
            f'No rows found for jockey {jockey!r} at {track_id!r} on '
            f'{race_date!r}, race number {race_number!r}'
        )

    # Add the previous sample's coordinates to every row. The first row has
    # no predecessor and gets (0, 0).
    previous = single_jockey_race.shift()
    single_jockey_race['shift_latitude'] = previous['latitude'].fillna(0)
    single_jockey_race['shift_longitude'] = previous['longitude'].fillna(0)

    # Distance between the current sample and the previous one. Any step
    # longer than 1 km is reset to 0; this removes the bogus first step
    # measured from the (0, 0) placeholder.
    single_jockey_race['trakus_distance'] = single_jockey_race.apply(haversine_distance, axis='columns')
    single_jockey_race['trakus_distance'] = single_jockey_race.trakus_distance.map(lambda td: 0 if td > 1 else td)
    single_jockey_race['distance_covered'] = single_jockey_race.trakus_distance.cumsum()

    # Velocity per sample, then acceleration from consecutive velocities
    # (the first row is compared against a velocity of 0).
    single_jockey_race['velocity'] = single_jockey_race.apply(velocity, axis='columns')
    single_jockey_race['shift_velocity'] = single_jockey_race['velocity'].shift().fillna(0)
    single_jockey_race['acceleration'] = single_jockey_race.apply(acceleration, axis='columns')

    # Distance covered at the first sample past the run-up; if the run-up is
    # never exceeded, fall back to the whole distance.
    total_km = single_jockey_race.trakus_distance.sum()
    covered_km = single_jockey_race['distance_covered']
    beyond_runup = covered_km[covered_km > runup_km]
    rand_d = beyond_runup.iloc[0] if not beyond_runup.empty else total_km

    total_distance = total_km * 1000
    runup_distance = rand_d * 1000
    race_distance = (single_jockey_race.distance_id.unique()[0] / 100) * 201.168 # from furlongs to meters
    excess_distance = total_distance - runup_distance - race_distance

    print(f'''Race of {jockey}, in {track_id}, {race_date}, race number: {race_number}
    Total distance ran: {total_distance} m
    runup distance: {runup_distance} m
    declared race distance: {race_distance} m
    excess distance: {excess_distance} m
    ''')

    return single_jockey_race

    

In [None]:
# Tracking samples and kinematics for Hector Rafael Diaz Jr. in race 7 at
# AQU on 2019-04-19.
hector_rdj_df = isolate_jockey_and_race(
    complete_data,
    "Hector Rafael Diaz Jr.",
    "AQU",
    "2019-04-19",
    7,
)


In [None]:
# Tracking samples and kinematics for Manuel Franco in the same race
# (race 7 at AQU on 2019-04-19).
manuel_f_df = isolate_jockey_and_race(
    complete_data,
    "Manuel Franco",
    "AQU",
    "2019-04-19",
    7,
)


#### Let's do some visualization!

In [None]:
# Summary statistics of Manuel Franco's per-sample acceleration.
manuel_f_df['acceleration'].describe()

In [None]:
# Summary statistics of Hector Rafael Diaz Jr.'s per-sample acceleration.
hector_rdj_df['acceleration'].describe()

In [None]:

# Velocity and acceleration against covered distance, for both riders on the
# same axes. The tuple order fixes the drawing order of the four lines.
for _column, _frame, _label in (
    ("velocity", manuel_f_df, "velocity_manuel"),
    ("velocity", hector_rdj_df, "hector_velocity"),
    ("acceleration", manuel_f_df, "acc_manuel"),
    ("acceleration", hector_rdj_df, "acc_hector"),
):
    sns.lineplot(data=_frame, x="distance_covered", y=_column, label=_label)
plt.legend()