# EDA for Big Data Derby Competition

### Creating paths to different folders

In [None]:
import os
from pathlib import Path

# Directory the notebook is being executed from.
NOTEBOOK_DIR = Path.cwd()
# Root of the repo: two levels above the notebook directory.
WORK_DIR = NOTEBOOK_DIR.parent.parent
# Folder holding the datasets.
DATA_DIR = WORK_DIR.joinpath('data')

# Sanity check: print the computed locations.
print("Notebook dir: ", NOTEBOOK_DIR)
print("Repo dir: ", WORK_DIR)
print("Data dir: ", DATA_DIR)

### Loading csv into dataframes

In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns

# horse_data = pd.read_csv(DATA_DIR / 'horse_ids.csv', index_col=0)


#### Loader functions

In [None]:
def load_data():
    """Load the 2019 race file and collapse it to one row per horse per race.

    Reads ``nyra_2019_complete.csv`` from ``DATA_DIR`` with the column names
    supplied explicitly, drops the tracking columns plus ``purse`` and
    ``post_time``, then groups by race_date, race_number and program_number,
    keeping the maximum of every remaining column.

    Returns:
        DataFrame indexed by (race_date, race_number, program_number).
    """
    column_names = [
        'track_id', 'race_date', 'race_number', 'program_number',
        'trakus_index', 'latitude', 'longitude', 'distance_id',
        'course_type', 'track_condition', 'run_up_distance', 'race_type',
        'purse', 'post_time', 'weight_carried', 'jockey', 'odds',
        'position_at_finish',
    ]
    unused_columns = ['trakus_index', 'latitude', 'longitude', 'purse', 'post_time']
    group_keys = ['race_date', 'race_number', 'program_number']

    raw = pd.read_csv(DATA_DIR / 'nyra_2019_complete.csv', names=column_names)
    trimmed = raw.drop(columns=unused_columns)
    return trimmed.groupby(group_keys).max()

In [None]:
# Adds coordinate list to each horse in each race in the dataframe
# TODO Could be improved to have a different I/O dataframe but didn't think it was needed

def add_coordinates(ref_df, limit=50, tracking_df=None):
    """Fill a 'coordinates' column of ``ref_df`` in place.

    For each processed row, the matching rows of the tracking table are
    ordered by ``trakus_index`` and their (latitude, longitude) pairs are
    stored as a list in that row's 'coordinates' cell.

    Args:
        ref_df: DataFrame whose index entries are
            (race_date, race_number, program_number). Modified in place.
        limit: Number of leading rows of ``ref_df`` to process. Defaults to
            50; pass ``None`` to process every row.
        tracking_df: Optional preloaded tracking table with columns
            race_date, race_number, program_number, trakus_index, latitude
            and longitude. When omitted, ``nyra_tracking_table.csv`` is read
            from ``DATA_DIR`` (on every call).

    Returns:
        None. Rows beyond ``limit`` keep whatever 'coordinates' value they
        already had (the empty-string placeholder for a freshly added column).
    """
    if tracking_df is None:
        tracking_df = pd.read_csv(DATA_DIR / 'nyra_tracking_table.csv')

    # Check if a 'coordinates' column already exists, if not, add it.
    # The column must be object dtype so a cell can hold a Python list.
    if 'coordinates' not in ref_df:
        ref_df.insert(2, column='coordinates', value="")
        ref_df['coordinates'] = ref_df['coordinates'].astype('object')

    # Only the index is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    for key in ref_df.index[:limit]:
        race_date, race_number, program_number = key[:3]

        # Tracking rows for this horse in this race, in tracking order.
        # The program number is compared after padding to width 3;
        # presumably the tracking table stores it space-padded - verify.
        sample = tracking_df[
            (tracking_df.race_date == race_date)
            & (tracking_df.race_number == race_number)
            & (tracking_df.program_number == str(program_number).ljust(3))
        ].sort_values('trakus_index')

        # TODO add the index value to tuple if needed
        coordinates = list(zip(sample.latitude, sample.longitude))

        # Adding coordinate list to each row in df
        ref_df.at[(race_date, race_number, program_number), 'coordinates'] = coordinates


#### Other functions

## Creating new dataframes

In [None]:
# New dataframe filled only with info related to race and results:
# load_data() returns one row per (race_date, race_number, program_number).

results_df = load_data()

# Preview the first five rows.
results_df.head(5)

## Manipulating dataframes

In [None]:
# Add coordinates to df as list of (latitude, longitude) tuples.
# add_coordinates modifies results_df in place and returns nothing.

add_coordinates(results_df)

# Preview the first five rows, now including the 'coordinates' column.
results_df.head(5)

## W/E

In [None]:
# Label slice on the outermost index level (race_date): rows from
# 2019-01-01 through 2019-01-31.
# NOTE(review): assumes race_date values are ISO-formatted strings so label
# order matches date order - confirm.
results_df.loc['2019-01-01':'2019-01-31']

In [None]:
# Print one line per horse that ran on 2019-01-01. Selecting a single
# race_date drops that index level, so index[0] is the race number.
# An f-string is used so a non-string jockey value (e.g. a missing one)
# is formatted instead of raising TypeError on str concatenation.
for index, number in results_df.loc['2019-01-01'].iterrows():
    print(f"{number.jockey} finished {number.position_at_finish} during race number {index[0]}")
    

In [None]:
# Rows for race number 8 on 2019-12-31.
# NOTE(review): relies on pandas resolving the ('2019-12-31', 8) pair against
# the (race_date, race_number) index levels rather than as row/column - confirm.
results_df.loc['2019-12-31', 8]

In [None]:
# Summary statistics for the jockey column.
results_df['jockey'].describe()

In [None]:
# Summary statistics for the jockeys of rows whose position_at_finish is 3.
results_df[results_df['position_at_finish'] == 3]['jockey'].describe()

In [None]:
# Count, per jockey, the rows whose position_at_finish is 3.
results_df[results_df['position_at_finish'] == 3]['jockey'].value_counts()