## Import Libraries 

In [2]:
# import required libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import chi2_contingency
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


## Stylesheet and Palettes

In [None]:
# invoke custom Matplotlib stylesheet
plt.style.use('resources/yh-style.mpl')

# Color palettes for plots.
# These are lists, not sets: a set has no defined iteration order (and string
# hashing is randomized per Python process), so a set-based palette could hand
# its colors to seaborn/Matplotlib in a different order on every kernel restart.
# A list always yields the colors in the order written here.
color_palette = [
    '#ba324f', '#e75a7c', '#ffb3c8', '#f3722c', '#f8961e',
    '#f9c74f', '#f0ffb3', '#90be6d', '#25a18b', '#0d6a87',
    '#83D7FC', '#aaa1c8', '#6b3c80', '#00ffee', '#6d8886',
]

# 10-color palette
color_palette_10 = [
    '#aaa1c8', '#ffb3c8', '#e75a7c', '#f8961e', '#f9c74f',
    '#90be6d', '#ba324f', '#0d6a87', '#6b3c80', '#00ffee',
]

# seaborn's current color cycle (palette=None returns the active palette)
default_colors = sns.color_palette(palette=None)

# 12-color blend built from '#0d6a87', with the blend direction reversed
sequential = sns.light_palette("#0d6a87", 12, reverse=True)

# show the default palette as the cell output
default_colors



SyntaxError: positional argument follows keyword argument (3147157877.py, line 11)

## Load CSV Files
The data is split across two files: one for championship results and one for information on each participating horse.

In [4]:
# read the championship results file into a dataframe
championship_df = pd.read_csv(
    filepath_or_buffer='resources/yh-championship-data.csv',
)

# preview the first 10 rows as the cell output
championship_df.head(n=10)

Unnamed: 0,Year,Division,Horse,OverallPlacing,USDFNumber,OverallScore
0,2002,FEI5,Rosabella,1,37232,7.84
1,2002,FEI5,Favereux,2,38714,7.68
2,2002,FEI5,Devon,3,38984,7.42
3,2002,FEI5,Welfenstein,4,40474,7.14
4,2002,FEI5,R-tistik,5,37123,7.1
5,2002,FEI5,Pampero,6,41386,6.82
6,2002,FEI6,Oleander,1,35062,8.24
7,2002,FEI6,Freestyle,2,39380,7.4
8,2002,FEI6,Wincenzo,3,1026740,7.0
9,2002,FEI6,Olympus,4,42683,6.86


In [13]:
# read the per-horse information file into a dataframe
horse_df = pd.read_csv(
    filepath_or_buffer='resources/yh-horse-data.csv',
)

# preview the first 10 rows as the cell output
horse_df.head(n=10)

Unnamed: 0,Horse,HighestLevel,CDI,USDFNumber,Sire,Damsire,Country,Breeder,Studbook,InternationalTeam,TeamMade,AverageGP,FirstYear,LastYear
0,Rosabella,Third Level,No,37232,Rohdiamant,Watzmann,Germany,Kerstin Ohlemeyer,Hanoverian,No,,,2001,2007
1,Favereux,Grand Prix,No,38714,Fidermark,Fidelio,Germany,Johannes Hilgers,Rhinelander,No,,60.818,2001,2013
2,Devon,Third Level,No,38984,Don Gregory,,,,Oldenburg,No,,,2001,2023
3,Welfenstein,Grand Prix,Yes,40474,Wolkenstein II,Lauries Crusador xx,Germany,Heinz Bruns,Hanoverian,No,,61.542,2001,2015
4,R-tistik,Grand Prix,Yes,37123,Ramires,Rex Fritz,Germany,Josef Kathmann,Oldenburg,No,,61.491,2001,2014
5,Pampero,FEI 5 Year Old,No,41386,Ferro,,USA,Margaret Avery,KWPN,No,,,2002,2003
6,Oleander,Grand Prix,Yes,35062,Jazz,Ulft,Netherlands,R. Van Wourdenbergh,KWPN,No,,64.122,2000,2013
7,Freestyle,Prix St. Georges,No,39380,Florestan I,Parademarsch I,Germany,,Westfalen,No,,,2002,2005
8,Wincenzo,Prix St. Georges,No,1026740,Werther,Graphit,Germany,,Hanoverian,No,,,2002,2007
9,Olympus,Grand Prix,Yes,42683,Clavecimbel,,Netherlands,G. Van Der Veen,KWPN,No,,67.532,2002,2009


In [15]:
# Build the combined dataframe in one chain: join each championship result to
# its horse record on the shared USDF number, then tidy the columns.
merged_df = (
    pd.merge(championship_df, horse_df, on='USDFNumber')
    # both inputs carry a 'Horse' column, so the merge suffixes them with
    # _x / _y; discard the copy that came from the horse file
    .drop(columns=['Horse_y'])
    # restore the plain 'Horse' name and give the other columns display names
    .rename(
        columns={
            'Horse_x': 'Horse',
            'CDI': 'CDI Competitor',
            'HighestLevel': 'Highest Level',
            'Country': 'Country Bred',
            'TeamMade': 'Team Made',
        }
    )
    # parse each year column as a four-digit year, then keep only the year part
    .assign(
        FirstYear=lambda df: pd.to_datetime(df['FirstYear'], format='%Y').dt.year,
        LastYear=lambda df: pd.to_datetime(df['LastYear'], format='%Y').dt.year,
    )
)

# preview the first 10 rows as the cell output
merged_df.head(10)



Unnamed: 0,Year,Division,Horse,OverallPlacing,USDFNumber,OverallScore,Highest Level,CDI Competitor,Sire,Damsire,Country Bred,Breeder,Studbook,InternationalTeam,Team Made,AverageGP,FirstYear,LastYear
0,2002,FEI5,Rosabella,1,37232,7.84,Third Level,No,Rohdiamant,Watzmann,Germany,Kerstin Ohlemeyer,Hanoverian,No,,,2001,2007
1,2002,FEI5,Favereux,2,38714,7.68,Grand Prix,No,Fidermark,Fidelio,Germany,Johannes Hilgers,Rhinelander,No,,60.818,2001,2013
2,2002,FEI5,Devon,3,38984,7.42,Third Level,No,Don Gregory,,,,Oldenburg,No,,,2001,2023
3,2002,FEI5,Welfenstein,4,40474,7.14,Grand Prix,Yes,Wolkenstein II,Lauries Crusador xx,Germany,Heinz Bruns,Hanoverian,No,,61.542,2001,2015
4,2002,FEI5,R-tistik,5,37123,7.1,Grand Prix,Yes,Ramires,Rex Fritz,Germany,Josef Kathmann,Oldenburg,No,,61.491,2001,2014
5,2002,FEI5,Pampero,6,41386,6.82,FEI 5 Year Old,No,Ferro,,USA,Margaret Avery,KWPN,No,,,2002,2003
6,2002,FEI6,Oleander,1,35062,8.24,Grand Prix,Yes,Jazz,Ulft,Netherlands,R. Van Wourdenbergh,KWPN,No,,64.122,2000,2013
7,2002,FEI6,Freestyle,2,39380,7.4,Prix St. Georges,No,Florestan I,Parademarsch I,Germany,,Westfalen,No,,,2002,2005
8,2002,FEI6,Wincenzo,3,1026740,7.0,Prix St. Georges,No,Werther,Graphit,Germany,,Hanoverian,No,,,2002,2007
9,2002,FEI6,Olympus,4,42683,6.86,Grand Prix,Yes,Clavecimbel,,Netherlands,G. Van Der Veen,KWPN,No,,67.532,2002,2009
