## Golf Analysis 

Data is from the Hole19 app of my round history.

In [31]:
# Golf Analysis from Hole19 data.

import pandas as pd
import numpy as np

df = pd.read_csv('./data/Hole19Download.csv', header=None)

df = pd.DataFrame(df.values.reshape(-1, 8), 
                  columns=['ID','Course','GameType','OverPar','Score', 'Putts', 'GIR', 'FairwayHit'])

df.head()

Unnamed: 0,ID,Course,GameType,OverPar,Score,Putts,GIR,FairwayHit
0,45055,William Sahm Golf Course,9 Holes STK PLAY,3,38,16,0.556,0.571
1,45053,StonyCreek Golf Club (Par 3),9 Holes STK PLAY,3,30,19,0.778,0.0
2,45052,StonyCreek Golf Club (The Championship),18 Holes STK PLAY,13,84,39,0.556,0.5
3,45038,William Sahm Golf Course,9 Holes STK PLAY,11,46,18,0.333,0.571
4,45031,William Sahm Golf Course,18 Holes STK PLAY,9,79,32,0.333,0.5


### Definitions

Table grain is at the round level.

|Column Name|Description|
|---|---|
|ID|test|
|Course|Name of golf course|
|GameType|Number of holes and type of game played|
|OverPar|Number of strokes over par|
|Score|Total score (Score - OverPar = Par for the Course)|
|Putts|Total number of putts|
|GIR|Greens hit in regulation|
|FairwayHit|Total percentage of fairways|



In [32]:
# create a new column 'new_column_name' based on the text of 'old_column_name'
df = df.assign(NumHoles=df['GameType'].str[:2])

In [33]:
# convert column "a" of a DataFrame
df["OverPar"] = pd.to_numeric(df["OverPar"])
df["Score"] = pd.to_numeric(df["Score"])
df["Putts"] = pd.to_numeric(df["Putts"])
df["GIR"] = pd.to_numeric(df["GIR"])
df["FairwayHit"] = pd.to_numeric(df["FairwayHit"])
df["NumHoles"] = pd.to_numeric(df["NumHoles"])

In [34]:
df.describe()

Unnamed: 0,OverPar,Score,Putts,GIR,FairwayHit,NumHoles
count,25.0,25.0,25.0,25.0,25.0,25.0
mean,11.84,70.52,29.76,0.34748,0.55052,15.12
std,4.26888,19.194444,8.476831,0.171569,0.182693,4.085748
min,3.0,30.0,16.0,0.0,0.0,9.0
25%,9.0,50.0,22.0,0.222,0.5,9.0
50%,12.0,81.0,32.0,0.333,0.571,18.0
75%,14.0,84.0,37.0,0.444,0.571,18.0
max,23.0,95.0,40.0,0.778,1.0,18.0


In [35]:
df.NumHoles.unique()

array([ 9, 18, 13, 14], dtype=int64)

In [36]:
df.dtypes


ID             object
Course         object
GameType       object
OverPar         int64
Score           int64
Putts           int64
GIR           float64
FairwayHit    float64
NumHoles        int64
dtype: object

In [37]:
#calculate quartiles for each numeric column in DataFrame
df.quantile(q=[0.25, 0.5, 0.75], axis=0, numeric_only=True)

Unnamed: 0,OverPar,Score,Putts,GIR,FairwayHit,NumHoles
0.25,9.0,50.0,22.0,0.222,0.5,9.0
0.5,12.0,81.0,32.0,0.333,0.571,18.0
0.75,14.0,84.0,37.0,0.444,0.571,18.0


In [38]:
df.tail()

Unnamed: 0,ID,Course,GameType,OverPar,Score,Putts,GIR,FairwayHit,NumHoles
20,44814,William Sahm Golf Course,18 Holes STK PLAY,14,84,38,0.389,0.643,18
21,44810,William Sahm Golf Course,18 Holes STK PLAY,12,82,33,0.333,0.5,18
22,44803,Brookshire Golf Club,9 Holes STK PLAY,8,44,18,0.222,1.0,9
23,44796,Brookshire Golf Club,9 Holes STK PLAY,14,50,18,0.222,0.833,9
24,44789,William Sahm Golf Course,18 Holes STK PLAY,16,86,35,0.389,0.714,18


In [39]:
dfplot = pd.DataFrame(df.groupby('Course').OverPar.mean())

In [40]:
import plotly.express as px
fig = px.bar(dfplot)
fig.show()

### Check correlations between GIR and Putts

In [41]:
df.corr()

Unnamed: 0,OverPar,Score,Putts,GIR,FairwayHit,NumHoles
OverPar,1.0,0.728735,0.657518,-0.284967,0.102582,0.552989
Score,0.728735,1.0,0.935756,-0.035594,-0.038911,0.96986
Putts,0.657518,0.935756,1.0,0.233175,-0.218574,0.930824
GIR,-0.284967,-0.035594,0.233175,1.0,-0.517148,0.097277
FairwayHit,0.102582,-0.038911,-0.218574,-0.517148,1.0,-0.141649
NumHoles,0.552989,0.96986,0.930824,0.097277,-0.141649,1.0


### Correlations

We can see the correlation between strokes over par and green in regulations is slightly negative. While I wouldn't say this is correlated, it is a good sanity check to understand that as my score goes down, the green in regulations go up. 

My putts are correlated to scoring over par. The more putts I hit, my score goes up.

In [42]:
df18 = df[df['NumHoles'] > 10]
df9 = df[df['NumHoles'] <= 10]

In [43]:
df18.head()

Unnamed: 0,ID,Course,GameType,OverPar,Score,Putts,GIR,FairwayHit,NumHoles
2,45052,StonyCreek Golf Club (The Championship),18 Holes STK PLAY,13,84,39,0.556,0.5,18
4,45031,William Sahm Golf Course,18 Holes STK PLAY,9,79,32,0.333,0.5,18
5,45019,William Sahm Golf Course,18 Holes STK PLAY,13,83,32,0.222,0.462,18
6,45013,William Sahm Golf Course,18 Holes STK PLAY,10,80,30,0.222,0.571,18
7,44986,William Sahm Golf Course,18 Holes STK PLAY,17,87,37,0.222,0.5,18


In [44]:
def CourseAverage(df):
        if df[df['NumHoles']] == 9:
                df = pd.DataFrame(df.groupby('Course').OverPar.mean())
                df['HoleNum'] == 9
        else:
                df = pd.DataFrame(df.groupby('Course').OverPar.mean())
                df['HoleNum'] == 18
        print(df)

In [45]:
dfplot9.head()

NameError: name 'dfplot9' is not defined

In [None]:
df['18HoleFlag'] = np.where(df['NumHoles'] > 10, True, False)

In [None]:
df = pd.DataFrame(df.groupby(['Course', 'NumHoles', '18HoleFlag']).OverPar.mean())

In [None]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OverPar
Course,NumHoles,18HoleFlag,Unnamed: 3_level_1
Brickyard Crossing GC,18,True,23.0
Brookshire Golf Club,9,False,10.75
StonyCreek Golf Club (Par 3),9,False,3.0
StonyCreek Golf Club (The Championship),18,True,13.0
William Sahm Golf Course,9,False,7.0


In [None]:
import plotly.express as px

df = df
fig = px.bar(df, x='Course', y='OverPar',
             hover_data=['GameType', 'Putts'], color='18HoleFlag',
             labels={'OverPar':'Strokes over par'}, height=400)
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['OverPar'] but received: Course

In [None]:
import plotly.express as px

df = px.data.gapminder().query("continent == 'Oceania'")
fig = px.bar(df, x='year', y='pop',
             hover_data=['lifeExp', 'gdpPercap'], color='country',
             labels={'pop':'population of Canada'}, height=400)
fig.show()

In [None]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
60,Australia,Oceania,1952,69.12,8691212,10039.59564,AUS,36
61,Australia,Oceania,1957,70.33,9712569,10949.64959,AUS,36
62,Australia,Oceania,1962,70.93,10794968,12217.22686,AUS,36
63,Australia,Oceania,1967,71.1,11872264,14526.12465,AUS,36
64,Australia,Oceania,1972,71.93,13177000,16788.62948,AUS,36


In [None]:
import plotly.express as px

df = dfplot
fig = px.bar(df, x='year', y='pop',
             hover_data=['lifeExp', 'gdpPercap'], color='country',
             labels={'pop':'population of Canada'}, height=400)
fig.show()

NameError: name 'dfplot' is not defined