In [None]:
# also explore: https://scikit-learn.org/stable/modules/feature_selection.html

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn import metrics

%matplotlib inline

In [42]:
# TRAIN_FILE = './data/tony-finau-round-course-train.csv'
# TEST_FILE = './data/tony-finau-round-course-test.csv'

TRAIN_FILE = './data/2015-2017-round-course-train.csv'
TEST_FILE = './data/2015-2017-round-course-test.csv'

In [43]:
def parse_teetime_hour(val):
    return val[0:val.find(':')]

def load_and_process_data(file):
    df = pd.read_csv(file, index_col=None)
    
    # Convert string fields (course data) to numeric
    label_encoder = LabelEncoder()
    df['CourseFairwayFirmness'] = label_encoder.fit_transform(df['CourseFairwayFirmness'])
    df['CourseGreenFirmness'] = label_encoder.fit_transform(df['CourseGreenFirmness'])
    df['CourseGreenGrass'] = label_encoder.fit_transform(df['CourseGreenGrass'])
    df['CourseFairwayGrass'] = label_encoder.fit_transform(df['CourseFairwayGrass'])
    df['CourseTeeGrass'] = label_encoder.fit_transform(df['CourseTeeGrass'])
    df['CourseRoughGrass'] = label_encoder.fit_transform(df['CourseRoughGrass'])

    # Convert TeeTime into just the hour
    df['TeeTime'] = df['TeeTime'].apply(parse_teetime_hour)
    
    # Convert -1 for no attempts to mean value??
    # imputer = preprocessing.Imputer(missing_values='-1', strategy='mean', axis=0)
    # imputer1 = imputer.fit_transform(df['Approach50-75'].values.reshape(-1,1))
    # df['Approach50-75'] = imputer1
    
    # Drop score related columns
    df.drop(
        columns=['ScoreEagles', 'ScoreBirdies', 'ScorePars', 'ScoreBogeys', 'ScoreDoubles', 'ScoreOthers', 'ScoreOverPar'],
        axis=1,
        inplace=True
    )
    
    # Drop strokes-gained related columns (calculated after the round)
    df.drop(
        columns=['SGPutting', 'SGTeeToGreen', 'SGTotal', 'SGOffTheTee', 'SGApproach', 'SGAroundTheGreen'],
        axis=1,
        inplace=True
    )
    
    # Drop end of round/event position
    df.drop(
        columns=['EndOfRoundFinishPosition', 'EndOfEventFinishPosition'],
        axis=1,
        inplace=True
    )
    
    return df

def split_data(df):
    # return X, y
    return df[df.columns[1:]], df[df.columns[0]]

In [44]:
df_train = load_and_process_data(TRAIN_FILE)
df_test = load_and_process_data(TEST_FILE)
X_train, y_train = split_data(df_train)
X_test, y_test = split_data(df_test)

In [45]:
X = X_train.values
y = y_train.values

X = StandardScaler().fit_transform(X)
X



array([[-1.21557683, -1.14019838,  1.02578388, ...,  0.3153713 ,
        -0.37037008,  0.32336365],
       [-1.21557683, -1.14019838,  1.02578388, ...,  0.3153713 ,
        -0.37037008,  0.32336365],
       [-1.21557683, -1.14019838,  1.02578388, ...,  0.3153713 ,
        -0.37037008,  0.32336365],
       ...,
       [ 1.21838416,  1.61240602, -1.33148482, ...,  0.7887544 ,
         0.36885292,  0.43184605],
       [ 1.21838416,  1.61240602, -1.33148482, ...,  0.7887544 ,
         0.36885292,  0.43184605],
       [ 1.21838416,  1.61240602, -1.33148482, ...,  0.7887544 ,
         0.36885292,  0.43184605]])

In [46]:
n_components = 10
cols = ['pc' + str(i+1) for i in range(n_components)]

pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X)
df_principal = pd.DataFrame(data=principalComponents, columns=cols)
df_principal.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,-1.178702,1.981185,3.410324,-0.232543,1.526125,-2.945778,-0.528402,-0.970089,0.568414,-0.278267
1,-1.471551,2.556305,-1.381781,-2.049728,0.388242,-1.9094,-3.497386,0.743764,2.937712,0.073706
2,-0.013205,1.799104,3.678729,-1.031677,1.764268,-1.621545,-0.649825,-0.967563,-1.098659,0.074878
3,-2.831524,3.072544,-1.263909,-2.862413,0.149422,0.098846,-0.34776,-0.844354,0.973776,-0.246929
4,-2.747055,3.138805,-2.239306,-3.087656,0.390938,-0.259356,-0.903145,-0.536206,0.622093,0.368298


In [47]:
df_final = pd.concat([df_principal, y_train], axis = 1)
df_final.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,Score
0,-1.178702,1.981185,3.410324,-0.232543,1.526125,-2.945778,-0.528402,-0.970089,0.568414,-0.278267,77
1,-1.471551,2.556305,-1.381781,-2.049728,0.388242,-1.9094,-3.497386,0.743764,2.937712,0.073706,73
2,-0.013205,1.799104,3.678729,-1.031677,1.764268,-1.621545,-0.649825,-0.967563,-1.098659,0.074878,72
3,-2.831524,3.072544,-1.263909,-2.862413,0.149422,0.098846,-0.34776,-0.844354,0.973776,-0.246929,69
4,-2.747055,3.138805,-2.239306,-3.087656,0.390938,-0.259356,-0.903145,-0.536206,0.622093,0.368298,70


In [48]:
pca.explained_variance_ratio_

array([0.2664863 , 0.05918143, 0.04342693, 0.03727715, 0.03006946,
       0.02661499, 0.02077027, 0.02000563, 0.01747479, 0.01717817])

In [49]:
#plt.matshow(df_train.corr())
corr = df_train.corr()
corr.style.background_gradient()

Unnamed: 0,Score,Year,RoundNumber,TotalPar,LongestDrive,Driving300,DrivingFairway,DrivingLeftRough,DrivingRightRough,Approach50-75,Approach75-100,Approach100-125,Approach125-150,Approach150-175,Approach175-200,Approach200,Approach50-75Rough,Approach75-100Rough,Approach100-125Rough,Approach125-150Rough,Approach150-175Rough,Approach175-200Rough,Approach200Rough,GreenInRegulation,AvgProximityToHole,AvgProximityFairway,AvgProximityRough,AvgProximityLeftRough,AvgProximityRightRough,GoingForGreen,ScramblingRough,ScramblingFringe,ScramblingOver30,Scrambling20Yd30Yd,Scrambling10Yd20Yd,ScramblingInside10Yd,ScramblingSand,ScramblingSandProximity,PuttingTotal,PuttingTotalGIR,Putting1,Putting3,Putting3Ft,Putting4Ft,Putting5Ft,Putting6Ft,Putting7Ft,Putting8Ft,Putting9Ft,Putting10Ft,PuttingInside5Ft,Putting5Ft10Ft,Putting4Ft8Ft,PuttingInside10Ft,Putting10Ft15Ft,Putting15Ft20Ft,Putting20Ft25Ft,PuttingOver25Ft,PuttingOver10Ft,CoursePar3Holes,CoursePar3AvgScorecardYardage,CoursePar3AvgActualYardage,CoursePar3TotalScorecardYardage,CoursePar3TotalActualYardage,CoursePar4Holes,CoursePar4AvgScorecardYardage,CoursePar4AvgActualYardage,CoursePar4TotalScorecardYardage,CoursePar4TotalActualYardage,CoursePar5Holes,CoursePar5AvgScorecardYardage,CoursePar5AvgActualYardage,CoursePar5TotalScorecardYardage,CoursePar5TotalActualYardage,CourseFairwayFirmness,CourseGreenFirmness,CourseGreenHeight,CourseRoughtHeight,CourseFairwayHeight,CourseStimp,CourseGreenGrass,CourseFairwayGrass,CourseTeeGrass,CourseRoughGrass,CourseAvgFairwayWidth250,CourseAvgActualDistance250,CourseAvgFairwayWidth275,CourseAvgActualDistance275,CourseAvgFairwayWidth300,CourseAvgActualDistance300,CourseAvgFairwayWidth325,CourseAvgActualDistance325,CourseAvgFairwayWidth350,CourseAvgActualDistance350,CourseTotalScorecardYardage,CourseTotalActualYardage
Score,1.0,-0.00255366,-0.0497206,0.216548,-0.0218798,-0.0853307,-0.244338,0.0395664,0.0496023,0.084113,0.0961416,0.0897869,0.0982582,0.123156,0.136249,0.0961335,0.0777873,0.0763712,0.0790164,0.0510379,0.0477769,0.0731402,0.0602541,-0.57045,0.207023,0.206537,0.100268,0.083079,0.0912631,-0.129881,0.00091385,0.00265218,0.146309,0.0684189,-0.0429678,0.0028771,0.048092,0.15735,0.407784,-0.313454,-0.324074,0.290498,-0.0160746,-0.0147519,-0.0521029,-0.0613436,-0.0760974,-0.0784549,-0.0818983,-0.0872831,0.0235133,0.156933,-0.179248,-0.0854894,-0.182455,-0.143486,-0.108486,-0.136256,-0.294596,0.0476387,0.0680278,-0.0160924,0.078971,-0.00788767,-0.22397,0.0726403,-0.0137827,-0.179931,-0.069231,0.22611,0.113727,-0.00830232,0.237212,0.1396,-0.0680419,-0.100326,-0.00246262,-0.0128334,-0.0557723,-0.0526314,-0.0176084,0.0372518,-0.0698507,0.00896472,-0.0591761,-0.0377449,-0.0477519,-0.0406105,-0.0540877,-0.040279,-0.0555691,-0.0406674,-0.0414301,-0.0247565,0.224211,-0.00287009
Year,-0.00255366,1.0,0.00530029,-0.0289031,0.0453017,0.117418,-0.0122457,-0.000257753,0.00620523,0.00358562,-0.00376182,0.0214033,0.0208195,0.0372889,0.0160934,0.0192273,0.00699229,0.00790295,0.00940402,0.0130246,0.0153967,-0.00144149,0.00428501,-0.00741978,0.0448963,0.0335296,0.00733689,0.00572945,0.00746499,0.0215167,0.0110667,-0.0889456,0.0231113,0.0130308,0.020416,0.00776414,0.00868492,0.0219078,0.00417001,-0.00250644,-0.00375461,0.0078545,0.0235257,0.0105725,0.0112654,0.0061592,0.00192698,0.00416043,-0.00272593,0.0076179,0.0247465,0.0179798,0.0122812,0.0195519,0.0128173,0.00316414,0.000830103,-0.000480525,0.00687438,-0.0290748,-0.0120618,-0.0129299,-0.0239812,-0.0168168,0.0398442,0.0120588,-0.0111944,0.0390585,0.00551642,-0.0354477,0.0169238,-0.0112512,-0.0305469,-0.0480417,0.05529,0.0463839,0.00938597,-0.0484522,-0.0398321,-0.00960799,-0.000466701,-0.0130626,-0.0419993,-0.0794833,-0.00664252,-0.0745811,-0.0421979,-0.103159,-0.0443795,-0.0835725,-0.0147143,-0.0296372,-0.0613178,-0.0766967,0.00202545,-0.0135046
RoundNumber,-0.0497206,0.00530029,1.0,0.0254522,-0.00932929,0.0522268,0.0164009,-0.0208357,-0.0208994,-0.0193447,-0.0271047,-0.0225086,-0.0266995,-0.0295782,-0.0394423,-0.0152376,-0.00212608,-0.00903783,-0.0125321,-0.0136325,-0.0191806,-0.00870742,0.00565319,0.0364405,-0.0168276,-0.0313901,-0.0106399,-0.00609387,-0.00917026,0.020457,0.00257137,-0.00394027,-0.0305229,-0.025292,-0.0258779,0.00514237,0.013225,-0.00487461,-0.0242173,0.0223289,0.0196513,-0.0130718,-0.0307386,-0.0184824,-0.00632742,-0.0096776,0.0012269,-0.00714195,-0.0010349,9.43201e-05,-0.0378939,-0.0224803,-0.00793427,-0.0243897,-0.00853449,-0.00836429,-0.00472807,-0.00581036,-0.0163689,-0.0118788,-0.00626253,-0.0178882,-0.0121054,-0.0196419,-0.0186686,-0.0354127,-0.0195721,-0.0266136,-0.0219271,0.0225228,0.0144971,-0.0212053,0.0224292,-0.00639188,-0.0630291,-0.0825713,0.0179582,-0.00644996,-0.0447851,-0.0113155,0.0139267,-0.00490315,0.000217025,-0.00707679,-0.0132539,-0.0235706,-0.0120607,-0.0195775,-0.0130129,-0.0225718,-0.0159486,-0.0201873,-0.0157893,-0.0253708,-0.00744782,-0.0189922
TotalPar,0.216548,-0.0289031,0.0254522,1.0,-0.158602,-0.0944582,0.0361877,-0.199451,-0.0939974,0.0402869,0.0624348,-0.00598451,-0.0634799,-0.0880344,-0.0673487,-0.0166937,-0.00914585,-0.00321909,-0.0350874,-0.132418,-0.154153,-0.105954,-0.0191322,0.00149675,-0.0872251,-0.0422572,-0.135745,-0.138257,-0.0953491,-0.0974158,-0.0468818,0.000850931,-0.0915596,-0.0774747,-0.12517,-0.0397739,0.0108835,-0.0261054,-0.0503569,-0.0205879,0.0546053,0.00942175,-0.183841,-0.0949825,-0.0655757,-0.0507936,-0.036907,-0.0276514,-0.025145,-0.0225745,-0.179353,-0.0726251,-0.115911,-0.185674,-0.0427569,-0.0467243,-0.0404986,-0.0343909,-0.0664258,-0.0870778,-0.0429386,-0.21089,-0.082187,-0.22027,-0.899767,-0.248109,-0.212115,-0.882506,-0.417074,0.972946,0.0541057,-0.196066,0.962249,0.446189,-0.0572108,-0.0655472,0.0243587,-0.193539,-0.123361,-0.104844,-0.0720796,0.0852029,-0.139602,0.0888203,-0.130712,-0.175966,-0.0814986,-0.132892,-0.0874784,-0.192442,-0.0987722,-0.195882,-0.0774184,-0.166159,0.421444,-0.162644
LongestDrive,-0.0218798,0.0453017,-0.00932929,-0.158602,1.0,0.446924,-0.106206,0.23235,0.221766,0.0714817,0.128169,0.185573,0.245049,0.278643,0.274309,0.265441,0.0610107,0.0795618,0.12481,0.153412,0.153431,0.102951,0.121546,-0.0428345,0.433227,0.398851,0.276036,0.198748,0.20478,0.346122,0.25291,0.125729,0.178134,0.177609,0.32148,0.216525,0.00966401,0.165528,-0.0159252,-0.0340385,0.0208453,0.0201082,0.494289,0.287698,0.198956,0.15158,0.129597,0.104417,0.0944378,0.0877703,0.489028,0.233804,0.310535,0.47988,0.137777,0.0862862,0.060818,0.0632053,0.188054,-0.0251959,0.0627734,0.426532,0.0331255,0.413259,0.159791,0.343922,0.436762,0.239511,0.43215,-0.163358,0.097677,0.432117,-0.145222,0.305611,0.131417,0.143855,0.062458,0.355545,0.406746,0.396591,0.190966,0.12184,0.120803,0.101946,0.397422,0.365436,0.370908,0.274572,0.37646,0.349301,0.387922,0.420915,0.378961,0.370565,0.183609,0.434694
Driving300,-0.0853307,0.117418,0.0522268,-0.0944582,0.446924,1.0,-0.19824,0.254014,0.226178,0.0512631,0.0835253,0.141695,0.18873,0.195525,0.180796,0.161337,0.0567086,0.0807194,0.156295,0.183299,0.152651,0.055978,0.10521,0.0328994,0.338067,0.274296,0.203363,0.176442,0.163289,0.362393,0.10915,0.0609341,0.109496,0.099929,0.229096,0.176235,0.0298543,0.149101,-0.00355201,0.0206822,0.00665618,0.0159256,0.37825,0.211031,0.155024,0.113469,0.106618,0.0744057,0.075984,0.0717766,0.376038,0.179076,0.233605,0.364382,0.108157,0.0662934,0.0467862,0.049838,0.145643,-0.0733265,0.0283866,0.321758,-0.0212316,0.303052,0.120713,0.250161,0.347231,0.18055,0.354217,-0.110816,0.0314614,0.325923,-0.105002,0.207561,-0.0999196,-0.109064,-0.00107965,0.244733,0.105721,0.137424,-0.0239683,-0.020126,0.0252092,-0.0260625,0.285081,0.263063,0.259679,0.187376,0.277582,0.256828,0.316297,0.356659,0.333769,0.305419,0.133239,0.338643
DrivingFairway,-0.244338,-0.0122457,0.0164009,0.0361877,-0.106206,-0.19824,1.0,-0.494668,-0.52131,-0.00146113,0.0182686,0.0333072,0.0190777,-0.0294757,-0.0391807,0.0318181,-0.0869278,-0.119125,-0.192582,-0.240961,-0.245018,-0.216753,-0.241497,0.322862,-0.173103,-0.0561253,-0.148429,-0.221991,-0.217539,-0.0077487,-0.0815519,-0.0303363,-0.202532,-0.114034,-0.078287,-0.045522,-0.0741682,-0.072055,0.0760654,0.28769,-0.0743035,0.00763545,-0.0705019,-0.0441723,-0.0313813,-0.0392511,-0.0211185,-0.0193677,-0.0135931,-0.0153443,-0.0733746,-0.038697,-0.0464295,-0.063923,-0.0255628,-0.0133608,-0.00273297,-0.0107103,-0.0345483,-0.0353988,-0.0175114,-0.0923613,-0.0332327,-0.0959196,-0.0184349,-0.109013,-0.112655,-0.0498031,-0.110972,0.02773,0.00713036,-0.101558,0.0266275,-0.0680358,0.0752292,0.0866301,0.0562495,-0.0377781,-0.0578045,-0.0612448,-0.0174041,-0.0400645,0.0234772,-0.0141468,0.0323746,-0.0376668,0.0442964,0.000655716,0.0335417,-0.0322399,-0.0161088,-0.0955349,-0.037618,-0.0930348,-0.0566497,-0.107164
DrivingLeftRough,0.0395664,-0.000257753,-0.0208357,-0.199451,0.23235,0.254014,-0.494668,1.0,0.0974912,0.0296356,0.0592174,0.0854632,0.138612,0.183467,0.199784,0.163155,0.0944024,0.128826,0.206577,0.263862,0.271857,0.227482,0.244802,-0.140129,0.354183,0.279728,0.293576,0.455802,0.14993,0.205135,0.151261,0.0766083,0.191153,0.176974,0.269308,0.178281,0.036736,0.124615,-0.0428982,-0.122895,0.0363497,-0.0147657,0.380842,0.224008,0.148826,0.115838,0.0913546,0.0779016,0.0678338,0.0643141,0.379777,0.177981,0.233846,0.367885,0.0981144,0.0597779,0.0455484,0.0489627,0.140973,0.0473683,0.0785876,0.390396,0.0861044,0.389322,0.166318,0.152197,0.406392,0.194566,0.418151,-0.187098,-0.0172788,0.397122,-0.18656,0.226701,0.0416954,0.0292138,-0.045743,0.219164,0.21381,0.225226,0.0554966,0.0437472,0.0235973,0.0639565,0.277092,0.302336,0.240309,0.212525,0.259075,0.293654,0.311586,0.391279,0.317021,0.356611,-0.00370357,0.397359
DrivingRightRough,0.0496023,0.00620523,-0.0208994,-0.0939974,0.221766,0.226178,-0.52131,0.0974912,1.0,0.0323765,0.0595042,0.0955661,0.140959,0.20477,0.20204,0.1865,0.0888604,0.113747,0.199772,0.270703,0.290778,0.26654,0.269669,-0.1332,0.389491,0.303786,0.320899,0.147361,0.44488,0.230106,0.147387,0.0799802,0.197819,0.202118,0.285747,0.176322,0.0444524,0.136051,-0.050234,-0.120067,0.0431981,-0.012537,0.399368,0.229126,0.15826,0.128883,0.100074,0.0895352,0.0804068,0.0709523,0.396402,0.185363,0.250705,0.386004,0.112534,0.072296,0.0473463,0.0581616,0.163069,-0.00226062,0.0740871,0.405806,0.0528866,0.396368,0.0891514,0.12835,0.42332,0.117978,0.413576,-0.0938769,-0.00528377,0.415041,-0.0912269,0.307049,0.0551139,0.0498788,0.0121508,0.218156,0.245248,0.245243,0.0590013,0.0610972,0.0684672,0.0424527,0.301626,0.341054,0.278383,0.249788,0.283299,0.316979,0.323023,0.407503,0.32121,0.375986,0.0535474,0.420772
Approach50-75,0.084113,0.00358562,-0.0193447,0.0402869,0.0714817,0.0512631,-0.00146113,0.0296356,0.0323765,1.0,0.0272576,0.0481437,0.0721155,0.0756064,0.0839507,0.0811056,0.00846283,0.0285002,0.0200891,0.0152355,0.0107253,0.0114099,0.0135254,-0.0550447,0.121963,0.127878,0.0634866,0.0380317,0.0329131,0.0237495,0.0247812,0.0257867,0.0911955,0.0616895,0.0824061,0.0564527,0.00426597,0.0421234,0.0172473,-0.0372775,-0.013324,0.0192331,0.129005,0.0842337,0.0505234,0.0409338,0.034084,0.016335,0.0226904,0.0294665,0.128652,0.0633993,0.0829503,0.126147,0.0340907,0.0206643,0.0152583,0.00608155,0.035671,-0.0327491,0.019195,0.098915,-0.00438015,0.0915851,-0.0234403,-0.014532,0.0940382,-0.0267144,0.081404,0.0324156,0.0714402,0.10428,0.042584,0.110949,0.0328245,0.0156078,0.00898211,0.0469621,0.0292188,0.0156133,0.0175598,0.0202173,-0.00366269,0.0271584,0.104053,0.0824731,0.110019,0.0614547,0.106234,0.0796207,0.0937286,0.0836149,0.0865978,0.073292,0.0462186,0.100482


In [50]:
df_corr = pd.DataFrame(corr['Score'], index=None, columns=['Score'])
df_corr.reset_index(level=0, inplace=True)
df_corr.columns = ['Name', 'Score']
df_corr.sort_values(by='Score', ascending=False, inplace=True)
df_corr.head(11)

Unnamed: 0,Name,Score
0,Score,1.0
38,PuttingTotal,0.407784
41,Putting3,0.290498
72,CoursePar5TotalScorecardYardage,0.237212
69,CoursePar5Holes,0.22611
94,CourseTotalScorecardYardage,0.224211
3,TotalPar,0.216548
24,AvgProximityToHole,0.207023
25,AvgProximityFairway,0.206537
37,ScramblingSandProximity,0.15735
