In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split #Divide data into training and testing data
from sklearn.tree import DecisionTreeRegressor #To create Decision Tree ML model
from sklearn.metrics import mean_absolute_error #To analyze model

In [None]:
#Load the 2016 heat results
freestyle_heats_2016 = pd.read_csv('../input/mens100mfree/2016100mfree.csv')

#Predictor columns (rank is deliberately left out)
swimmer_features = ['Age', 'R.T.', 'Height', 'lbs/inch']
X = freestyle_heats_2016[swimmer_features]

#Prediction target
y = freestyle_heats_2016['Time']

#Hold out a validation split so the model is scored on rows it was not fit on;
#random_state is fixed so the split is the same on every run
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [None]:
#Function to find mae from given max_leaf_nodes
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for one tree size.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves is fit on
    ``train_X`` / ``train_y`` and then scored against ``val_y`` using its
    predictions for ``val_X``. ``random_state`` is fixed at 0 so repeated
    calls with the same arguments build the same tree.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)

    # Score on the held-out rows only.
    predicted = tree.predict(val_X)
    return mean_absolute_error(val_y, predicted)

    

In [None]:
#Tune model: score every candidate tree size and keep the best one

#Validation MAE per candidate (key = max_leaf_nodes, value = MAE)
possible_max_leaf_nodes = {}
for max_leaf_nodes in range(2, 20):
    possible_max_leaf_nodes[max_leaf_nodes] = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)

#Lowest MAE reached by any candidate
minimum_mae = min(possible_max_leaf_nodes.values())

#Every candidate that reached the minimum; on a tie the last one tried is kept
tied_for_best = [leaves for leaves, err in possible_max_leaf_nodes.items() if err == minimum_mae]
ideal_max_leaf_nodes = tied_for_best[-1]

print(ideal_max_leaf_nodes)

In [None]:
#Score the tuned model on the held-out validation split

#Build a tree with the tuned leaf limit and fit it on the training split only
practice_swimmer_model = DecisionTreeRegressor(
    random_state=0,
    max_leaf_nodes=ideal_max_leaf_nodes,
)
practice_swimmer_model.fit(train_X, train_y)

#Predict times for the validation swimmers
time_predictions = practice_swimmer_model.predict(val_X)

#Compare the predictions with the actual validation times
print(mean_absolute_error(val_y, time_predictions))

#Recorded result: mean absolute error with ideal leaf nodes = 1.01496 seconds.
#That is about the difference between 1st and 25th place in the sample, so this
#may not be the best model

#Interestingly, given just the rank of the swimmers, the model can get to just .0820 seconds off



In [None]:
#"Full power" model: same tuned leaf limit, but fit on every row rather than only the training split
swimmer_model = DecisionTreeRegressor(
    random_state=0,
    max_leaf_nodes=ideal_max_leaf_nodes,
)
swimmer_model.fit(X, y)

In [None]:
#Import data from 2012 Olympics to test against model.
#This must be the 2012 file, not the 2016 file that swimmer_model was fit on;
#loading the 2016 file here would score the model on its own training rows.
#NOTE(review): filename assumes the 2012 file follows the 2016 file's naming -- confirm it exists in the dataset
freestyle_heats_2012_path = '../input/mens100mfree/2012100mfree.csv'
freestyle_heats_2012 = pd.read_csv(freestyle_heats_2012_path)

#Same target and feature columns as the 2016 data so the inputs line up with the fitted model
y_2012 = freestyle_heats_2012.Time
X_2012 = freestyle_heats_2012[swimmer_features]

#Run model with 2012 input data
freestyle_2012_prediction = swimmer_model.predict(X_2012)

#Display
print(mean_absolute_error(y_2012, freestyle_2012_prediction))

#NOTE(review): the figures below were recorded while this cell was reading the
#2016 file, so they describe error on the training data; re-run to refresh them.

#.801831 seconds. lower than the results from the validation data perhaps due to the small sample size
#(model going from some 30 to 50 data points produces much more useful results)

#This is the difference between 1st and 17th place (of 56 swimmers). Relatively mediocre model but not useless

#Given rank, gets 0.0407 seconds off (even for 2012 olympics)