In [1]:
# clean the data
%run "Data Analysis Cleaning.ipynb"

# Model Creation

## Model 1

In [2]:
def single_attribute_model(attribute_name, performance_metric_name):
    '''
    Fit a one-input linear regression (statsmodels OLS) that predicts a performance
    metric from a single column of normalized_pitchers. Only the attribute column
    is passed as the regressor.

    Parameters:
    attribute_name - the column of normalized_pitchers used as the model input
    performance_metric_name - the column of normalized_pitchers to be predicted

    Returns:
    error - the mean of |prediction - actual| / actual over the held-out test rows
            (a fraction; it is not multiplied by 100)
    model.params - the coefficients of the fitted model
    '''
    # Keep only the rows where both the metric and the attribute are present.
    subset = normalized_pitchers[[performance_metric_name, attribute_name]].dropna()
    target = subset.iloc[:, 0]
    predictor = subset.iloc[:, 1]

    # 80/20 split with a fixed seed so repeated runs give the same partition.
    split = train_test_split(predictor, target, test_size=0.2, random_state=0)
    predictor_train, predictor_test, target_train, target_test = split

    fitted = sm.OLS(target_train, predictor_train).fit()
    predicted = fitted.predict(predictor_test)

    relative_errors = np.abs(predicted - target_test) / target_test
    error = np.sum(relative_errors) / target_test.size
    return (error, fitted.params)

In [3]:
# model1: one single-variable model per (pitch attribute, performance metric) pair
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
pitch_name = [
    'fastball_break_vertical', 'fastball_break_horizontal', 'percent_fastball_thrown',
    'fastball_speed', 'fastball_spin',
    'offspeed_break_vertical', 'offspeed_break_horizontal', 'percent_offspeed_thrown',
    'offspeed_speed', 'offspeed_spin',
    'breaking_break_vertical', 'breaking_break_horizontal', 'percent_breaking_thrown',
    'breaking_speed', 'breaking_spin',
]

# Parallel lists: position k in each list describes the same fitted model.
# The metric varies fastest, so every attribute contributes three consecutive entries.
pitches, metrics, errors, coefs = [], [], [], []
for pitch in pitch_name:
    for metric in performance_measures:
        pitches.append(pitch)
        metrics.append(metric)
        error, params = single_attribute_model(pitch, metric)
        errors.append(error)
        coefs.append(params)

model1 = dict(pitch=pitches, metric=metrics, error=errors, coefs=coefs)
model1_df = pd.DataFrame(model1)

## Model 2

In [4]:
def pitch_model2(pitch_name, performance_metric_name):
    '''
    Fit a multivariable linear regression (statsmodels OLS) that predicts a performance
    metric from the five attributes of one pitch class: vertical break, horizontal
    break, percent thrown, speed and spin.

    Parameters:
    pitch_name - the pitch class whose attribute columns are used as model inputs
    performance_metric_name - the column of normalized_pitchers to be predicted

    Returns:
    error - the mean of |prediction - actual| / actual over the held-out test rows
            (a fraction; it is not multiplied by 100)
    model.params - the coefficients of the variables kept in the final model
    '''
    pitch_columns = [
        pitch_name + '_break_vertical',
        pitch_name + '_break_horizontal',
        'percent_' + pitch_name + '_thrown',
        pitch_name + '_speed',
        pitch_name + '_spin',
    ]
    # Metric first, then the inputs; rows with any missing value are discarded.
    subset = normalized_pitchers[[performance_metric_name] + pitch_columns].dropna()
    target = subset.iloc[:, 0]
    inputs = subset.iloc[:, 1:]

    split = train_test_split(inputs, target, test_size=0.2, random_state=0)
    inputs_train, inputs_test, target_train, target_test = split

    # Variable selection on the training set: refit after dropping every variable
    # whose p-value is above .05, until all remaining p-values are at most .05.
    kept = inputs.columns
    while True:
        fitted = sm.OLS(target_train, inputs_train[kept]).fit()
        if not any(fitted.pvalues > .05):
            break
        kept = kept[fitted.pvalues <= .05]

    predicted = fitted.predict(inputs_test[kept])
    relative_errors = np.abs(predicted - target_test) / target_test
    error = np.sum(relative_errors) / target_test.size

    return (error, fitted.params)

In [5]:
# model2: one multivariable model per (pitch class, performance metric) pair,
# each using every attribute of that pitch class
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
pitch_classes = ['fastball', 'breaking', 'offspeed']

# Parallel lists: position k in each list describes the same fitted model.
# The metric varies fastest, so every pitch class contributes three consecutive entries.
pitches, metrics, errors, coefs = [], [], [], []
for pitch in pitch_classes:
    for metric in performance_measures:
        pitches.append(pitch)
        metrics.append(metric)
        error, params = pitch_model2(pitch, metric)
        errors.append(error)
        coefs.append(params)

model2 = dict(pitch=pitches, metric=metrics, error=errors, coefs=coefs)
model2_df = pd.DataFrame(model2)

## Model 3

In [6]:
# model3: one single-variable model per (overall pitcher attribute, performance metric) pair
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
attributes = ['1st_pitch_strike_percent', 'fastball_speed', 'delta']

# Parallel lists: position k in each list describes the same fitted model.
# The metric varies fastest, so every attribute contributes three consecutive entries.
metrics, errors, attribute, coefs = [], [], [], []
for a in attributes:
    for metric in performance_measures:
        attribute.append(a)
        metrics.append(metric)
        error, params = single_attribute_model(a, metric)
        errors.append(error)
        coefs.append(params)

model3 = dict(attributes=attribute, metric=metrics, error=errors, coefs=coefs)
model3_df = pd.DataFrame(model3)

In [7]:
def multiple_attribute_model(attribute_list, performance_metric_name):
    '''
    This function creates a multivariable linear regression model (statsmodels OLS)
    for predicting a given performance metric using the given attributes, keeping
    only the attributes that are statistically significant on the training set.

    Parameters:
    attribute_list - the list of attributes to be used as inputs to our model
                     (the list itself is not modified)
    performance_metric_name - the name of the metric to be predicted by the model

    Returns:
    error - the mean of |prediction - actual| / actual over the test set
            (a fraction; it is not multiplied by 100)
    model.params - the coefficients of the variables kept in the final model
    '''
    # Build the column selection on a copy so the caller's list is left untouched.
    feature_list = attribute_list.copy()
    feature_list.append(performance_metric_name)
    data = normalized_pitchers[feature_list].dropna()
    X = data[attribute_list]
    Y = data[performance_metric_name]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    # Model selection using p values on the training set: drop every variable whose
    # p-value is above .05 and refit, until all remaining p-values are at most .05.
    stat_sig_vars = X.columns
    model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()
    while any(model.pvalues > .05):
        stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
        model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()

    predictions = model.predict(X_test[stat_sig_vars])
    # The relative errors are summed over the test rows, so the average must be
    # taken over the number of test rows (not the number of training rows).
    relative_errors = np.abs(predictions - y_test) / y_test
    error = np.sum(relative_errors) / y_test.size

    return (error, model.params)

In [8]:
def multiple_attribute_model_with_model(attribute_list, performance_metric_name):
    '''
    This function creates a multivariable linear regression model (statsmodels OLS)
    for predicting a given performance metric using the given attributes, keeping
    only the attributes that are statistically significant on the training set.
    Unlike multiple_attribute_model, it returns the fitted model itself so that the
    model can be used for later predictions.

    Parameters:
    attribute_list - the list of attributes to be used as inputs to our model
                     (the list itself is not modified)
    performance_metric_name - the name of the metric to be predicted by the model

    Returns:
    error - the mean of |prediction - actual| / actual over the test set
            (a fraction; it is not multiplied by 100)
    model - the final fitted model (its coefficients are available as model.params)
    '''
    # Build the column selection on a copy so the caller's list is left untouched.
    feature_list = attribute_list.copy()
    feature_list.append(performance_metric_name)
    data = normalized_pitchers[feature_list].dropna()
    X = data[attribute_list]
    Y = data[performance_metric_name]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    # Model selection using p values on the training set: drop every variable whose
    # p-value is above .05 and refit, until all remaining p-values are at most .05.
    stat_sig_vars = X.columns
    model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()
    while any(model.pvalues > .05):
        stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
        model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()

    predictions = model.predict(X_test[stat_sig_vars])
    # The relative errors are summed over the test rows, so the average must be
    # taken over the number of test rows (not the number of training rows).
    relative_errors = np.abs(predictions - y_test) / y_test
    error = np.sum(relative_errors) / y_test.size

    return (error, model)

## Model 4

In [9]:
# model4: one multivariable model per performance metric, built from all of the
# overall pitcher attributes combined
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
attributes = ['1st_pitch_strike_percent', 'fastball_speed', 'delta']

# errors[k] and coefs[k] belong to performance_measures[k].
errors, coefs = [], []
for metric in performance_measures:
    error, params = multiple_attribute_model(attributes, metric)
    errors.append(error)
    coefs.append(params)

model4 = dict(Attributes=[""] * 3, metrics=performance_measures, error=errors, coefs=coefs)
model4_df = pd.DataFrame(model4)

## Model 5

In [10]:
# model5: one multivariable model per performance metric, built from all attributes
# of all pitch classes combined; the fitted models are kept for later predictions
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
attributes = [
    'fastball_break_vertical', 'fastball_break_horizontal', 'percent_fastball_thrown',
    'fastball_speed', 'fastball_spin',
    'offspeed_break_vertical', 'offspeed_break_horizontal', 'percent_offspeed_thrown',
    'offspeed_speed', 'offspeed_spin',
    'breaking_break_vertical', 'breaking_break_horizontal', 'percent_breaking_thrown',
    'breaking_speed', 'breaking_spin',
]

# errors[k], coefs[k] and model5_models[k] all belong to performance_measures[k].
errors, coefs, model5_models = [], [], []
for metric in performance_measures:
    error, model5_model = multiple_attribute_model_with_model(attributes, metric)
    errors.append(error)
    coefs.append(model5_model.params)
    model5_models.append(model5_model)

model5 = dict(Attributes=[""] * 3, metrics=performance_measures, error=errors, coefs=coefs)
model5_df = pd.DataFrame(model5)

## Model 6

In [11]:
# model6: one multivariable model per performance metric, built from every pitch
# attribute plus the overall pitcher attributes
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
attributes = [
    'fastball_break_vertical', 'fastball_break_horizontal', 'percent_fastball_thrown',
    'fastball_speed', 'fastball_spin',
    'offspeed_break_vertical', 'offspeed_break_horizontal', 'percent_offspeed_thrown',
    'offspeed_speed', 'offspeed_spin',
    'breaking_break_vertical', 'breaking_break_horizontal', 'percent_breaking_thrown',
    'breaking_speed', 'breaking_spin',
    '1st_pitch_strike_percent', 'delta',
]

# errors[k] and coefs[k] belong to performance_measures[k].
errors, coefs = [], []
for metric in performance_measures:
    error, params = multiple_attribute_model(attributes, metric)
    errors.append(error)
    coefs.append(params)

model6 = dict(Attributes=[""] * 3, metrics=performance_measures, error=errors, coefs=coefs)
model6_df = pd.DataFrame(model6)

## Model 7

In [12]:
# New attribute: how much slower a pitcher's breaking pitch is than their fastball.
pitchers['breaking_delta'] = pitchers['fastball_speed'].sub(pitchers['breaking_speed'])

# Min-max scale the new attribute and store it in the normalized dataframe.
normalized_pitchers['breaking_delta'] = (
    pitchers['breaking_delta']
    .sub(pitchers['breaking_delta'].min())
    .div(pitchers['breaking_delta'].max() - pitchers['breaking_delta'].min())
)

In [13]:
# here we define the fitting function for model7: the multivariable model built from
# all attributes combined, including the newly added breaking_delta attribute

def model7(attribute_list, performance_metric_name):
    '''
    This function creates a multivariable linear regression model (statsmodels OLS)
    for predicting a given performance metric using the given attributes. Variables
    are selected by p-value on the training set; afterwards 'breaking_delta' is put
    back into the model (if selection removed it) and selection is run once more, so
    the new attribute gets a second chance alongside the surviving variables.

    Parameters:
    attribute_list - the list of attributes to be used as inputs to our model; it
                     must contain 'breaking_delta' (the list itself is not modified)
    performance_metric_name - the name of the metric to be predicted by the model

    Returns:
    error - the mean of |prediction - actual| / actual over the test set
            (a fraction; it is not multiplied by 100)
    model.params - the coefficients of the variables kept in the final model
    '''
    # Build the column selection on a copy so the caller's list is left untouched.
    feature_list = attribute_list.copy()
    feature_list.append(performance_metric_name)
    data = normalized_pitchers[feature_list].dropna()
    X = data[attribute_list]
    Y = data[performance_metric_name]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    # First pass of model selection using p values on the training set: drop every
    # variable whose p-value is above .05 and refit, until none is above .05.
    stat_sig_vars = X.columns
    model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()
    while any(model.pvalues > .05):
        stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
        model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()

    # Re-introduce breaking_delta if the first pass eliminated it, then refit.
    if 'breaking_delta' not in stat_sig_vars:
        stat_sig_vars = stat_sig_vars.append(pd.Index(['breaking_delta']))
    model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()

    # Second pass of the same p-value selection on the augmented variable set.
    while any(model.pvalues > .05):
        stat_sig_vars = stat_sig_vars[model.pvalues <= .05]
        model = sm.OLS(y_train, X_train[stat_sig_vars]).fit()

    predictions = model.predict(X_test[stat_sig_vars])
    # The relative errors are summed over the test rows, so the average must be
    # taken over the number of test rows (not the number of training rows).
    relative_errors = np.abs(predictions - y_test) / y_test
    error = np.sum(relative_errors) / y_test.size

    return (error, model.params)

In [14]:
# Fit the model7-type model for each performance metric and collect the results
# in a dataframe.
performance_measures = ['z_contact_percent', 'ERA', 'WHIP']
attributes = [
    'fastball_break_vertical', 'fastball_break_horizontal', 'percent_fastball_thrown',
    'fastball_speed', 'fastball_spin',
    'offspeed_break_vertical', 'offspeed_break_horizontal', 'percent_offspeed_thrown',
    'offspeed_speed', 'offspeed_spin',
    'breaking_break_vertical', 'breaking_break_horizontal', 'percent_breaking_thrown',
    'breaking_speed', 'breaking_spin',
    '1st_pitch_strike_percent', 'delta', 'breaking_delta',
]

# errors[k] and coefs[k] belong to performance_measures[k].
errors, coefs = [], []
for metric in performance_measures:
    error, params = model7(attributes, metric)
    errors.append(error)
    coefs.append(params)

# NOTE: this assignment rebinds the name `model7` from the fitting function to the
# results dict, so this cell cannot be run a second time unless the cell defining
# the function is run again first.
model7 = dict(Attributes=[""] * 3, metrics=performance_measures, error=errors, coefs=coefs)
model7_df = pd.DataFrame(model7)

## Print Best

In [15]:
def print_best(model_number):
    '''
    For each performance metric (ERA, WHIP, z_contact_percent), print the percent
    error and the coefficients of the best model of the given type.

    The results are read from the global dict named "model<model_number>". For model
    types 1-3 that dict holds several candidate models per metric and the candidate
    with the smallest error is reported; for the other types it holds exactly one
    model per metric.

    Parameters:
    model_number - an integer between 1 and 7 inclusive representing the type of model
    '''
    templates = [
        "\n\nCoefficients for the best individual pitch attribute model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the best individual pitch model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the best individual overall attribute model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the combined overall attribute model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the combined pitch attribute model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the all attribute model of {} with percent error {:.2%}:\n",
        "\n\nCoefficients for the all attribute (including newly added attributes) model of {} with percent error {:.2%}:\n",
    ]
    header = templates[model_number - 1]

    results = globals()["model" + str(model_number)]
    all_errors = results['error']
    all_coefs = results['coefs']
    has_candidates = model_number in (1, 2, 3)

    # Results are stored in groups of three ordered z_contact_percent, ERA, WHIP,
    # while the report order is ERA, WHIP, z_contact_percent; `offset` is the
    # position of each reported metric inside a stored group.
    for offset, label in ((1, "ERA"), (2, "WHIP"), (0, "z_contact_percent")):
        if has_candidates:
            group_count = int(len(all_errors) / 3)
            candidate_errors = [all_errors[3 * g + offset] for g in range(group_count)]
            candidate_coefs = [all_coefs[3 * g + offset] for g in range(group_count)]
            best_error = min(candidate_errors)
            print(header.format(label, best_error))
            print(candidate_coefs[candidate_errors.index(best_error)])
        else:
            print(header.format(label, all_errors[offset]))
            print(all_coefs[offset])

## Make a List that Contains the Errors for the Best Performing Model for Each Performance Metric

In [16]:
# One entry per model type (1 through 7): the smallest error, as a percentage
# rounded to 3 decimals, achieved for each performance metric.
era_errors = []
whip_errors = []
z_contact_percent_errors = []

for model in [model1, model2, model3, model4, model5, model6, model7]:
    errors = model['error']
    for i in [0, 1, 2]:
        # Errors are stored in groups of three ordered z_contact_percent, ERA, WHIP;
        # (i + 1) % 3 maps i = 0, 1, 2 onto ERA, WHIP, z_contact_percent.
        errors_i = [errors[3 * j + ((i + 1) % 3)] for j in range(len(errors) // 3)]
        best_percent = (min(errors_i) * 100).round(3)
        [era_errors, whip_errors, z_contact_percent_errors][i].append(best_percent)

## Create Figure for Percent Error Display

In [17]:
def model_error_fig():
    '''
    Show a single figure made of a grouped bar chart (top) and a table (bottom),
    both displaying the ERA, WHIP and Z Contact percent errors of the best
    performing model in each of the seven model classes.
    '''
    # Table: a header row followed by one row per model class.
    table_data = [['Model Number', 'ERA % Error', 'WHIP % Error', 'Z Contact % Error']]
    table_data.extend(
        [str(idx + 1), era_errors[idx], whip_errors[idx], z_contact_percent_errors[idx]]
        for idx in range(7)
    )
    fig = ff.create_table(table_data, height_constant=30)

    # Bar chart: one trace per metric, all drawn on the secondary axes (x2/y2)
    # so that they do not share axes with the table.
    models = ['Model 1', 'Model 2', 'Model 3',
              'Model 4', 'Model 5', 'Model 6', 'Model 7']
    bar_specs = [
        (era_errors, '#0099ff', 'ERA Error'),
        (whip_errors, '#404040', 'WHIP Error'),
        (z_contact_percent_errors, '#808080', 'Z Contact Percent Error'),
    ]
    bars = [
        go.Bar(x=models, y=values, xaxis='x2', yaxis='y2',
               marker=dict(color=colour), name=label)
        for values, colour, label in bar_specs
    ]
    fig.add_traces(bars)

    # Create the secondary axes before configuring them.
    fig['layout']['xaxis2'] = {}
    fig['layout']['yaxis2'] = {}

    # Table occupies the lower 40% of the figure, the chart the upper 40%.
    fig.layout.yaxis.update({'domain': [0, .4]})
    # yaxis2 must be anchored to xaxis2 and vice versa.
    fig.layout.yaxis2.update({'domain': [.6, 1], 'anchor': 'x2', 'title': 'Percent Error'})
    fig.layout.xaxis2.update({'anchor': 'y2'})

    # Margins leave room for the title and the chart's x-labels; the height is set
    # explicitly because the table's computed height does not account for the chart.
    fig.layout.margin.update({'t': 75, 'l': 5})
    fig.layout.update({'title': 'Model Percent Error', 'height': 800})

    fig.show()
    

# Create Pitch Speeds Histogram

In [18]:
def pitch_speeds_fig():
    '''
    This function plots a distribution plot (histogram with density curve) of the
    pitch speeds for each of the different types of pitches.
    '''
    # One series per pitch class, with missing speeds removed.
    x1 = pitchers['fastball_speed'].dropna()
    x2 = pitchers['offspeed_speed'].dropna()
    x3 = pitchers['breaking_speed'].dropna()

    # Group data together
    hist_data = [x1, x2, x3]

    group_labels = ['Fastball Speed', 'Offspeed Speed', 'Breaking Speed']

    # Create distplot with custom bin_size
    fig = ff.create_distplot(hist_data, group_labels, bin_size=.2, show_rug=False)
    # The title is set once here; an earlier, separate title assignment was always
    # overwritten by this one and has been removed.
    fig.update_layout(
        title_text='Pitch Speeds', # title of plot
        xaxis_title_text='Pitch Speed (mph)', # xaxis label
        yaxis_title_text='Relative Frequency', # yaxis label
    )

    fig.show()

# Create Pitch Spins Histogram

In [19]:
def pitch_spins_fig():
    '''
    Show a distribution plot of pitch spin for each of the three pitch classes
    (fastball, offspeed, breaking).
    '''
    spin_columns = ['fastball_spin', 'offspeed_spin', 'breaking_spin']
    group_labels = ['Fastball Spin', 'Offspeed Spin', 'Breaking Spin']

    # One series per pitch class, with missing spins removed.
    hist_data = [pitchers[column].dropna() for column in spin_columns]

    # Distplot with a custom bin size and without the rug plot.
    fig = ff.create_distplot(hist_data, group_labels, bin_size=15, show_rug=False)
    layout_options = dict(
        title_text='Pitch Spins',
        xaxis_title_text='Pitch Spin (rpm)',
        yaxis_title_text='Relative Frequency',
    )
    fig.update_layout(**layout_options)

    fig.show()

# Create Performance Measures Histogram

In [20]:
def performance_measures_fig(metric):
    '''
    Show a histogram of pitchers[metric] with vertical lines at its mean and median,
    then print the mean, median and standard deviation.

    Parameters:
    - metric - str - the name of the metric to be plotted in the histogram
    '''
    fig = px.histogram(pitchers, x=metric)
    fig.update_layout(title= metric + ' Distribution')

    values = pitchers[metric]
    fig.add_vline(x=values.mean(), annotation_text="Mean",
                  annotation_position="top right", line_color = 'black')
    fig.add_vline(x=values.median(), annotation_text="Median",
                  annotation_position="bottom left", line_color="red")
    fig.show()

    # Summary statistics, printed after the figure.
    summary = [
        ("The mean {} of pitchers in our dataset is {:.2f}", values.mean()),
        ("The median {} of pitchers in our dataset is {:.2f}", values.median()),
        ("The standard deviation of {} of pitchers in our dataset is {:.2f}", values.std()),
    ]
    for template, statistic in summary:
        print(template.format(metric, statistic))

# Create 3-D Plot of Pitch Movement and Pitch Velocity

In [21]:
def threeD_plot():
    '''
    This function plots the horizontal and vertical breaks of each pitch for each pitcher
    against the pitches velocity in a 3D plot. Each type of pitch is represented by a 
    different color in our plot.
    '''
    %matplotlib notebook
    fig=p.figure()
    ax = p3.Axes3D(fig)
    for pitch in ['fastball', 'offspeed', 'breaking']:
        ax.scatter(pitchers[pitch + '_break_horizontal'], pitchers[pitch + '_speed'], pitchers[pitch + '_break_vertical'], alpha = 0.2)
        ax.legend(['fastball', 'offspeed', 'breaking'])
    ax.set_title('3-D Plot of Pitch Movement and Pitch Velocity')
    ax.set(xlabel = 'Horizontal Break (inches)', ylabel = 'Velocity (mph)', zlabel='Vertical Break (inches)')
    plt.show()

# Model 5 (on Cornell Baseball Data)

In [22]:
# Load the Cornell baseball pitching data from the CSV file next to this notebook.
cornell_data = pd.read_csv(filepath_or_buffer='cornell_pitching.csv')

In [23]:
# Min-max scale every column except the first one, using the minimum and maximum of
# the same column in `pitchers` (not the Cornell data's own range).
normalized_cornell = cornell_data.copy()
for col_name in cornell_data.columns[1:]:
    normalized_cornell[col_name] = (
        cornell_data[col_name]
        .sub(pitchers[col_name].min())
        .div(pitchers[col_name].max() - pitchers[col_name].min())
    )

In [32]:
attributes = ['fastball_break_vertical','fastball_break_horizontal', 
                     'percent_fastball_thrown','fastball_speed','fastball_spin','offspeed_break_vertical', 
                     'offspeed_break_horizontal','percent_offspeed_thrown','offspeed_speed', 'offspeed_spin', 
                     'breaking_break_vertical','breaking_break_horizontal','percent_breaking_thrown', 
                     'breaking_speed','breaking_spin']

def predict(pitchers_dataframe):
    '''
    This function predicts the ERA, WHIP, and Z Contact Percentage of pitchers in
    pitchers_dataframe using the fitted models stored in model5_models.

    The input columns given to each model are read from that model's own fitted
    coefficients (model.params.index) rather than from hard-coded lists, so the
    predictions stay consistent with whichever variables the model selection kept.

    Parameters: 
    pitchers_dataframe - the data on pitchers to be used in predicting their metrics;
                         it must contain a 'Name' column and every column used by
                         the fitted models

    Returns:
    Score_Model - a dataframe containing the Name of each pitcher, as well as their projected performance metrics
    '''
    def _project(fitted_model):
        # Feed the model exactly the columns it was fitted on, in fitting order.
        used_columns = list(fitted_model.params.index)
        return fitted_model.predict(pitchers_dataframe[used_columns])

    # model5_models is ordered z_contact_percent, ERA, WHIP.
    z_contactscore = _project(model5_models[0])
    ERAscore = _project(model5_models[1])
    WHIPscore = _project(model5_models[2])

    Score_Model = pd.DataFrame({
        'Name': pitchers_dataframe['Name'],
        'ERA Projected': ERAscore,
        'WHIP Projected': WHIPscore,
        'Z Contact Percent Projected': z_contactscore,
    })
    return Score_Model