# Libraries, Preferences and Data Input

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
%config Completer.use_jedi = False

In [90]:
# read in the historical training data
players_training = pd.read_csv("players_training.csv", index_col='player')

# Choose appropriate week for model selection
players_test = pd.read_csv("players_week1.csv", index_col='player')
# players_test = pd.read_csv("players_week2.csv", index_col='player')

y_test = players_test['overall_score']
players_test.drop('overall_score', axis=1, inplace=True)

# Choose appropriate week for predictions
players_pred = pd.read_csv("players_week1.csv", index_col='player', usecols=[0,1,2,3,4,5])
# players_pred = pd.read_csv("players_week2.csv", index_col='player', usecols=[0,1,2,3,4,5])

In [5]:


results_week1 = pd.read_csv("results_week1.csv", index_col='country')
results_week2 = pd.read_csv("results_week2.csv", index_col='country')
results_week3 = pd.read_csv("results_week3.csv", index_col='country')
# results_week4 = pd.read_csv("results_week4.csv", index_col='country')
# results_week5 = pd.read_csv("results_week5.csv", index_col='country')

In [6]:
# Input current kickers for each team

kickers = [
    'O. Farrell', 'G. Ford', 'M. Malins', # england
    'M. Jalibert', 'L. Carbonel', # france
    'J. Sexton', 'B. Burns', 'R. Byrne', # ireland
    'P. Garbisi', 'T. Allan', 'C. Canna', # italy
    'F. Russell', 'J. van der Walt' # scotland
    'D. Biggar', 'L. Halfpenny', 'C. Sheedy', 'J. Evans' #wales
]

# Helper Code

In [7]:
def team_points_calculator(country, mins, week):
    """
    Calculates the number of team points assigned to each player, based on their country, the number of minutes they played, and the result.
    """
    
    if week == 1:
        results = results_week1
    elif week == 2:
        results = results_week2
    elif week == 3:
        results = results_week3
    elif week == 4:
        results = results_week4
    elif week == 5:
        results = results_week5
    
    
    # calculate the the home/away points
    outcome = 0
    if results.loc[country, 'home'] == 1:
        if results.loc[country, 'for'] > results.loc[country, 'against']:
            outcome += 12
        elif results.loc[country, 'for'] == results.loc[country, 'against']:
            outcome += 4
        else:
            outcome += 1
    else:
        if results.loc[country, 'for'] > results.loc[country, 'against']:
            outcome += 18
        elif results.loc[country, 'for'] == results.loc[country, 'against']:
            outcome += 10
        else:
            outcome += 3
    
    # calculate the score difference points
    score_diff = (results.loc[country, 'for'] - results.loc[country, 'against']) * 0.5
    
    # calculate the points assigned to each player
    team_points = round((outcome + score_diff) * (mins / 80), 2)

    return team_points

In [8]:
def ind_points_calculator(tackles, d_tackles, t_breaks, m_ball, mom, tries, conversions, penalties, d_goals, yellow, red):
    """
    Calculates the number of individual points assigned to each play, based on their fixure metrics.
    """
    
    ind_points = round(
        tackles * 1 +
        d_tackles * 2 +
        t_breaks * 2 +
        m_ball * 0.3 +
        mom * 15 +
        tries * 15 +
        conversions * 3 +
        penalties * 3 +
        d_goals * 6 +
        yellow * -5 +
        red * -10
    ,2)
    
    return ind_points

In [9]:
def v_italy_generator(week, country):
    """
    Determines whether each player is playing against Italy or not in a given week.
    """
    
    if week == 1 and country == 'France':
        return 1
    elif week == 2 and country == 'England':
        return 1
    elif week == 3 and country == 'Ireland':
        return 1
    elif week == 4 and country == 'Wales':
        return 1
    elif week == 5 and country == 'Scotland':
        return 1
    else:
        return 0

In [10]:

unused_features = ['mom', 'yellow', 'red', 'd_goals', 'week']
unused_scores = ['team_score', 'ind_score']

# create a list of all columns that we will be separately addressing as target features
outputs = ['mins', 'tackles', 'd_tackles', 't_breaks', 'm_ball', 'tries', 'conversions', 'penalties']

# create a list of all columns that we will be using as input features
features = ['cost', 'started', 'England', 'France', 'Ireland', 'Italy', 'Scotland', 'Wales', 'Back Row', 'Centre', 'Fly Half', 'Full Back', 'Hooker', 
            'Prop', 'Scrum Half', 'Second Row', 'Wing', 'v_italy', 'kicker']


feature_interactions = ['cost_started', 'cost_England', 'cost_France', 'cost_Ireland', 'cost_Italy', 'cost_Scotland', 'cost_Wales', 'cost_Back Row',
                        'cost_Centre', 'cost_Fly Half', 'cost_Full Back', 'cost_Hooker', 'cost_Prop', 'cost_Scrum Half', 'cost_Second Row', 'cost_Wing',
                        'cost_v_italy', 'cost_kicker']

# Training Data Prep

In [11]:
def training_data_processor(data):
    """
    Prepares the training data for modelling purposes.
    """
    
    # replace any NaN values with zero.
    data = data.fillna(0)
    
    # designate the prospective kickers
    data = data.reset_index()
    data['kicker'] = data['player'].apply(lambda x: 1 if x in kickers else 0)
    data = data.set_index('player')
    
    # designate the players playing against Italy
    data['v_italy'] = data.apply(lambda x: v_italy_generator(x['week'], x['country']), axis=1)
    
    # calculate the team scores based on mins and result
    data['team_score'] = data.apply(lambda x: team_points_calculator(x['country'], x['mins'], x['week']), axis=1)

    # calculate the individual score based on metrics
    data['ind_score'] = data.apply(lambda x: ind_points_calculator(
        x['tackles'], x['d_tackles'], x['t_breaks'], x['m_ball'], x['mom'],
        x['tries'], x['conversions'], x['penalties'], x['d_goals'], x['yellow'], x['red']
    ), axis=1)
    
    # calculate the overall score
    data['overall_score'] = data['team_score'] + data['ind_score']
    
    # convert the categorical columns into dummy variables
    data = pd.get_dummies(data, columns=['country', 'position'], prefix="", prefix_sep="")
    
    # generate feature interactions for all dummy variables with the cost variable
    data['cost_started'] = data['cost'] * data['started'] 
    data['cost_England'] = data['cost'] * data['England'] 
    data['cost_France'] = data['cost'] * data['France'] 
    data['cost_Ireland'] = data['cost'] * data['Ireland'] 
    data['cost_Italy'] = data['cost'] * data['Italy'] 
    data['cost_Scotland'] = data['cost'] * data['Scotland'] 
    data['cost_Wales'] = data['cost'] * data['Wales'] 
    data['cost_Back Row'] = data['cost'] * data['Back Row'] 
    data['cost_Centre'] = data['cost'] * data['Centre'] 
    data['cost_Fly Half'] = data['cost'] * data['Fly Half'] 
    data['cost_Full Back'] = data['cost'] * data['Full Back'] 
    data['cost_Hooker'] = data['cost'] * data['Hooker'] 
    data['cost_Prop'] = data['cost'] * data['Prop'] 
    data['cost_Scrum Half'] = data['cost'] * data['Scrum Half'] 
    data['cost_Second Row'] = data['cost'] * data['Second Row'] 
    data['cost_Wing'] = data['cost'] * data['Wing'] 
    data['cost_v_italy'] = data['cost'] * data['v_italy'] 
    data['cost_kicker'] = data['cost'] * data['kicker'] 
    
    # drop the features which won't be used for modelling
    data = data.drop(unused_features + unused_scores, axis=1)
    
    # move the target variable to the end for readibility
    data['overall_score'] = data.pop('overall_score')
    
    return data

In [12]:
# process the training data for modelling
players_training_processed = training_data_processor(players_training)

In [13]:
players_training_processed.to_csv("players_training_processed.csv")

In [14]:
def feature_coefficients(model, feature_interactions=[]):
    """
    
    """
    
    features_list = players_training_processed[features + feature_interactions].columns
    coefficients = model.coef_
    
    df = pd.DataFrame({'features': features_list, 'coefficient': coefficients})
    
    return df

In [15]:
def test_data_predictor_single(data, model, feature_interactions=False):
    """
    
    """

    data = data.reset_index()
    data['kicker'] = data['player'].apply(lambda x: 1 if x in kickers else 0)
    data = data.set_index('player')
    
    data['v_italy'] = data.apply(lambda x: v_italy_generator(x['week'], x['country']), axis=1)
    data = data.drop('week', axis=1)

    data = pd.get_dummies(data, columns=['country', 'position'], prefix="", prefix_sep="")

    if feature_interactions == True:
        data['cost_started'] = data['cost'] * data['started'] 
        data['cost_England'] = data['cost'] * data['England'] 
        data['cost_France'] = data['cost'] * data['France'] 
        data['cost_Ireland'] = data['cost'] * data['Ireland'] 
        data['cost_Italy'] = data['cost'] * data['Italy'] 
        data['cost_Scotland'] = data['cost'] * data['Scotland'] 
        data['cost_Wales'] = data['cost'] * data['Wales'] 
        data['cost_Back Row'] = data['cost'] * data['Back Row'] 
        data['cost_Centre'] = data['cost'] * data['Centre'] 
        data['cost_Fly Half'] = data['cost'] * data['Fly Half'] 
        data['cost_Full Back'] = data['cost'] * data['Full Back'] 
        data['cost_Hooker'] = data['cost'] * data['Hooker'] 
        data['cost_Prop'] = data['cost'] * data['Prop'] 
        data['cost_Scrum Half'] = data['cost'] * data['Scrum Half'] 
        data['cost_Second Row'] = data['cost'] * data['Second Row'] 
        data['cost_Wing'] = data['cost'] * data['Wing'] 
        data['cost_v_italy'] = data['cost'] * data['v_italy'] 
        data['cost_kicker'] = data['cost'] * data['kicker']

    
    data['overall_score'] = model.predict(data).round(2)
    
    return data

In [16]:
def test_data_predictor_multiple(players, model, feature_interactions=False):
    """
    
    """

    players = players.reset_index()
    players['kicker'] = players['player'].apply(lambda x: 1 if x in kickers else 0)
    players = players.set_index('player')
    
    players['v_italy'] = players.apply(lambda x: v_italy_generator(x['week'], x['country']), axis=1)

    players_interim = players.drop('week', axis=1).copy()
    players_interim = pd.get_dummies(players_interim, columns=['country', 'position'], prefix="", prefix_sep="")

    if feature_interactions == False:
        
        if model == 'lin_reg':
            players['mins'] = model_lin_reg_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_lin_reg_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_lin_reg_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_lin_reg_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_lin_reg_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_lin_reg_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_lin_reg_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_lin_reg_penalties.best_estimator_.predict(players_interim).round(2)

        elif model == 'ridge':
            players['mins'] = model_ridge_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_ridge_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_ridge_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_ridge_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_ridge_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_ridge_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_ridge_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_ridge_penalties.best_estimator_.predict(players_interim).round(2)

        elif model == 'lasso':
            players['mins'] = model_lasso_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_lasso_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_lasso_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_lasso_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_lasso_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_lasso_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_lasso_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_lasso_penalties.best_estimator_.predict(players_interim).round(2)
    
    if feature_interactions == True:
        players_interim['cost_started'] = players_interim['cost'] * players_interim['started'] 
        players_interim['cost_England'] = players_interim['cost'] * players_interim['England'] 
        players_interim['cost_France'] = players_interim['cost'] * players_interim['France'] 
        players_interim['cost_Ireland'] = players_interim['cost'] * players_interim['Ireland'] 
        players_interim['cost_Italy'] = players_interim['cost'] * players_interim['Italy'] 
        players_interim['cost_Scotland'] = players_interim['cost'] * players_interim['Scotland'] 
        players_interim['cost_Wales'] = players_interim['cost'] * players_interim['Wales'] 
        players_interim['cost_Back Row'] = players_interim['cost'] * players_interim['Back Row'] 
        players_interim['cost_Centre'] = players_interim['cost'] * players_interim['Centre'] 
        players_interim['cost_Fly Half'] = players_interim['cost'] * players_interim['Fly Half'] 
        players_interim['cost_Full Back'] = players_interim['cost'] * players_interim['Full Back'] 
        players_interim['cost_Hooker'] = players_interim['cost'] * players_interim['Hooker'] 
        players_interim['cost_Prop'] = players_interim['cost'] * players_interim['Prop'] 
        players_interim['cost_Scrum Half'] = players_interim['cost'] * players_interim['Scrum Half'] 
        players_interim['cost_Second Row'] = players_interim['cost'] * players_interim['Second Row'] 
        players_interim['cost_Wing'] = players_interim['cost'] * players_interim['Wing'] 
        players_interim['cost_v_italy'] = players_interim['cost'] * players_interim['v_italy'] 
        players_interim['cost_kicker'] = players_interim['cost'] * players_interim['kicker']
    
        if model == 'lin_reg':
            players['mins'] = model_lin_reg_interactions_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_lin_reg_interactions_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_lin_reg_interactions_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_lin_reg_interactions_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_lin_reg_interactions_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_lin_reg_interactions_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_lin_reg_interactions_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_lin_reg_interactions_penalties.best_estimator_.predict(players_interim).round(2)

        elif model == 'ridge':
            players['mins'] = model_ridge_interactions_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_ridge_interactions_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_ridge_interactions_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_ridge_interactions_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_ridge_interactions_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_ridge_interactions_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_ridge_interactions_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_ridge_interactions_penalties.best_estimator_.predict(players_interim).round(2)

        elif model == 'lasso':
            players['mins'] = model_lasso_interactions_mins.best_estimator_.predict(players_interim).round()
            players['tackles'] = model_lasso_interactions_tackles.best_estimator_.predict(players_interim).round(2)
            players['d_tackles'] = model_lasso_interactions_d_tackles.best_estimator_.predict(players_interim).round(2)
            players['t_breaks'] = model_lasso_interactions_t_breaks.best_estimator_.predict(players_interim).round(2)
            players['m_ball'] = model_lasso_interactions_m_ball.best_estimator_.predict(players_interim).round(2)
            players['tries'] = model_lasso_interactions_tries.best_estimator_.predict(players_interim).round(2)
            players['conversions'] = model_lasso_interactions_conversions.best_estimator_.predict(players_interim).round(2)
            players['penalties'] = model_lasso_interactions_penalties.best_estimator_.predict(players_interim).round(2)
    
    
    players['d_goals'] = 0
    players['mom'] = 0
    players['yellow'] = 0
    players['red'] = 0
    
    players['team_score'] = players.apply(lambda x: team_points_calculator(x['country'], x['mins'], x['week']), axis=1)
    players = players.drop('week', axis=1)
    
    players['ind_score'] = players.apply(lambda x: ind_points_calculator(
        x['tackles'], x['d_tackles'], x['t_breaks'], x['m_ball'], x['mom'],
        x['tries'], x['conversions'], x['penalties'], x['d_goals'], x['yellow'], x['red']
    ), axis=1)

    players['overall_score'] = (players['team_score'] + players['ind_score']).round(2)
    
    return players

# Prediction Modelling

## Linear Regression Models

In [17]:
def lin_reg_model_fitter(target, feature_interactions=[]):
    """

    """
    
    X = players_training_processed[features + feature_interactions]
    y = players_training_processed[target]
    
    model = GridSearchCV(
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('lin_reg', LinearRegression())
        ]),
        param_grid = {},
        cv = KFold(n_splits=10, shuffle=True, random_state=101),
        scoring = 'neg_root_mean_squared_error'
    ).fit(X,y)
    
    return model

### Single Models

#### No Interactions

In [18]:
# fit the model to the training data
model_lin_reg_no_interactions = lin_reg_model_fitter('overall_score')

print("RMSE: {}".format(-model_lin_reg_no_interactions.best_score_.round(2)))

RMSE: 14.66


In [19]:
y_pred_single_lin_reg_no_interactions = test_data_predictor_single(players_test, model_lin_reg_no_interactions.best_estimator_, False)['overall_score']
mean_squared_error(y_test, y_pred_single_lin_reg_no_interactions)**0.5

18.269370244494876

#### Interactions

In [20]:
# fit the model to the training data
model_lin_reg_interactions = lin_reg_model_fitter('overall_score', feature_interactions)

print("RMSE: {}".format(-model_lin_reg_interactions.best_score_.round(2)))

RMSE: 22.46


In [21]:
y_pred_single_linear_reg_interactions = test_data_predictor_single(players_test, model_lin_reg_interactions.best_estimator_, True)['overall_score']

mean_squared_error(y_test, y_pred_single_linear_reg_interactions)**0.5

634405033268517.6

### Multiple Models

#### No Interactions

In [22]:
# fit models for each of the individual target variables
model_lin_reg_mins = lin_reg_model_fitter('mins')
model_lin_reg_tackles = lin_reg_model_fitter('tackles')
model_lin_reg_d_tackles = lin_reg_model_fitter('d_tackles')
model_lin_reg_t_breaks = lin_reg_model_fitter('t_breaks')
model_lin_reg_m_ball = lin_reg_model_fitter('m_ball')
model_lin_reg_tries = lin_reg_model_fitter('tries')
model_lin_reg_conversions = lin_reg_model_fitter('conversions')
model_lin_reg_penalties = lin_reg_model_fitter('penalties')

In [23]:
y_pred_multiple_linear_reg_no_interactions = test_data_predictor_multiple(players_test, 'lin_reg')['overall_score']

mean_squared_error(y_test, y_pred_multiple_linear_reg_no_interactions)**0.5

16.886098926528938

#### Interactions

In [24]:
# fit models for each of the individual target variables
model_lin_reg_interactions_mins = lin_reg_model_fitter('mins', feature_interactions)
model_lin_reg_interactions_tackles = lin_reg_model_fitter('tackles', feature_interactions)
model_lin_reg_interactions_d_tackles = lin_reg_model_fitter('d_tackles', feature_interactions)
model_lin_reg_interactions_t_breaks = lin_reg_model_fitter('t_breaks', feature_interactions)
model_lin_reg_interactions_m_ball = lin_reg_model_fitter('m_ball', feature_interactions)
model_lin_reg_interactions_tries = lin_reg_model_fitter('tries', feature_interactions)
model_lin_reg_interactions_conversions = lin_reg_model_fitter('conversions', feature_interactions)
model_lin_reg_interactions_penalties = lin_reg_model_fitter('penalties', feature_interactions)

In [25]:
y_pred_multiple_linear_reg_interactions = test_data_predictor_multiple(players_test, 'lin_reg', True)['overall_score']

mean_squared_error(y_test, y_pred_multiple_linear_reg_interactions)**0.5

525486361452490.9

## Ridge Regression Models

In [26]:
def ridge_reg_model_fitter(target, feature_interactions=[]):
    """

    """
    
    X = players_training_processed[features + feature_interactions]
    y = players_training_processed[target]
    
    model = GridSearchCV(
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('ridge', Ridge())
        ]),
        param_grid = {'ridge__alpha': np.logspace(-2, 3, num=100)}, # 0.01 to 1000
        cv = KFold(n_splits=10, shuffle=True, random_state=101),
        scoring = 'neg_root_mean_squared_error'
    ).fit(X,y)
    
    return model

### Single Models

#### No Interactions

In [27]:
# fit the model to the training data
model_ridge_reg_no_interactions = ridge_reg_model_fitter('overall_score')

print("RMSE: {}".format(-model_ridge_reg_no_interactions.best_score_.round(2)))
print("alpha: {}".format(model_ridge_reg_no_interactions.best_params_['ridge__alpha'].round(2)))

RMSE: 14.21
alpha: 27.19


In [28]:
y_pred_single_ridge_reg_no_interactions = test_data_predictor_single(players_test, model_ridge_reg_no_interactions.best_estimator_, False)['overall_score']

mean_squared_error(y_test, y_pred_single_ridge_reg_no_interactions)**0.5

18.070444140896907

#### Interactions

In [29]:
# fit the model to the training data
model_ridge_reg_interactions = ridge_reg_model_fitter('overall_score', feature_interactions)

print("RMSE: {}".format(-model_ridge_reg_interactions.best_score_.round(2)))
print("alpha: {}".format(model_ridge_reg_interactions.best_params_['ridge__alpha'].round(2)))

RMSE: 13.41
alpha: 5.34


In [30]:
y_pred_single_ridge_reg_interactions = test_data_predictor_single(players_test, model_ridge_reg_interactions.best_estimator_, True)['overall_score']

mean_squared_error(y_test, y_pred_single_ridge_reg_interactions)**0.5

16.738777096778968

### Multiple Models

#### No Interactions

In [31]:
model_ridge_mins = ridge_reg_model_fitter('mins')

print("RMSE: {}".format(-model_ridge_mins.best_score_.round(2)))
print("alpha: {}".format(model_ridge_mins.best_params_['ridge__alpha'].round(2)))

RMSE: 14.76
alpha: 6.73


In [32]:
model_ridge_tackles = ridge_reg_model_fitter('tackles')

print("RMSE: {}".format(-model_ridge_tackles.best_score_.round(2)))
print("alpha: {}".format(model_ridge_tackles.best_params_['ridge__alpha'].round(2)))

RMSE: 3.61
alpha: 27.19


In [33]:
model_ridge_d_tackles = ridge_reg_model_fitter('d_tackles')

print("RMSE: {}".format(-model_ridge_d_tackles.best_score_.round(2)))
print("alpha: {}".format(model_ridge_d_tackles.best_params_['ridge__alpha'].round(2)))

RMSE: 0.45
alpha: 559.08


In [34]:
model_ridge_t_breaks = ridge_reg_model_fitter('t_breaks')

print("RMSE: {}".format(-model_ridge_t_breaks.best_score_.round(2)))
print("alpha: {}".format(model_ridge_t_breaks.best_params_['ridge__alpha'].round(2)))

RMSE: 1.27
alpha: 43.29


In [35]:
model_ridge_m_ball = ridge_reg_model_fitter('m_ball')

print("RMSE: {}".format(-model_ridge_m_ball.best_score_.round(2)))
print("alpha: {}".format(model_ridge_m_ball.best_params_['ridge__alpha'].round(2)))

RMSE: 24.69
alpha: 27.19


In [36]:
model_ridge_tries = ridge_reg_model_fitter('tries')

print("RMSE: {}".format(-model_ridge_tries.best_score_.round(2)))
print("alpha: {}".format(model_ridge_tries.best_params_['ridge__alpha'].round(2)))

RMSE: 0.25
alpha: 351.12


In [37]:
model_ridge_conversions = ridge_reg_model_fitter('conversions')

print("RMSE: {}".format(-model_ridge_conversions.best_score_.round(2)))
print("alpha: {}".format(model_ridge_conversions.best_params_['ridge__alpha'].round(2)))

RMSE: 0.27
alpha: 1000.0


In [38]:
model_ridge_penalties = ridge_reg_model_fitter('penalties')

print("RMSE: {}".format(-model_ridge_penalties.best_score_.round(2)))
print("alpha: {}".format(model_ridge_penalties.best_params_['ridge__alpha'].round(2)))

RMSE: 0.26
alpha: 21.54


In [39]:
y_pred_multiple_ridge_reg_no_interactions = test_data_predictor_multiple(players_test, 'ridge', False)['overall_score']

mean_squared_error(y_test, y_pred_multiple_ridge_reg_no_interactions)**0.5

15.753346320380794

#### Interactions

In [40]:
model_ridge_interactions_mins = ridge_reg_model_fitter('mins', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_mins.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_mins.best_params_['ridge__alpha'].round(2)))

RMSE: 14.71
alpha: 13.53


In [41]:
model_ridge_interactions_tackles = ridge_reg_model_fitter('tackles', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_tackles.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_tackles.best_params_['ridge__alpha'].round(2)))

RMSE: 3.51
alpha: 43.29


In [42]:
model_ridge_interactions_d_tackles = ridge_reg_model_fitter('d_tackles', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_d_tackles.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_d_tackles.best_params_['ridge__alpha'].round(2)))

RMSE: 0.45
alpha: 1000.0


In [43]:
model_ridge_interactions_t_breaks = ridge_reg_model_fitter('t_breaks', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_t_breaks.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_t_breaks.best_params_['ridge__alpha'].round(2)))

RMSE: 1.25
alpha: 43.29


In [44]:
model_ridge_interactions_m_ball = ridge_reg_model_fitter('m_ball', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_m_ball.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_m_ball.best_params_['ridge__alpha'].round(2)))

RMSE: 24.32
alpha: 43.29


In [45]:
model_ridge_interactions_tries = ridge_reg_model_fitter('tries', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_tries.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_tries.best_params_['ridge__alpha'].round(2)))

RMSE: 0.25
alpha: 559.08


In [46]:
model_ridge_interactions_conversions = ridge_reg_model_fitter('conversions', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_conversions.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_conversions.best_params_['ridge__alpha'].round(2)))

RMSE: 0.27
alpha: 1000.0


In [47]:
model_ridge_interactions_penalties = ridge_reg_model_fitter('penalties', feature_interactions)

print("RMSE: {}".format(-model_ridge_interactions_penalties.best_score_.round(2)))
print("alpha: {}".format(model_ridge_interactions_penalties.best_params_['ridge__alpha'].round(2)))

RMSE: 0.24
alpha: 17.07


In [48]:
y_pred_multiple_ridge_reg_interactions = test_data_predictor_multiple(players_test, 'ridge', True)['overall_score']

mean_squared_error(y_test, y_pred_multiple_ridge_reg_interactions)**0.5

14.057310105112727

## Lasso Models

In [49]:
def lasso_model_fitter(target, feature_interactions=[]):
    """

    """
    
    X = players_training_processed[features + feature_interactions]
    y = players_training_processed[target]
    
    model = GridSearchCV(
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('lasso', Lasso())
        ]),
        param_grid = {'lasso__alpha': np.linspace(start=0.01, stop=1, num=100)},
        cv = KFold(n_splits=10, shuffle=True, random_state=101),
        scoring = 'neg_root_mean_squared_error'
    ).fit(X,y)
    
    return model

### Single Models

#### No Interactions

In [50]:
model_lasso_no_interactions = lasso_model_fitter('overall_score')

print("RMSE: {}".format(-model_lasso_no_interactions.best_score_.round(2)))
print("alpha: {}".format(model_lasso_no_interactions.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)


RMSE: 14.36
alpha: 0.43


In [51]:
y_pred_single_lasso_no_interactions = test_data_predictor_single(players_test, model_lasso_no_interactions.best_estimator_, False)['overall_score']

mean_squared_error(y_test, y_pred_single_lasso_no_interactions)**0.5

16.620225957235515

#### Interactions

In [52]:
model_lasso_interactions = lasso_model_fitter('overall_score', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


RMSE: 13.13
alpha: 0.19


In [53]:
y_pred_single_lasso_interactions = test_data_predictor_single(players_test, model_lasso_interactions.best_estimator_, True)['overall_score']

mean_squared_error(y_test, y_pred_single_lasso_interactions)**0.5

17.14354148047461

### Multiple Models

#### No Interactions

In [54]:
model_lasso_mins = lasso_model_fitter('mins')

print("RMSE: {}".format(-model_lasso_mins.best_score_.round(2)))
print("alpha: {}".format(model_lasso_mins.best_params_['lasso__alpha'].round(2)))

RMSE: 14.38
alpha: 0.94


In [55]:
model_lasso_tackles = lasso_model_fitter('tackles')

print("RMSE: {}".format(-model_lasso_tackles.best_score_.round(2)))
print("alpha: {}".format(model_lasso_tackles.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)
  positive)
  positive)


RMSE: 3.66
alpha: 0.08


In [56]:
model_lasso_d_tackles = lasso_model_fitter('d_tackles')

print("RMSE: {}".format(-model_lasso_d_tackles.best_score_.round(2)))
print("alpha: {}".format(model_lasso_d_tackles.best_params_['lasso__alpha'].round(2)))

RMSE: 0.46
alpha: 0.12


In [57]:
model_lasso_t_breaks = lasso_model_fitter('t_breaks')

print("RMSE: {}".format(-model_lasso_t_breaks.best_score_.round(2)))
print("alpha: {}".format(model_lasso_t_breaks.best_params_['lasso__alpha'].round(2)))

RMSE: 1.28
alpha: 0.07


In [58]:
model_lasso_m_ball = lasso_model_fitter('m_ball')

print("RMSE: {}".format(-model_lasso_m_ball.best_score_.round(2)))
print("alpha: {}".format(model_lasso_m_ball.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)


RMSE: 24.8
alpha: 0.76


In [59]:
model_lasso_tries = lasso_model_fitter('tries')

print("RMSE: {}".format(-model_lasso_tries.best_score_.round(2)))
print("alpha: {}".format(model_lasso_tries.best_params_['lasso__alpha'].round(2)))

RMSE: 0.26
alpha: 0.03


In [60]:
model_lasso_conversions = lasso_model_fitter('conversions')

print("RMSE: {}".format(-model_lasso_conversions.best_score_.round(2)))
print("alpha: {}".format(model_lasso_conversions.best_params_['lasso__alpha'].round(2)))

RMSE: 0.27
alpha: 0.15


In [61]:
model_lasso_penalties = lasso_model_fitter('penalties')

print("RMSE: {}".format(-model_lasso_penalties.best_score_.round(2)))
print("alpha: {}".format(model_lasso_penalties.best_params_['lasso__alpha'].round(2)))

RMSE: 0.23
alpha: 0.07


In [62]:
y_pred_multiple_lasso_no_interactions = test_data_predictor_multiple(players_test, 'lasso', False)['overall_score']

mean_squared_error(y_test, y_pred_multiple_lasso_no_interactions)**0.5

15.536860402263397

#### Interactions

In [63]:
model_lasso_interactions_mins = lasso_model_fitter('mins', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_mins.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_mins.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


RMSE: 14.7
alpha: 0.81


In [100]:
df = feature_coefficients(model_lasso_interactions_mins.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
0,cost,3.668823
1,started,14.33879
4,Ireland,-0.379666
5,Italy,2.759819
11,Full Back,1.868457
15,Second Row,0.686331
16,Wing,1.786341
19,cost_started,4.204112
21,cost_France,-0.009783
25,cost_Wales,0.380242


In [64]:
model_lasso_interactions_tackles = lasso_model_fitter('tackles', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_tackles.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_tackles.best_params_['lasso__alpha'].round(2)))

  positive)


RMSE: 3.5
alpha: 0.07


In [99]:
df = feature_coefficients(model_lasso_interactions_tackles.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
0,cost,0.320935
1,started,1.556767
2,England,0.705308
4,Ireland,-0.007055
11,Full Back,-0.833606
12,Hooker,0.4998
13,Prop,0.664427
18,kicker,-0.000257
19,cost_started,0.399248
21,cost_France,0.381985


In [65]:
model_lasso_interactions_d_tackles = lasso_model_fitter('d_tackles', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_d_tackles.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_d_tackles.best_params_['lasso__alpha'].round(2)))

RMSE: 0.46
alpha: 0.14


In [101]:
df = feature_coefficients(model_lasso_interactions_d_tackles.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient


In [66]:
model_lasso_interactions_t_breaks = lasso_model_fitter('t_breaks', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_t_breaks.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_t_breaks.best_params_['lasso__alpha'].round(2)))

RMSE: 1.23
alpha: 0.07


In [102]:
df = feature_coefficients(model_lasso_interactions_t_breaks.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
0,cost,0.377162
10,Fly Half,0.034106
19,cost_started,0.144192
20,cost_England,-0.148552
23,cost_Italy,0.163761
24,cost_Scotland,0.079119
25,cost_Wales,-0.053575
27,cost_Centre,0.202168
29,cost_Full Back,0.473897
33,cost_Second Row,-0.111669


In [67]:
model_lasso_interactions_m_ball = lasso_model_fitter('m_ball', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_m_ball.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_m_ball.best_params_['lasso__alpha'].round(2)))

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)


RMSE: 24.11
alpha: 0.97


In [103]:
df = feature_coefficients(model_lasso_interactions_m_ball.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
0,cost,5.112264
2,England,-1.726856
7,Wales,-1.247085
9,Centre,1.088898
10,Fly Half,0.939964
13,Prop,-0.753554
15,Second Row,-2.033564
19,cost_started,9.201873
20,cost_England,-1.421541
22,cost_Ireland,6.763918


In [68]:
model_lasso_interactions_tries = lasso_model_fitter('tries', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_tries.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_tries.best_params_['lasso__alpha'].round(2)))

RMSE: 0.25
alpha: 0.03


In [104]:
df = feature_coefficients(model_lasso_interactions_tries.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
19,cost_started,0.02941112
21,cost_France,0.07963258
34,cost_Wing,0.08508535
35,cost_v_italy,5.669902e-08


In [69]:
model_lasso_interactions_conversions = lasso_model_fitter('conversions', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_conversions.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_conversions.best_params_['lasso__alpha'].round(2)))

RMSE: 0.28
alpha: 0.18


In [105]:
df = feature_coefficients(model_lasso_interactions_conversions.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
36,cost_kicker,0.068945


In [70]:
model_lasso_interactions_penalties = lasso_model_fitter('penalties', feature_interactions)

print("RMSE: {}".format(-model_lasso_interactions_penalties.best_score_.round(2)))
print("alpha: {}".format(model_lasso_interactions_penalties.best_params_['lasso__alpha'].round(2)))

RMSE: 0.21
alpha: 0.07


In [106]:
df = feature_coefficients(model_lasso_interactions_penalties.best_estimator_[1], feature_interactions)
df[df['coefficient'] != 0]

Unnamed: 0,features,coefficient
36,cost_kicker,0.251529


In [71]:
y_pred_multiple_lasso_interactions = test_data_predictor_multiple(players_test, 'lasso', True)['overall_score']

mean_squared_error(y_test, y_pred_multiple_lasso_interactions)**0.5

13.716631473707649