# WHEN WILL PLAYER BE PICKED?

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#### Data

2 approaches:
1) use only players with non zero values - 1673 records
2) use players with missing values (at most 3) that have been filled  - 2940 records

In [27]:
test_size = 0.2
df_drop = pd.read_csv('../data/combine_drop.csv', index_col=0)
df_drop_train, df_drop_test = train_test_split(df_drop, test_size=test_size)

df_fill = pd.read_csv('../data/combine_fill.csv', index_col=0)
df_fill_train, df_fill_test = train_test_split(df_fill, test_size=test_size)

numeric_cols = df_drop.select_dtypes(include=['int64', 'float64']).columns
target_cols = df_drop.columns[-2:]
df_drop.shape, df_fill.shape, numeric_cols

((1673, 11),
 (2940, 11),
 Index(['heightinchestotal', 'weight', 'fortyyd', 'twentyss', 'vertical',
        'broad', 'bench', 'pickround', 'picktotal'],
       dtype='object'))

#### LINEAR REGRESSION - Baseline

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [41]:
def lr_for_y(X, y):
    reg = LinearRegression().fit(X, y)
    pred_train = reg.predict(X)
    mse_train = mean_squared_error(y, pred_train, squared=False)
    r2_train = r2_score(y, pred_train)
    return reg, {'rmse': mse_train, 'r2': r2_train}

def predict_lr(reg, X, y):
    pred_test = reg.predict(X)
    mse_test = mean_squared_error(y, pred_test, squared=False)
    r2_test = r2_score(y, pred_test)
    return {'rmse': mse_test, 'r2': r2_test}

def lr(df_train, df_test, target_cols, numeric_cols):
    df_train_lr = df_train[numeric_cols].drop(columns=target_cols)
    df_test_lr = df_test[numeric_cols].drop(columns=target_cols)
    scaler = StandardScaler()
    df_train_lr = scaler.fit_transform(df_train_lr)
    df_test_lr = scaler.transform(df_test_lr)
    
    lr_round, results_round = lr_for_y(df_train_lr, df_train['pickround'])
    lr_total, results_total = lr_for_y(df_train_lr, df_train['picktotal'])
    
    test_round = predict_lr(lr_round, df_test_lr, df_test['pickround'])
    test_total = predict_lr(lr_total, df_test_lr, df_test['picktotal'])
    
    return {'round':{
                'train': results_round,
                'test': test_round
            },
            'total':{
                'train': results_total,
                'test': test_total
            }}

###### DROP

In [42]:
lr(df_drop_train, df_drop_test, target_cols, numeric_cols)

{'round': {'train': {'rmse': 10.611729535688601, 'r2': 0.016817569811364397},
  'test': {'rmse': 10.821731492121053, 'r2': 0.009878016612537621}},
 'total': {'train': {'rmse': 65.70362579727454, 'r2': 0.1094093536430496},
  'test': {'rmse': 66.78652601992567, 'r2': 0.06749463590777449}}}

###### FILL

In [43]:
lr(df_fill_train, df_fill_test, target_cols, numeric_cols)

{'round': {'train': {'rmse': 10.670499332008188, 'r2': 0.012429571013983565},
  'test': {'rmse': 10.577499252544564, 'r2': 0.0008370425882940902}},
 'total': {'train': {'rmse': 67.5087628713327, 'r2': 0.09061811597775371},
  'test': {'rmse': 67.65738711274639, 'r2': 0.05113536131720364}}}