# WHEN WILL A PLAYER BE PICKED?

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

#### Data

Two approaches:
1) use only players with non-zero values - 1673 records
2) use players with missing values (at most 3) that have been filled in - 2940 records

In [2]:
# Fraction of each dataset held out as the test split.
test_size = 0.2
# "drop" dataset (presumably approach 1 from the notes: incomplete players removed).
df_drop = pd.read_csv('../data/combine_drop.csv', index_col=0)
# NOTE(review): no random_state is passed, so unless NumPy's global RNG is
# seeded elsewhere the split (and every metric below) changes between runs.
df_drop_train, df_drop_test = train_test_split(df_drop, test_size=test_size)

# "fill" dataset (presumably approach 2 from the notes: missing values filled).
df_fill = pd.read_csv('../data/combine_fill.csv', index_col=0)
df_fill_train, df_fill_test = train_test_split(df_fill, test_size=test_size)

# Names of the int64/float64 columns of df_drop (includes the target columns).
numeric_cols = df_drop.select_dtypes(include=['int64', 'float64']).columns
# The last two columns of df_drop are treated as the prediction targets
# (presumably 'pickround' and 'picktotal' - confirm against the CSV layout).
target_cols = df_drop.columns[-2:]
# Bare expression: displayed as the cell output in the notebook.
df_drop.shape, df_fill.shape, numeric_cols

((1673, 11),
 (2940, 11),
 Index(['heightinchestotal', 'weight', 'fortyyd', 'twentyss', 'vertical',
        'broad', 'bench', 'pickround', 'picktotal'],
       dtype='object'))

In [None]:
def train(model, X, y):
    """Fit ``model`` on ``(X, y)`` and score it on that same training data.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        X: Feature matrix used for fitting and for the in-sample predictions.
        y: Target values aligned with ``X``.

    Returns:
        tuple: ``(reg, metrics)`` where ``reg`` is whatever ``model.fit``
        returned and ``metrics`` is ``{'rmse': ..., 'r2': ...}`` computed on
        the training data.
    """
    reg = model.fit(X, y)
    pred_train = reg.predict(X)
    # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6.  Taking the root of the per-output MSE and averaging
    # gives the same RMSE (also for multi-output targets) on every version.
    per_output_mse = mean_squared_error(y, pred_train, multioutput='raw_values')
    rmse_train = (per_output_mse ** 0.5).mean()
    r2_train = r2_score(y, pred_train)
    return reg, {'rmse': rmse_train, 'r2': r2_train}

def predict(reg, X, y):
    """Score an already fitted estimator on ``(X, y)``.

    Args:
        reg: Fitted estimator exposing ``predict(X)``.
        X: Feature matrix to predict on.
        y: True target values aligned with ``X``.

    Returns:
        dict: ``{'rmse': ..., 'r2': ...}`` for the predictions on ``X``.
    """
    pred_test = reg.predict(X)
    # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6.  Taking the root of the per-output MSE and averaging
    # gives the same RMSE (also for multi-output targets) on every version.
    per_output_mse = mean_squared_error(y, pred_test, multioutput='raw_values')
    rmse_test = (per_output_mse ** 0.5).mean()
    r2_test = r2_score(y, pred_test)
    return {'rmse': rmse_test, 'r2': r2_test}

def standarize(X_train, X_test):
    """Standardize both splits with statistics learned from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

#### DUMMY - Baseline

In [3]:
from sklearn.dummy import DummyRegressor

In [None]:
def dummy(X_train, X_test, y_train, y_test):
    """Fit a mean-predicting baseline and evaluate it on the test split.

    Returns:
        tuple: ``(reg, results)`` - the fitted ``DummyRegressor`` and the
        metrics dict produced by ``predict`` on ``(X_test, y_test)``.
    """
    baseline = DummyRegressor(strategy='mean')
    # Only the test metrics are reported; the training metrics are discarded.
    fitted, _ = train(baseline, X_train, y_train)
    return fitted, predict(fitted, X_test, y_test)

#### LINEAR REGRESSION - Baseline

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
def lr(df_train, df_test, target_cols, numeric_cols, standarize=False):
    """Fit one linear regression per draft target and report train/test metrics.

    Features are the ``numeric_cols`` of each frame minus ``target_cols``.
    Separate models are fitted for the ``'pickround'`` and ``'picktotal'``
    columns.

    Args:
        df_train: Training DataFrame containing the feature and target columns.
        df_test: Test DataFrame with the same columns as ``df_train``.
        target_cols: Column labels to exclude from the feature matrix.
        numeric_cols: Column labels of the numeric columns to select.
        standarize: When true, scale the features with a ``StandardScaler``
            fitted on the training features before fitting the models.

    Returns:
        dict: ``{'round': {'train': ..., 'test': ...},
        'total': {'train': ..., 'test': ...}}`` where each leaf is the metrics
        dict returned by ``train`` / ``predict``.
    """
    X_train = df_train[numeric_cols].drop(columns=target_cols)
    X_test = df_test[numeric_cols].drop(columns=target_cols)

    if standarize:
        # The boolean parameter shadows the module-level ``standarize`` helper
        # inside this function, so calling it here would try to call a bool.
        # Scale inline instead: fit on train only, apply to both splits.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    results = {}
    for label, column in (('round', 'pickround'), ('total', 'picktotal')):
        # ``train`` needs both the feature matrix and the target column.
        reg, train_metrics = train(LinearRegression(), X_train, df_train[column])
        results[label] = {
            'train': train_metrics,
            'test': predict(reg, X_test, df_test[column]),
        }
    return results

###### DROP

In [6]:
# Linear-regression baseline on the "drop" split, features left unscaled (standarize defaults to False).
lr(df_drop_train, df_drop_test, target_cols, numeric_cols)

{'round': {'train': {'rmse': 10.60477131231643, 'r2': 0.020379782611535502},
  'test': {'rmse': 10.871380224169265, 'r2': -0.008527605619557566}},
 'total': {'train': {'rmse': 66.24999714214555, 'r2': 0.10497447401831395},
  'test': {'rmse': 64.55150127134748, 'r2': 0.08365668075586641}}}

###### FILL

In [7]:
# Linear-regression baseline on the "fill" split, features left unscaled (standarize defaults to False).
lr(df_fill_train, df_fill_test, target_cols, numeric_cols)

{'round': {'train': {'rmse': 10.694386436866195, 'r2': 0.009581413277553463},
  'test': {'rmse': 10.482137358030181, 'r2': 0.012310780651190623}},
 'total': {'train': {'rmse': 68.21063029475282, 'r2': 0.07970740566384704},
  'test': {'rmse': 64.71020570619683, 'r2': 0.09730790534314115}}}