___
this notebook select features that are important before throwing into the model

In [85]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_selection import *
from sklearn import *
from scipy import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, SelectFromModel, SelectPercentile,RFECV
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from aggregate_function import build_features_table, combine_features_table, coach_stats, win_rate_type_of_location

In [86]:
coach_file = 'data/DataFiles/TeamCoaches.csv'
regularseason_file = 'data/DataFiles/RegularSeasonDetailedResults.csv'
postseason_file = 'data/DataFiles/NCAATourneyCompactResults.csv'

In [87]:
initial_features = build_features_table.BuildFeaturesTable(regularseason_file)
win_rate_features = win_rate_type_of_location.WinRateTypeLocation(regularseason_file)
coach_features = coach_stats.CoachStats(coach_file,regularseason_file,postseason_file)

features = combine_features_table.CombineFeaturesTable(initial_features,win_rate_features,coach_features)

## Final data transformation for feature table
- post season is only what we care about
- post season match ups will be what we are joining all the features table to
- additional variable of seeding differential

In [88]:
features_table = (
    features.final_table_cum_processed
    .drop(['total_score','total_opponent_score','total_rebounds','total_blocks',
           'total_assist_turnover_ratio','expectation_per_game', 'win_rate','fg3p','win_rate_overall',
           'total_off_rebounds_percent','total_def_rebounds_percent','total_rebound_possession_percent',
           'total_rebound_possessiongain_percent'
          ],1)
    .fillna(0)
)

In [89]:
seeding_data = pd.read_csv("input/tour-results-seed.csv")

In [90]:
winning_team_perspective_df = (
    seeding_data
    .pipe(lambda x:x.assign(diff_seed = x.L_seed - x.W_seed))
    .pipe(lambda x:x.assign(outcome = 1))
    .merge(features_table,how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .merge(features_table,how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .pipe(lambda x:x.assign(diff_total_off_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_off_rebounds_x - x.total_off_rebounds_y,
                                                                x.total_off_rebounds_y - x.total_off_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_def_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_def_rebounds_x - x.total_def_rebounds_y,
                                                                x.total_def_rebounds_y - x.total_def_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_assists = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assists_x - x.total_assists_y,
                                                                x.total_assists_y - x.total_assists_x)))
    .pipe(lambda x:x.assign(diff_total_steals = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_steals_x - x.total_steals_y,
                                                                x.total_steals_y - x.total_steals_x)))
    .pipe(lambda x:x.assign(diff_total_turnover = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_turnover_x - x.total_turnover_y,
                                                                x.total_turnover_y - x.total_turnover_x)))
    .pipe(lambda x:x.assign(diff_total_personalfoul = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_personalfoul_x - x.total_personalfoul_y,
                                                                x.total_personalfoul_y - x.total_personalfoul_x)))
    .pipe(lambda x:x.assign(diff_total_assist_per_fgm = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assist_per_fgm_x - x.total_assist_per_fgm_y,
                                                                x.total_assist_per_fgm_y - x.total_assist_per_fgm_x)))
    .pipe(lambda x:x.assign(diff_avg_lose_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_lose_score_by_x - x.avg_lose_score_by_y,
                                                                x.avg_lose_score_by_y - x.avg_lose_score_by_x)))
    .pipe(lambda x:x.assign(diff_avg_win_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_win_score_by_x - x.avg_win_score_by_y,
                                                                x.avg_win_score_by_y - x.avg_win_score_by_x)))
    .pipe(lambda x:x.assign(diff_num_season = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.num_season_x - x.num_season_y,
                                                                x.num_season_y - x.num_season_x)))
    .pipe(lambda x:x.assign(diff_is_playoff = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_playoff_x - x.is_playoff_y,
                                                                x.is_playoff_y - x.is_playoff_x)))
    .pipe(lambda x:x.assign(diff_is_champion = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_champion_x - x.is_champion_y,
                                                                x.is_champion_y - x.is_champion_x)))
    .pipe(lambda x:x.assign(diff_fgp = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.fgp_x - x.fgp_y,
                                                                x.fgp_y - x.fgp_x)))
    .pipe(lambda x:x.assign(diff_total_block_opp_FGA_percent = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_block_opp_FGA_percent_x - x.total_block_opp_FGA_percent_y,
                                                                x.total_block_opp_FGA_percent_y - x.total_block_opp_FGA_percent_x)))
    .pipe(lambda x:x.assign(diff_win_rate_away = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_away_x - x.win_rate_away_y,
                                                                x.win_rate_away_y - x.win_rate_away_x)))
    .pipe(lambda x:x.assign(diff_win_rate_home = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_home_x - x.win_rate_home_y,
                                                                x.win_rate_home_y - x.win_rate_home_x)))
    .pipe(lambda x:x.assign(diff_win_rate_neutral = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_neutral_x - x.win_rate_neutral_y,
                                                                x.win_rate_neutral_y - x.win_rate_neutral_x)))
    .pipe(lambda x:x.assign(diff_win_rate_post = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_post_x - x.win_rate_post_y,
                                                                x.win_rate_post_y - x.win_rate_post_x)))
    .pipe(lambda x:x.assign(diff_win_rate_regular = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_regular_x - x.win_rate_regular_y,
                                                                x.win_rate_regular_y - x.win_rate_regular_x)))
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed))

    
)

In [91]:
losing_team_perspective_df = (
    seeding_data
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed))
    .pipe(lambda x:x.assign(outcome = 0))
    .merge(features_table,how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .merge(features_table,how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .pipe(lambda x:x.assign(diff_total_off_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_off_rebounds_x - x.total_off_rebounds_y,
                                                                x.total_off_rebounds_y - x.total_off_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_def_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_def_rebounds_x - x.total_def_rebounds_y,
                                                                x.total_def_rebounds_y - x.total_def_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_assists = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assists_x - x.total_assists_y,
                                                                x.total_assists_y - x.total_assists_x)))
    .pipe(lambda x:x.assign(diff_total_steals = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_steals_x - x.total_steals_y,
                                                                x.total_steals_y - x.total_steals_x)))
    .pipe(lambda x:x.assign(diff_total_turnover = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_turnover_x - x.total_turnover_y,
                                                                x.total_turnover_y - x.total_turnover_x)))
    .pipe(lambda x:x.assign(diff_total_personalfoul = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_personalfoul_x - x.total_personalfoul_y,
                                                                x.total_personalfoul_y - x.total_personalfoul_x)))
    .pipe(lambda x:x.assign(diff_total_assist_per_fgm = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assist_per_fgm_x - x.total_assist_per_fgm_y,
                                                                x.total_assist_per_fgm_y - x.total_assist_per_fgm_x)))
    .pipe(lambda x:x.assign(diff_avg_lose_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_lose_score_by_x - x.avg_lose_score_by_y,
                                                                x.avg_lose_score_by_y - x.avg_lose_score_by_x)))
    .pipe(lambda x:x.assign(diff_avg_win_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_win_score_by_x - x.avg_win_score_by_y,
                                                                x.avg_win_score_by_y - x.avg_win_score_by_x)))
    .pipe(lambda x:x.assign(diff_num_season = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.num_season_x - x.num_season_y,
                                                                x.num_season_y - x.num_season_x)))
    .pipe(lambda x:x.assign(diff_is_playoff = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_playoff_x - x.is_playoff_y,
                                                                x.is_playoff_y - x.is_playoff_x)))
    .pipe(lambda x:x.assign(diff_is_champion = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_champion_x - x.is_champion_y,
                                                                x.is_champion_y - x.is_champion_x)))
    .pipe(lambda x:x.assign(diff_fgp = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.fgp_x - x.fgp_y,
                                                                x.fgp_y - x.fgp_x)))
    .pipe(lambda x:x.assign(diff_total_block_opp_FGA_percent = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_block_opp_FGA_percent_x - x.total_block_opp_FGA_percent_y,
                                                                x.total_block_opp_FGA_percent_y - x.total_block_opp_FGA_percent_x)))
    .pipe(lambda x:x.assign(diff_win_rate_away = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_away_x - x.win_rate_away_y,
                                                                x.win_rate_away_y - x.win_rate_away_x)))
    .pipe(lambda x:x.assign(diff_win_rate_home = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_home_x - x.win_rate_home_y,
                                                                x.win_rate_home_y - x.win_rate_home_x)))
    .pipe(lambda x:x.assign(diff_win_rate_neutral = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_neutral_x - x.win_rate_neutral_y,
                                                                x.win_rate_neutral_y - x.win_rate_neutral_x)))
    .pipe(lambda x:x.assign(diff_win_rate_post = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_post_x - x.win_rate_post_y,
                                                                x.win_rate_post_y - x.win_rate_post_x)))
    .pipe(lambda x:x.assign(diff_win_rate_regular = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_regular_x - x.win_rate_regular_y,
                                                                x.win_rate_regular_y - x.win_rate_regular_x)))
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed))
    
)

In [92]:
prediction_df = (
    winning_team_perspective_df.append(losing_team_perspective_df)
)

train_df = prediction_df.query("Season >= 2003 & Season <= 2016")
test_df = prediction_df.query("Season == 2017")

train_df.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds_x,total_def_rebounds_x,total_assists_x,...,diff_num_season,diff_is_playoff,diff_is_champion,diff_fgp,diff_total_block_opp_FGA_percent,diff_win_rate_away,diff_win_rate_home,diff_win_rate_neutral,diff_win_rate_post,diff_win_rate_regular
1136,2003,1421,16,1411,16,0,1,0.037785,0.03577,0.031439,...,0.152174,0.0,0.0,-0.018262,0.012232,-0.071429,-0.162281,0.25,0.5,-0.151724
1137,2003,1112,1,1436,16,-15,1,0.047813,0.043772,0.045466,...,0.003557,-0.59375,-0.2,-0.016969,-0.011306,-0.041667,-0.370833,0.6,-0.641509,-0.237685
1138,2003,1113,10,1272,7,3,1,0.043744,0.036084,0.040266,...,0.0,-0.09375,0.0,0.040251,-0.011396,-0.206349,-0.111111,-0.5,-0.397059,-0.172414
1139,2003,1141,11,1166,6,5,1,0.030664,0.036006,0.040508,...,-0.243478,-0.15625,0.0,0.005763,-0.011456,-0.1,-0.122024,0.0,0.25,-0.085684
1140,2003,1143,8,1301,9,-1,1,0.033425,0.038516,0.041838,...,-0.227668,-0.125,0.0,-0.009399,0.010209,-0.375,-0.114706,0.25,-0.1,-0.124138


In [93]:
train_data_x = train_df[['diff_seed','diff_total_off_rebounds','diff_total_def_rebounds','diff_total_assists',
                         'diff_total_steals','diff_total_turnover','diff_total_personalfoul',
                         'diff_total_assist_per_fgm','diff_avg_lose_score_by','diff_avg_win_score_by',
                         'diff_num_season','diff_is_playoff','diff_is_champion','diff_fgp',
                         'diff_total_block_opp_FGA_percent','diff_win_rate_away','diff_win_rate_home',
                         'diff_win_rate_neutral','diff_win_rate_post','diff_win_rate_regular']]
train_data_y = train_df['outcome']

test_data_x = test_df[['diff_seed','diff_total_off_rebounds','diff_total_def_rebounds','diff_total_assists',
                         'diff_total_steals','diff_total_turnover','diff_total_personalfoul',
                         'diff_total_assist_per_fgm','diff_avg_lose_score_by','diff_avg_win_score_by',
                         'diff_num_season','diff_is_playoff','diff_is_champion','diff_fgp',
                         'diff_total_block_opp_FGA_percent','diff_win_rate_away','diff_win_rate_home',
                         'diff_win_rate_neutral','diff_win_rate_post','diff_win_rate_regular']]
test_data_y = test_df['outcome']

## Feature selection for logistics regression
- Univariate selection
- SelectFromModel from RF, lassoCV

In [94]:
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select = SelectPercentile(percentile=i)

    select.fit(train_data_x, train_data_y)

    train_data_x_selected = select.transform(train_data_x)
    test_data_x_selected = select.transform(test_data_x)

    mask = select.get_support()    
#     print(mask)
    logreg = LogisticRegression()
    logreg.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal logreg: {}".format(logreg.score(test_data_x,test_data_y)))

    logreg.fit(train_data_x_selected,train_data_y)
    print("feature selection logreg: {}".format(logreg.score(test_data_x_selected,test_data_y)))
    print(Counter(logreg.predict(test_data_x_selected)[:67]))

    
# 2o percentile is the best FE for logistics regression


Which percentile : 10
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 15
normal logreg: 0.8059701492537313
feature selection logreg: 0.8656716417910447
Counter({1: 58, 0: 9})

Which percentile : 20
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 25
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 30
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 35
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 40
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 45
normal logreg: 0.8059701492537313
feature selection logreg: 0.8208955223880597
Counter({1: 55, 0

In [95]:
# based on the output of the univariate, we can narrow to 15
select_15 = SelectPercentile(percentile=15)

In [96]:
select_15.fit(train_data_x, train_data_y)

train_data_x_selected_15 = select_15.transform(train_data_x)
test_data_x_selected_15 = select_15.transform(test_data_x)

mask = select_15.get_support()    
#     print(mask)
logreg_15 = LogisticRegression()
logreg_15.fit(train_data_x_selected_15,train_data_y)

logreg_15.score(test_data_x_selected_15,test_data_y)
logreg_15.predict(test_data_x_selected_15)[:67]

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [97]:
rf = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [5,10,50,100,150,500,1000],
    'max_depth': [1,2,5,10,15,50,100]
}
grid_rf = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, verbose=0)
grid_rf.fit(train_data_x, train_data_y)

rf_model = grid_rf.best_estimator_
rf_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [135]:
## selectfrommodel RF
select_rf = SelectFromModel(RandomForestClassifier(n_estimators =150, max_depth = 15, random_state=0),threshold=0.05)

In [136]:
select_rf.fit(train_data_x,train_data_y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.05)

In [137]:
train_data_x_selected = select_rf.transform(train_data_x)
test_data_x_selected = select_rf.transform(test_data_x)

In [138]:
LogisticRegression().fit(train_data_x_selected,train_data_y).score(test_data_x_selected,test_data_y)

0.83582089552238803

In [139]:
LogisticRegression().fit(train_data_x_selected,train_data_y).predict(test_data_x_selected)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [140]:
## selectfrommodel lassoCV --> same as univariate

In [141]:
select_lcv = SelectFromModel(LassoCV(max_iter=100,n_alphas=10,eps=1e-05),threshold=0.01)

select_lcv.fit(train_data_x,train_data_y)

SelectFromModel(estimator=LassoCV(alphas=None, copy_X=True, cv=None, eps=1e-05, fit_intercept=True,
    max_iter=100, n_alphas=10, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False),
        norm_order=1, prefit=False, threshold=0.01)

In [109]:
train_data_x_selected_lcv = select_lcv.transform(train_data_x)
test_data_x_selected_lcv = select_lcv.transform(test_data_x)

In [110]:
LogisticRegression().fit(train_data_x_selected_lcv,train_data_y).score(test_data_x_selected_lcv,test_data_y)

0.80597014925373134

In [111]:
LogisticRegression().fit(train_data_x_selected_lcv,train_data_y).predict(test_data_x_selected_lcv)[:67]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

## Feature selection for RF
- univariate
- SelectFromModel SVM
- RFE(CV)

In [112]:
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select_rf = SelectPercentile(percentile=i)
    select_rf.fit(train_data_x, train_data_y)
    
    train_data_x_selected = select_rf.transform(train_data_x)
    test_data_x_selected = select_rf.transform(test_data_x)
    
    mask = select_rf.get_support()
#     print(mask)
    rf = RandomForestClassifier(n_estimators = 150, max_depth=15,random_state=0)
    rf.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal rf: {}".format(rf.score(test_data_x,test_data_y)))
    
    rf.fit(train_data_x_selected,train_data_y)
    print("feature selection rf: {}".format(rf.score(test_data_x_selected,test_data_y)))
    print(Counter(rf.predict(test_data_x_selected)[:67]))



Which percentile : 10
normal rf: 0.8582089552238806
feature selection rf: 0.7985074626865671
Counter({1: 53, 0: 14})

Which percentile : 15
normal rf: 0.8582089552238806
feature selection rf: 0.7985074626865671
Counter({1: 54, 0: 13})

Which percentile : 20
normal rf: 0.8582089552238806
feature selection rf: 0.7985074626865671
Counter({1: 53, 0: 14})

Which percentile : 25
normal rf: 0.8582089552238806
feature selection rf: 0.7761194029850746
Counter({1: 52, 0: 15})

Which percentile : 30
normal rf: 0.8582089552238806
feature selection rf: 0.7985074626865671
Counter({1: 53, 0: 14})

Which percentile : 35
normal rf: 0.8582089552238806
feature selection rf: 0.7985074626865671
Counter({1: 54, 0: 13})

Which percentile : 40
normal rf: 0.8582089552238806
feature selection rf: 0.8283582089552238
Counter({1: 56, 0: 11})

Which percentile : 45
normal rf: 0.8582089552238806
feature selection rf: 0.8283582089552238
Counter({1: 55, 0: 12})

Which percentile : 50
normal rf: 0.8582089552238806
fea

In [113]:
# based on the output of the univariate, we can narrow to 60, 80
select_80_rf = SelectPercentile(percentile=80)
select_90_rf = SelectPercentile(percentile=90)

In [114]:
select_80_rf.fit(train_data_x, train_data_y)

train_data_x_selected_80_rf = select_80_rf.transform(train_data_x)
test_data_x_selected_80_rf = select_80_rf.transform(test_data_x)

mask = select_80_rf.get_support()        
# print(mask)
rf_80 = RandomForestClassifier(n_estimators = 150, max_depth=15,random_state=0,warm_start=False)
rf_80.fit(train_data_x_selected_80_rf,train_data_y)

rf_80.score(test_data_x_selected_80_rf,test_data_y)

0.87313432835820892

In [117]:
rf_80.predict(test_data_x_selected_80_rf)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0])

In [118]:
select_90_rf.fit(train_data_x, train_data_y)

train_data_x_selected_90_rf = select_90_rf.transform(train_data_x)
test_data_x_selected_90_rf = select_90_rf.transform(test_data_x)

mask = select_90_rf.get_support()        
# print(mask)
rf_90 = RandomForestClassifier(n_estimators = 150, max_depth=15,random_state=0,warm_start=False)
rf_90.fit(train_data_x_selected_90_rf,train_data_y)

rf_90.score(test_data_x_selected_90_rf,test_data_y)

0.85820895522388063

In [121]:
rf_90.predict(test_data_x_selected_90_rf)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [124]:
Counter(rf_80.predict(test_data_x_selected_80_rf)[:67])

Counter({0: 9, 1: 58})

In [123]:
Counter(rf_90.predict(test_data_x_selected_90_rf)[:67])

Counter({0: 8, 1: 59})

In [125]:
## RFE CV from LR
# for i in [1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20]:
model_rfe = LogisticRegression()
rfe = RFECV(model_rfe, step=1, cv=5)
fit = rfe.fit(train_data_x, train_data_y)
print("Num Features: "+ str(fit.n_features_))
print("Selected Features: " + str(fit.support_))
print("Feature Ranking: " + str(fit.ranking_))

train_data_x_selected_rfe = fit.transform(train_data_x)
test_data_x_selected_rfe = fit.transform(test_data_x)

model_rfe.fit(train_data_x_selected_rfe,train_data_y).score(test_data_x_selected_rfe,test_data_y)

Num Features: 10
Selected Features: [False False  True False  True False False  True  True  True False  True
 False False False  True  True False  True  True]
Feature Ranking: [11  6  1  5  1  9  3  1  1  1  7  1  8  4  2  1  1 10  1  1]


0.82089552238805974

In [126]:
rf_model = RandomForestClassifier(n_estimators = 150, max_depth=15,random_state=0,warm_start=False)

In [127]:
rf_model.fit(train_data_x_selected_rfe,train_data_y).score(test_data_x_selected_rfe,test_data_y)

0.85074626865671643

In [129]:
rf_model.fit(train_data_x_selected_rfe,train_data_y).predict(test_data_x_selected_rfe)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

## SVM

In [131]:
## SVM
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select = SelectPercentile(percentile=i)

    select.fit(train_data_x, train_data_y)

    train_data_x_selected = select.transform(train_data_x)
    test_data_x_selected = select.transform(test_data_x)

    mask = select.get_support()    
#     print(mask)
    svm = SVC(probability=True)
    svm.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal svm: {}".format(svm.score(test_data_x,test_data_y)))

    svm.fit(train_data_x_selected,train_data_y)
    print("feature selection svm: {}".format(svm.score(test_data_x_selected,test_data_y)))
    print(Counter(svm.predict(test_data_x_selected)[:67]))
#     print(svm.predict(test_data_x_selected)[:67])
#     print(svm.predict_proba(test_data_x_selected)[0:10])

    
# 2o percentile is the best FE for logistics regression


Which percentile : 10
normal svm: 0.8955223880597015
feature selection svm: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 15
normal svm: 0.8955223880597015
feature selection svm: 0.8656716417910447
Counter({1: 58, 0: 9})

Which percentile : 20
normal svm: 0.8955223880597015
feature selection svm: 0.835820895522388
Counter({1: 56, 0: 11})

Which percentile : 25
normal svm: 0.8955223880597015
feature selection svm: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 30
normal svm: 0.8955223880597015
feature selection svm: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 35
normal svm: 0.8955223880597015
feature selection svm: 0.8208955223880597
Counter({1: 55, 0: 12})

Which percentile : 40
normal svm: 0.8955223880597015
feature selection svm: 0.835820895522388
Counter({1: 56, 0: 11})

Which percentile : 45
normal svm: 0.8955223880597015
feature selection svm: 0.835820895522388
Counter({1: 56, 0: 11})

Which percentile : 50
normal svm: 0.8955223

In [802]:
test_df.head(10)

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds,total_def_rebounds,total_assists,...,is_playoff,is_champion,TeamID,fgp,total_block_opp_FGA_percent,win_rate_away,win_rate_home,win_rate_neutral,win_rate_post,win_rate_regular
2053,2017,1243,11,1448,11,0,1,0.474946,0.679408,0.63621,...,0.34375,0.0,1243.0,0.458474,0.068493,0.5,0.647059,0.666667,0.521739,0.738318
2054,2017,1291,16,1309,16,0,1,0.357298,0.681874,0.441624,...,0.0625,0.0,1291.0,0.443673,0.070649,0.666667,0.5,0.0,0.333333,0.530303
2055,2017,1413,16,1300,16,0,1,0.503268,0.72873,0.477157,...,0.0625,0.0,1413.0,0.432099,0.061336,1.0,0.5,0.666667,0.6,0.645161
2056,2017,1425,11,1344,11,0,1,0.566449,0.763255,0.654822,...,0.09375,0.0,1425.0,0.453718,0.086307,0.666667,0.736842,0.8,0.571429,0.683673
2057,2017,1112,2,1315,15,13,1,0.553377,0.843403,0.593909,...,0.3125,0.0,1112.0,0.475707,0.056555,0.9,0.9375,0.75,0.655172,0.805471
2058,2017,1139,4,1457,13,9,1,0.383442,0.622688,0.492386,...,0.09375,0.0,1139.0,0.477586,0.052378,0.7,0.764706,0.75,0.571429,0.702128
2059,2017,1196,4,1190,13,9,1,0.59695,0.750925,0.483926,...,0.03125,0.0,1196.0,0.449551,0.083874,0.888889,0.692308,0.7,0.75,0.75
2060,2017,1199,3,1195,14,11,1,0.638344,0.789149,0.620981,...,0.25,0.0,1199.0,0.483268,0.086777,1.0,0.75,0.666667,0.466667,0.722222
2061,2017,1211,1,1355,16,15,1,0.464052,0.992602,0.695431,...,0.5625,0.0,1211.0,0.517829,0.072989,0.9,1.0,1.0,0.590909,0.834395
2062,2017,1235,5,1305,12,7,1,0.461874,0.775586,0.683587,...,0.09375,0.0,1235.0,0.468629,0.057589,0.625,0.666667,0.857143,0.571429,0.763441


In [132]:
# based on the output of the univariate, we can narrow to 100
select_100_svm = SelectPercentile(percentile=100)

select_100_svm.fit(train_data_x, train_data_y)

train_data_x_selected_100_svm = select_100_svm.transform(train_data_x)
test_data_x_selected_100_svm = select_100_svm.transform(test_data_x)

mask = select_100_svm.get_support()        
# print(mask)
svm_100 = SVC(probability=True)
svm_100.fit(train_data_x_selected_100_svm,train_data_y)

svm_100.score(test_data_x_selected_100_svm,test_data_y)

svm_100.predict(test_data_x_selected_100_svm)[:67]

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [768]:
rf_fi = RandomForestClassifier(n_estimators = 100, max_depth=10,random_state=0)

In [796]:
rf_fi_values = (
    pd.DataFrame(rf_fi.fit(train_data_x,train_data_y).feature_importances_,index=train_data_x.columns)
    .rename(columns={0:"feature_importance_values"})
    .reset_index()
    .rename(columns={"index":"features"})
    .sort_values(['feature_importance_values'],ascending=False)
    .pipe(lambda x:x.assign(fi_cumsum = x.feature_importance_values.cumsum()))
    .query("fi_cumsum <= 0.95")
)

In [798]:
rf_fi_values.features.unique()

array(['diff_seed', 'win_rate_post', 'win_rate_home', 'avg_win_score_by',
       'win_rate_regular', 'fgp', 'total_assist_per_fgm',
       'total_block_opp_FGA_percent', 'is_playoff', 'total_steals',
       'total_def_rebounds', 'total_assists', 'total_turnover',
       'total_personalfoul', 'avg_lose_score_by', 'total_off_rebounds',
       'win_rate_away'], dtype=object)

In [816]:
svm_train_data_x = train_df[['diff_seed', 'win_rate_post', 'win_rate_home', 'avg_win_score_by',
       'win_rate_regular', 'fgp', 'total_assist_per_fgm',
       'total_block_opp_FGA_percent', 'is_playoff', 'total_steals',
       'total_def_rebounds', 'total_assists', 'total_turnover',
       'total_personalfoul', 'avg_lose_score_by', 'total_off_rebounds',
       'win_rate_away']]
svm_train_data_y = train_df[['outcome']]

svm_test_data_x = test_df[['diff_seed', 'win_rate_post', 'win_rate_home', 'avg_win_score_by',
       'win_rate_regular', 'fgp', 'total_assist_per_fgm',
       'total_block_opp_FGA_percent', 'is_playoff', 'total_steals',
       'total_def_rebounds', 'total_assists', 'total_turnover',
       'total_personalfoul', 'avg_lose_score_by', 'total_off_rebounds',
       'win_rate_away']]
svm_test_data_y = test_df[['outcome']]

In [817]:
svm_fs = SVC()

In [818]:
svm_fs.fit(svm_train_data_x,svm_train_data_y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [819]:
svm_fs.score(svm_test_data_x,svm_test_data_y)

0.77611940298507465

In [146]:
svm_fs_df = pd.DataFrame(svm_100.predict(test_data_x_selected_100_svm)[:67]).rename(columns={0:"svm_100"})

In [147]:
log_rf_fs_df = pd.DataFrame(LogisticRegression().fit(train_data_x_selected,train_data_y).predict(test_data_x_selected)[:67]).rename(columns={0:"log_rf_fs_df"})

In [148]:
rf_80_df = pd.DataFrame(rf_80.predict(test_data_x_selected_80_rf)[:67]).rename(columns={0:"rf_80_fs"})

In [149]:
rf_90_df = pd.DataFrame(rf_90.predict(test_data_x_selected_90_rf)[:67]).rename(columns={0:"rf_90_fs"})

In [150]:
rf_rfe_df = pd.DataFrame(rf_model.fit(train_data_x_selected_rfe,train_data_y).predict(test_data_x_selected_rfe)[:67]).rename(columns={0:"rf_rfe"})

In [151]:
log_15_df = pd.DataFrame(logreg_25.predict(test_data_x_selected_25)[:67]).rename(columns={0:"log_25_fs"})


In [153]:
(
    svm_fs_df
    .merge(log_rf_fs_df,how='outer', left_index=True, right_index=True)
    .merge(rf_80_df,how='outer',left_index=True, right_index=True)
    .merge(rf_90_df,how='outer', left_index=True, right_index=True)
    .merge(rf_rfe_df,how='outer', left_index=True, right_index=True)
    .merge(log_15_df,how='outer', left_index=True, right_index=True)
).to_csv("output/final_results_cumulative_year.csv",index=False)

In [156]:
seeding_data = pd.read_csv("data/DataFiles/Stage2UpdatedDataFiles/NCAATourneySeeds.csv")

In [159]:
seeding_data.query("Season == 2018").to_csv("output/different_teams.csv")

In [161]:
seeding_data_teams = pd.read_csv("output/different_teams.csv")

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed
0,1985.0,1116,9.0,1234.0,8.0
1,1985.0,1120,11.0,1345.0,6.0
2,1985.0,1207,1.0,1250.0,16.0
3,1985.0,1229,9.0,1425.0,8.0
4,1985.0,1242,3.0,1325.0,14.0
5,1985.0,1246,12.0,1449.0,5.0
6,1985.0,1256,5.0,1338.0,12.0
7,1985.0,1260,4.0,1233.0,13.0
8,1985.0,1314,2.0,1292.0,15.0
9,1985.0,1323,7.0,1333.0,10.0


In [163]:
unique_teams = seeding_data_teams.TeamID.unique()

In [176]:
seeding_data_2018 = pd.read_csv("output/match_up_2018.csv")

seeding_data_2018.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed
0,2018.0,1437,1.0,1345.0,2.0
1,2018.0,1437,1.0,1403.0,3.0
2,2018.0,1437,1.0,1455.0,4.0
3,2018.0,1437,1.0,1452.0,5.0
4,2018.0,1437,1.0,1196.0,6.0


In [177]:
seeding_data.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [170]:
train_df.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds_x,total_def_rebounds_x,total_assists_x,...,diff_num_season,diff_is_playoff,diff_is_champion,diff_fgp,diff_total_block_opp_FGA_percent,diff_win_rate_away,diff_win_rate_home,diff_win_rate_neutral,diff_win_rate_post,diff_win_rate_regular
1136,2003,1421,16,1411,16,0,1,0.037785,0.03577,0.031439,...,0.152174,0.0,0.0,-0.018262,0.012232,-0.071429,-0.162281,0.25,0.5,-0.151724
1137,2003,1112,1,1436,16,-15,1,0.047813,0.043772,0.045466,...,0.003557,-0.59375,-0.2,-0.016969,-0.011306,-0.041667,-0.370833,0.6,-0.641509,-0.237685
1138,2003,1113,10,1272,7,3,1,0.043744,0.036084,0.040266,...,0.0,-0.09375,0.0,0.040251,-0.011396,-0.206349,-0.111111,-0.5,-0.397059,-0.172414
1139,2003,1141,11,1166,6,5,1,0.030664,0.036006,0.040508,...,-0.243478,-0.15625,0.0,0.005763,-0.011456,-0.1,-0.122024,0.0,0.25,-0.085684
1140,2003,1143,8,1301,9,-1,1,0.033425,0.038516,0.041838,...,-0.227668,-0.125,0.0,-0.009399,0.010209,-0.375,-0.114706,0.25,-0.1,-0.124138
