___
this notebook select features that are important before throwing into the model

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_selection import *
from sklearn import *
from scipy import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, SelectFromModel, SelectPercentile,RFECV
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from aggregate_function import build_features_table, combine_features_table, coach_stats, win_rate_type_of_location



In [2]:
coach_file = 'data/DataFiles/Stage2UpdatedDataFiles/TeamCoaches.csv'
regularseason_file = 'data/DataFiles/Stage2UpdatedDataFiles/RegularSeasonDetailedResults.csv'
postseason_file = 'data/DataFiles/NCAATourneyCompactResults.csv'

In [3]:
initial_features = build_features_table.BuildFeaturesTable(regularseason_file)
win_rate_features = win_rate_type_of_location.WinRateTypeLocation(regularseason_file)
coach_features = coach_stats.CoachStats(coach_file,regularseason_file,postseason_file)

features = combine_features_table.CombineFeaturesTable(initial_features,win_rate_features,coach_features)

## Final data transformation for feature table
- post season is only what we care about
- post season match ups will be what we are joining all the features table to
- additional variable of seeding differential

In [20]:
features_table = (
    features.final_table_processed
    .drop(['total_score','total_rebounds','total_blocks','total_assist_turnover_ratio','expectation_per_game',
           'win_rate','total_rebound_possession_percent','win_rate_overall','total_off_rebounds_percent','total_def_rebounds_percent',
           'total_opponent_score','total_rebound_possessiongain_percent','fg3p'
          ],1)
    .fillna(0)
)

In [21]:
seeding_data = pd.read_csv("input/tour-results-seed.csv")
seeding_data_2018 = pd.read_csv("output/match_up_2018.csv")
seeding_data = pd.read_csv("input/tour-results-seed.csv").append(seeding_data_2018)

In [22]:
winning_team_perspective_df = (
    seeding_data
    .pipe(lambda x:x.assign(diff_seed = x.L_seed - x.W_seed))
    .pipe(lambda x:x.assign(outcome = 1))
    .merge(features_table,how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .merge(features_table,how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .pipe(lambda x:x.assign(diff_total_off_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_off_rebounds_x - x.total_off_rebounds_y,
                                                                x.total_off_rebounds_y - x.total_off_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_def_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_def_rebounds_x - x.total_def_rebounds_y,
                                                                x.total_def_rebounds_y - x.total_def_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_assists = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assists_x - x.total_assists_y,
                                                                x.total_assists_y - x.total_assists_x)))
    .pipe(lambda x:x.assign(diff_total_steals = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_steals_x - x.total_steals_y,
                                                                x.total_steals_y - x.total_steals_x)))
    .pipe(lambda x:x.assign(diff_total_turnover = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_turnover_x - x.total_turnover_y,
                                                                x.total_turnover_y - x.total_turnover_x)))
    .pipe(lambda x:x.assign(diff_total_personalfoul = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_personalfoul_x - x.total_personalfoul_y,
                                                                x.total_personalfoul_y - x.total_personalfoul_x)))
    .pipe(lambda x:x.assign(diff_total_assist_per_fgm = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assist_per_fgm_x - x.total_assist_per_fgm_y,
                                                                x.total_assist_per_fgm_y - x.total_assist_per_fgm_x)))
    .pipe(lambda x:x.assign(diff_avg_lose_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_lose_score_by_x - x.avg_lose_score_by_y,
                                                                x.avg_lose_score_by_y - x.avg_lose_score_by_x)))
    .pipe(lambda x:x.assign(diff_avg_win_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_win_score_by_x - x.avg_win_score_by_y,
                                                                x.avg_win_score_by_y - x.avg_win_score_by_x)))
    .pipe(lambda x:x.assign(diff_num_season = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.num_season_x - x.num_season_y,
                                                                x.num_season_y - x.num_season_x)))
    .pipe(lambda x:x.assign(diff_is_playoff = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_playoff_x - x.is_playoff_y,
                                                                x.is_playoff_y - x.is_playoff_x)))
    .pipe(lambda x:x.assign(diff_is_champion = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_champion_x - x.is_champion_y,
                                                                x.is_champion_y - x.is_champion_x)))
    .pipe(lambda x:x.assign(diff_fgp = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.fgp_x - x.fgp_y,
                                                                x.fgp_y - x.fgp_x)))
    .pipe(lambda x:x.assign(diff_total_block_opp_FGA_percent = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_block_opp_FGA_percent_x - x.total_block_opp_FGA_percent_y,
                                                                x.total_block_opp_FGA_percent_y - x.total_block_opp_FGA_percent_x)))
    .pipe(lambda x:x.assign(diff_win_rate_away = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_away_x - x.win_rate_away_y,
                                                                x.win_rate_away_y - x.win_rate_away_x)))
    .pipe(lambda x:x.assign(diff_win_rate_home = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_home_x - x.win_rate_home_y,
                                                                x.win_rate_home_y - x.win_rate_home_x)))
    .pipe(lambda x:x.assign(diff_win_rate_neutral = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_neutral_x - x.win_rate_neutral_y,
                                                                x.win_rate_neutral_y - x.win_rate_neutral_x)))
    .pipe(lambda x:x.assign(diff_win_rate_post = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_post_x - x.win_rate_post_y,
                                                                x.win_rate_post_y - x.win_rate_post_x)))
    .pipe(lambda x:x.assign(diff_win_rate_regular = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_regular_x - x.win_rate_regular_y,
                                                                x.win_rate_regular_y - x.win_rate_regular_x)))
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed
    ))
)

winning_team_perspective_df.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds_x,total_def_rebounds_x,total_assists_x,...,diff_num_season,diff_is_playoff,diff_is_champion,diff_fgp,diff_total_block_opp_FGA_percent,diff_win_rate_away,diff_win_rate_home,diff_win_rate_neutral,diff_win_rate_post,diff_win_rate_regular
0,1985,1116,9,1234,8,1,1,,,,...,,,,,,,,,,
1,1985,1120,11,1345,6,5,1,,,,...,,,,,,,,,,
2,1985,1207,1,1250,16,-15,1,,,,...,,,,,,,,,,
3,1985,1229,9,1425,8,1,1,,,,...,,,,,,,,,,
4,1985,1242,3,1325,14,-11,1,,,,...,,,,,,,,,,


In [23]:
losing_team_perspective_df = (
    seeding_data
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed))
    .pipe(lambda x:x.assign(outcome = 0))
    .merge(features_table,how='left',left_on=['Season','LTeamID'],right_on=['Season','TeamID'])
    .merge(features_table,how='left',left_on=['Season','WTeamID'],right_on=['Season','TeamID'])
    .pipe(lambda x:x.assign(diff_total_off_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_off_rebounds_x - x.total_off_rebounds_y,
                                                                x.total_off_rebounds_y - x.total_off_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_def_rebounds = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_def_rebounds_x - x.total_def_rebounds_y,
                                                                x.total_def_rebounds_y - x.total_def_rebounds_x)))
    .pipe(lambda x:x.assign(diff_total_assists = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assists_x - x.total_assists_y,
                                                                x.total_assists_y - x.total_assists_x)))
    .pipe(lambda x:x.assign(diff_total_steals = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_steals_x - x.total_steals_y,
                                                                x.total_steals_y - x.total_steals_x)))
    .pipe(lambda x:x.assign(diff_total_turnover = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_turnover_x - x.total_turnover_y,
                                                                x.total_turnover_y - x.total_turnover_x)))
    .pipe(lambda x:x.assign(diff_total_personalfoul = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_personalfoul_x - x.total_personalfoul_y,
                                                                x.total_personalfoul_y - x.total_personalfoul_x)))
    .pipe(lambda x:x.assign(diff_total_assist_per_fgm = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_assist_per_fgm_x - x.total_assist_per_fgm_y,
                                                                x.total_assist_per_fgm_y - x.total_assist_per_fgm_x)))
    .pipe(lambda x:x.assign(diff_avg_lose_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_lose_score_by_x - x.avg_lose_score_by_y,
                                                                x.avg_lose_score_by_y - x.avg_lose_score_by_x)))
    .pipe(lambda x:x.assign(diff_avg_win_score_by = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.avg_win_score_by_x - x.avg_win_score_by_y,
                                                                x.avg_win_score_by_y - x.avg_win_score_by_x)))
    .pipe(lambda x:x.assign(diff_num_season = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.num_season_x - x.num_season_y,
                                                                x.num_season_y - x.num_season_x)))
    .pipe(lambda x:x.assign(diff_is_playoff = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_playoff_x - x.is_playoff_y,
                                                                x.is_playoff_y - x.is_playoff_x)))
    .pipe(lambda x:x.assign(diff_is_champion = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.is_champion_x - x.is_champion_y,
                                                                x.is_champion_y - x.is_champion_x)))
    .pipe(lambda x:x.assign(diff_fgp = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.fgp_x - x.fgp_y,
                                                                x.fgp_y - x.fgp_x)))
    .pipe(lambda x:x.assign(diff_total_block_opp_FGA_percent = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.total_block_opp_FGA_percent_x - x.total_block_opp_FGA_percent_y,
                                                                x.total_block_opp_FGA_percent_y - x.total_block_opp_FGA_percent_x)))
    .pipe(lambda x:x.assign(diff_win_rate_away = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_away_x - x.win_rate_away_y,
                                                                x.win_rate_away_y - x.win_rate_away_x)))
    .pipe(lambda x:x.assign(diff_win_rate_home = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_home_x - x.win_rate_home_y,
                                                                x.win_rate_home_y - x.win_rate_home_x)))
    .pipe(lambda x:x.assign(diff_win_rate_neutral = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_neutral_x - x.win_rate_neutral_y,
                                                                x.win_rate_neutral_y - x.win_rate_neutral_x)))
    .pipe(lambda x:x.assign(diff_win_rate_post = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_post_x - x.win_rate_post_y,
                                                                x.win_rate_post_y - x.win_rate_post_x)))
    .pipe(lambda x:x.assign(diff_win_rate_regular = np.where(
                                                                x.W_seed >= x.L_seed,
                                                                x.win_rate_regular_x - x.win_rate_regular_y,
                                                                x.win_rate_regular_y - x.win_rate_regular_x)))
    .pipe(lambda x:x.assign(diff_seed = x.W_seed - x.L_seed
    ))
)

In [25]:
prediction_df = (
    winning_team_perspective_df.append(losing_team_perspective_df)
)

train_df = prediction_df.query("Season >= 2003 & Season <= 2017")
test_df = prediction_df.query("Season == 2018")

test_df.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds_x,total_def_rebounds_x,total_assists_x,...,diff_num_season,diff_is_playoff,diff_is_champion,diff_fgp,diff_total_block_opp_FGA_percent,diff_win_rate_away,diff_win_rate_home,diff_win_rate_neutral,diff_win_rate_post,diff_win_rate_regular
2124,2018,1437,1,1345,2,-1,1,0.485839,0.832306,0.759729,...,-0.295359,-0.125,-0.2,-0.007414,0.018162,-0.011111,0.088889,-0.428571,-0.076923,-0.045139
2125,2018,1437,1,1403,3,-2,1,0.485839,0.832306,0.759729,...,-0.620253,-0.40625,-0.2,-0.040324,0.01313,-0.1,-0.06087,-0.4,-0.076923,-0.01013
2126,2018,1437,1,1455,4,-3,1,0.485839,0.832306,0.759729,...,-0.118143,-0.03125,-0.2,-0.025907,0.001719,-0.15,0.066667,-0.4,-0.021368,-0.006173
2127,2018,1437,1,1452,5,-4,1,0.485839,0.832306,0.759729,...,0.265823,0.28125,-0.2,-0.070479,0.033439,-0.275,-0.063158,-0.285714,-0.035256,-0.049791
2128,2018,1437,1,1196,6,-5,1,0.485839,0.832306,0.759729,...,-0.50211,-0.40625,-0.2,-0.070126,0.016339,-0.4,-0.05,-0.5,0.173077,-0.044436


In [26]:
train_data_x = train_df[['diff_seed','diff_total_off_rebounds','diff_total_def_rebounds','diff_total_assists',
                         'diff_total_steals','diff_total_turnover','diff_total_personalfoul',
                         'diff_total_assist_per_fgm','diff_avg_lose_score_by',
                         'diff_avg_win_score_by','diff_num_season','diff_is_playoff','diff_is_champion',
                         'diff_fgp','diff_total_block_opp_FGA_percent','diff_win_rate_away','diff_win_rate_home',
                         'diff_win_rate_neutral','diff_win_rate_post','diff_win_rate_regular']]
train_data_y = train_df['outcome']

test_data_x = test_df[['diff_seed','diff_total_off_rebounds','diff_total_def_rebounds','diff_total_assists',
                       'diff_total_steals','diff_total_turnover','diff_total_personalfoul',
                       'diff_total_assist_per_fgm','diff_avg_lose_score_by',
                       'diff_avg_win_score_by','diff_num_season','diff_is_playoff','diff_is_champion',
                       'diff_fgp','diff_total_block_opp_FGA_percent','diff_win_rate_away','diff_win_rate_home',
                       'diff_win_rate_neutral','diff_win_rate_post','diff_win_rate_regular']]
test_data_y = test_df['outcome']

## Feature selection for logistics regression
- Univariate selection
- SelectFromModel from RF, lassoCV

In [35]:
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select = SelectPercentile(percentile=i)

    select.fit(train_data_x, train_data_y)

    train_data_x_selected = select.transform(train_data_x)
    test_data_x_selected = select.transform(test_data_x)

    mask = select.get_support()    
#     print(mask)
    logreg = LogisticRegression()
    logreg.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal logreg: {}".format(logreg.score(train_data_x,train_data_y)))

    logreg.fit(train_data_x_selected,train_data_y)
    print("feature selection logreg: {}".format(logreg.score(train_data_x_selected,train_data_y)))
    
# 2o percentile is the best FE for logistics regression


Which percentile : 10
normal logreg: 0.832995951417004
feature selection logreg: 0.8071862348178138

Which percentile : 15
normal logreg: 0.832995951417004
feature selection logreg: 0.8076923076923077

Which percentile : 20
normal logreg: 0.832995951417004
feature selection logreg: 0.8107287449392713

Which percentile : 25
normal logreg: 0.832995951417004
feature selection logreg: 0.8299595141700404

Which percentile : 30
normal logreg: 0.832995951417004
feature selection logreg: 0.8269230769230769

Which percentile : 35
normal logreg: 0.832995951417004
feature selection logreg: 0.8279352226720648

Which percentile : 40
normal logreg: 0.832995951417004
feature selection logreg: 0.8279352226720648

Which percentile : 45
normal logreg: 0.832995951417004
feature selection logreg: 0.8259109311740891

Which percentile : 50
normal logreg: 0.832995951417004
feature selection logreg: 0.8289473684210527

Which percentile : 55
normal logreg: 0.832995951417004
feature selection logreg: 0.8279352

In [36]:
# based on the output of the univariate, we can narrow to 10, 25, 80
select_25 = SelectPercentile(percentile=80)

In [37]:
select_25.fit(train_data_x, train_data_y)

train_data_x_selected_25 = select_25.transform(train_data_x)
test_data_x_selected_25 = select_25.transform(test_data_x)

mask = select_25.get_support()    
#     print(mask)
logreg_25 = LogisticRegression()
logreg_25.fit(train_data_x_selected_25,train_data_y)

logreg_25.score(test_data_x_selected_25,test_data_y)

logreg_25.predict(test_data_x_selected_25)[:67]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [85]:
select_85.fit(train_data_x, train_data_y)

train_data_x_selected_85 = select_85.transform(train_data_x)
test_data_x_selected_85 = select_85.transform(test_data_x)

mask = select_85.get_support()    
#     print(mask)
logreg_85 = LogisticRegression()
logreg_85.fit(train_data_x_selected_85,train_data_y)

logreg_85.score(test_data_x_selected_85,test_data_y)

logreg_85.predict(test_data_x_selected_85)

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [31]:
## selectfrommodel RF
select_rf = SelectFromModel(RandomForestClassifier(n_estimators =100, max_depth = 10, random_state=0),threshold=0.04)

In [32]:
select_rf.fit(train_data_x,train_data_y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.04)

In [33]:
train_data_x_selected = select_rf.transform(train_data_x)
test_data_x_selected = select_rf.transform(test_data_x)

In [34]:
LogisticRegression().fit(train_data_x_selected,train_data_y).score(test_data_x_selected,test_data_y)

0.82835820895522383

In [82]:
## selectfrommodel lassoCV --> same as univariate

In [88]:
select_lcv = SelectFromModel(LassoCV(max_iter=100,n_alphas=10,eps=1e-05),threshold=0.01)

select_lcv.fit(train_data_x,train_data_y)



SelectFromModel(estimator=LassoCV(alphas=None, copy_X=True, cv=None, eps=1e-05, fit_intercept=True,
    max_iter=100, n_alphas=10, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False),
        norm_order=1, prefit=False, threshold=0.01)

In [89]:
train_data_x_selected_lcv = select_lcv.transform(train_data_x)
test_data_x_selected_lcv = select_lcv.transform(test_data_x)

In [90]:
LogisticRegression().fit(train_data_x_selected_lcv,train_data_y).score(test_data_x_selected_lcv,test_data_y)

0.85074626865671643

In [493]:
# select from model lassocv
lassocv = LassoCV(random_state=0)
param_grid_lasso = {
    'n_alphas': [1,5,10,25,50,100,150,200,500,1000],
    'max_iter': [100,500,1000,1500,2000,3000],
    'eps': [0.00001,0.0001,0.001,0.01]
}
grid_lcv = GridSearchCV(lassocv, param_grid_lasso, cv=5, verbose=2)
grid_lcv.fit(train_data_x_selected, train_data_y)

lcv_model = grid_lcv.best_estimator_


Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV] eps=1e-05, max_iter=100, n_alphas=1 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=1 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=1 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=1 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=1 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=5 .............................
[CV] .................... eps=1e-05, max_iter=100, n_alphas=5 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=5 .............................
[CV] ........

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................... eps=1e-05, max_iter=100, n_alphas=25 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=50 ............................
[CV] ................... eps=1e-05, max_iter=100, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=50 ............................
[CV] ................... eps=1e-05, max_iter=100, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=50 ............................
[CV] ................... eps=1e-05, max_iter=100, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=50 ............................
[CV] ................... eps=1e-05, max_iter=100, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=50 ............................
[CV] ................... eps=1e-05, max_iter=100, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=100 ...........................
[CV] .................. eps=1e-05, max_iter=100, n_alphas=100 -   0.0s
[CV] eps=1e-05, max_iter=100, n_alphas=100 ...........................
[CV] .

[CV] .................. eps=1e-05, max_iter=500, n_alphas=150 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=150 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=150 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=150 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=150 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=150 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=150 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=150 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=150 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=200 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=200 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=200 ...........................
[CV] .................. eps=1e-05, max_iter=500, n_alphas=200 -   0.0s
[CV] eps=1e-05, max_iter=500, n_alphas=200 ...........................
[CV] .

[CV] ................. eps=1e-05, max_iter=1000, n_alphas=500 -   0.1s
[CV] eps=1e-05, max_iter=1000, n_alphas=500 ..........................
[CV] ................. eps=1e-05, max_iter=1000, n_alphas=500 -   0.1s
[CV] eps=1e-05, max_iter=1000, n_alphas=500 ..........................
[CV] ................. eps=1e-05, max_iter=1000, n_alphas=500 -   0.1s
[CV] eps=1e-05, max_iter=1000, n_alphas=500 ..........................
[CV] ................. eps=1e-05, max_iter=1000, n_alphas=500 -   0.1s
[CV] eps=1e-05, max_iter=1000, n_alphas=1000 .........................
[CV] ................ eps=1e-05, max_iter=1000, n_alphas=1000 -   0.2s
[CV] eps=1e-05, max_iter=1000, n_alphas=1000 .........................
[CV] ................ eps=1e-05, max_iter=1000, n_alphas=1000 -   0.2s
[CV] eps=1e-05, max_iter=1000, n_alphas=1000 .........................
[CV] ................ eps=1e-05, max_iter=1000, n_alphas=1000 -   0.2s
[CV] eps=1e-05, max_iter=1000, n_alphas=1000 .........................
[CV] .

[CV] ................ eps=1e-05, max_iter=1500, n_alphas=1000 -   0.2s
[CV] eps=1e-05, max_iter=2000, n_alphas=1 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=1 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=1 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=1 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=1 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=1 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=5 ............................
[CV] ................... eps=1e-05, max_iter=2000, n_alphas=5 -   0.0s
[CV] eps=1e-05, max_iter=2000, n_alphas=5 ............................
[CV] .

[CV] .................. eps=1e-05, max_iter=3000, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=50 ...........................
[CV] .................. eps=1e-05, max_iter=3000, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=50 ...........................
[CV] .................. eps=1e-05, max_iter=3000, n_alphas=50 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=100 ..........................
[CV] ................. eps=1e-05, max_iter=3000, n_alphas=100 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=100 ..........................
[CV] ................. eps=1e-05, max_iter=3000, n_alphas=100 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=100 ..........................
[CV] ................. eps=1e-05, max_iter=3000, n_alphas=100 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=100 ..........................
[CV] ................. eps=1e-05, max_iter=3000, n_alphas=100 -   0.0s
[CV] eps=1e-05, max_iter=3000, n_alphas=100 ..........................
[CV] .

[CV] ................. eps=0.0001, max_iter=100, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=150 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=150 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=150 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=150 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=200 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=200 -   0.0s
[CV] eps=0.0001, max_iter=100, n_alphas=200 ..........................
[CV] ................. eps=0.0001, max_iter=100, n_alphas=200 -   0.1s
[CV] eps=0.0001, max_iter=100, n_alphas=200 ..........................
[CV] .

[CV] ................. eps=0.0001, max_iter=500, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=500, n_alphas=500 ..........................
[CV] ................. eps=0.0001, max_iter=500, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=500, n_alphas=500 ..........................
[CV] ................. eps=0.0001, max_iter=500, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=500, n_alphas=500 ..........................
[CV] ................. eps=0.0001, max_iter=500, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=500, n_alphas=500 ..........................
[CV] ................. eps=0.0001, max_iter=500, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=500, n_alphas=1000 .........................
[CV] ................ eps=0.0001, max_iter=500, n_alphas=1000 -   0.2s
[CV] eps=0.0001, max_iter=500, n_alphas=1000 .........................
[CV] ................ eps=0.0001, max_iter=500, n_alphas=1000 -   0.2s
[CV] eps=0.0001, max_iter=500, n_alphas=1000 .........................
[CV] .

[CV] ............... eps=0.0001, max_iter=1000, n_alphas=1000 -   0.2s
[CV] eps=0.0001, max_iter=1500, n_alphas=1 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=1 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=1 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=1 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=1 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=1 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=1 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=1 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=1 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=1 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=5 ...........................
[CV] .................. eps=0.0001, max_iter=1500, n_alphas=5 -   0.0s
[CV] eps=0.0001, max_iter=1500, n_alphas=5 ...........................
[CV] .

[CV] ................ eps=0.0001, max_iter=2000, n_alphas=100 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=100 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=100 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=100 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=100 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=100 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=100 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=100 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=100 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=150 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=150 .........................
[CV] ................ eps=0.0001, max_iter=2000, n_alphas=150 -   0.0s
[CV] eps=0.0001, max_iter=2000, n_alphas=150 .........................
[CV] .

[CV] ................ eps=0.0001, max_iter=3000, n_alphas=200 -   0.0s
[CV] eps=0.0001, max_iter=3000, n_alphas=200 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=200 -   0.0s
[CV] eps=0.0001, max_iter=3000, n_alphas=500 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=3000, n_alphas=500 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=3000, n_alphas=500 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=3000, n_alphas=500 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=3000, n_alphas=500 .........................
[CV] ................ eps=0.0001, max_iter=3000, n_alphas=500 -   0.1s
[CV] eps=0.0001, max_iter=3000, n_alphas=1000 ........................
[CV] .

[CV] ................. eps=0.001, max_iter=100, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=100, n_alphas=1000 ..........................
[CV] ................. eps=0.001, max_iter=100, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=100, n_alphas=1000 ..........................
[CV] ................. eps=0.001, max_iter=100, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=500, n_alphas=1 .............................
[CV] .................... eps=0.001, max_iter=500, n_alphas=1 -   0.0s
[CV] eps=0.001, max_iter=500, n_alphas=1 .............................
[CV] .................... eps=0.001, max_iter=500, n_alphas=1 -   0.0s
[CV] eps=0.001, max_iter=500, n_alphas=1 .............................
[CV] .................... eps=0.001, max_iter=500, n_alphas=1 -   0.0s
[CV] eps=0.001, max_iter=500, n_alphas=1 .............................
[CV] .................... eps=0.001, max_iter=500, n_alphas=1 -   0.0s
[CV] eps=0.001, max_iter=500, n_alphas=1 .............................
[CV] .

[CV] .................. eps=0.001, max_iter=1000, n_alphas=50 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=50 ...........................
[CV] .................. eps=0.001, max_iter=1000, n_alphas=50 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=50 ...........................
[CV] .................. eps=0.001, max_iter=1000, n_alphas=50 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=50 ...........................
[CV] .................. eps=0.001, max_iter=1000, n_alphas=50 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=50 ...........................
[CV] .................. eps=0.001, max_iter=1000, n_alphas=50 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=100 ..........................
[CV] ................. eps=0.001, max_iter=1000, n_alphas=100 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=100 ..........................
[CV] ................. eps=0.001, max_iter=1000, n_alphas=100 -   0.0s
[CV] eps=0.001, max_iter=1000, n_alphas=100 ..........................
[CV] .

[CV] ................. eps=0.001, max_iter=1500, n_alphas=150 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=150 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=150 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=150 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=150 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=200 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=200 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=200 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=200 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=200 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=200 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=200 ..........................
[CV] ................. eps=0.001, max_iter=1500, n_alphas=200 -   0.0s
[CV] eps=0.001, max_iter=1500, n_alphas=200 ..........................
[CV] .

[CV] ................. eps=0.001, max_iter=2000, n_alphas=500 -   0.1s
[CV] eps=0.001, max_iter=2000, n_alphas=500 ..........................
[CV] ................. eps=0.001, max_iter=2000, n_alphas=500 -   0.1s
[CV] eps=0.001, max_iter=2000, n_alphas=500 ..........................
[CV] ................. eps=0.001, max_iter=2000, n_alphas=500 -   0.1s
[CV] eps=0.001, max_iter=2000, n_alphas=500 ..........................
[CV] ................. eps=0.001, max_iter=2000, n_alphas=500 -   0.1s
[CV] eps=0.001, max_iter=2000, n_alphas=1000 .........................
[CV] ................ eps=0.001, max_iter=2000, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=2000, n_alphas=1000 .........................
[CV] ................ eps=0.001, max_iter=2000, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=2000, n_alphas=1000 .........................
[CV] ................ eps=0.001, max_iter=2000, n_alphas=1000 -   0.2s
[CV] eps=0.001, max_iter=2000, n_alphas=1000 .........................
[CV] .

[CV] ................ eps=0.001, max_iter=3000, n_alphas=1000 -   0.2s
[CV] eps=0.01, max_iter=100, n_alphas=1 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=1 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=1 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=1 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=1 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=5 ..............................
[CV] ..................... eps=0.01, max_iter=100, n_alphas=5 -   0.0s
[CV] eps=0.01, max_iter=100, n_alphas=5 ..............................
[CV] .

[CV] .................... eps=0.01, max_iter=500, n_alphas=50 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=50 .............................
[CV] .................... eps=0.01, max_iter=500, n_alphas=50 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=50 .............................
[CV] .................... eps=0.01, max_iter=500, n_alphas=50 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=100 ............................
[CV] ................... eps=0.01, max_iter=500, n_alphas=100 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=100 ............................
[CV] ................... eps=0.01, max_iter=500, n_alphas=100 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=100 ............................
[CV] ................... eps=0.01, max_iter=500, n_alphas=100 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=100 ............................
[CV] ................... eps=0.01, max_iter=500, n_alphas=100 -   0.0s
[CV] eps=0.01, max_iter=500, n_alphas=100 ............................
[CV] .

[CV] .................. eps=0.01, max_iter=1000, n_alphas=150 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=150 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=150 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=150 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=150 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=150 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=150 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=150 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=150 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=200 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=200 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=200 ...........................
[CV] .................. eps=0.01, max_iter=1000, n_alphas=200 -   0.0s
[CV] eps=0.01, max_iter=1000, n_alphas=200 ...........................
[CV] .

[CV] .................. eps=0.01, max_iter=1500, n_alphas=500 -   0.1s
[CV] eps=0.01, max_iter=1500, n_alphas=500 ...........................
[CV] .................. eps=0.01, max_iter=1500, n_alphas=500 -   0.1s
[CV] eps=0.01, max_iter=1500, n_alphas=500 ...........................
[CV] .................. eps=0.01, max_iter=1500, n_alphas=500 -   0.1s
[CV] eps=0.01, max_iter=1500, n_alphas=500 ...........................
[CV] .................. eps=0.01, max_iter=1500, n_alphas=500 -   0.1s
[CV] eps=0.01, max_iter=1500, n_alphas=500 ...........................
[CV] .................. eps=0.01, max_iter=1500, n_alphas=500 -   0.1s
[CV] eps=0.01, max_iter=1500, n_alphas=1000 ..........................
[CV] ................. eps=0.01, max_iter=1500, n_alphas=1000 -   0.2s
[CV] eps=0.01, max_iter=1500, n_alphas=1000 ..........................
[CV] ................. eps=0.01, max_iter=1500, n_alphas=1000 -   0.3s
[CV] eps=0.01, max_iter=1500, n_alphas=1000 ..........................
[CV] .

[CV] ................. eps=0.01, max_iter=2000, n_alphas=1000 -   0.2s
[CV] eps=0.01, max_iter=3000, n_alphas=1 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=1 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=1 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=1 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=1 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=1 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=5 .............................
[CV] .................... eps=0.01, max_iter=3000, n_alphas=5 -   0.0s
[CV] eps=0.01, max_iter=3000, n_alphas=5 .............................
[CV] .

[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed:   54.2s finished


In [494]:
lcv_model

LassoCV(alphas=None, copy_X=True, cv=None, eps=1e-05, fit_intercept=True,
    max_iter=100, n_alphas=10, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=0, selection='cyclic', tol=0.0001,
    verbose=False)

In [495]:
select = SelectFromModel(LassoCV(max_iter=100,n_alphas=1,eps=0.001),threshold=0.04)

select.fit(train_data_x,train_data_y)

SelectFromModel(estimator=LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=100, n_alphas=1, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False),
        norm_order=1, prefit=False, threshold=0.04)

In [496]:
train_data_x_selected = select.transform(train_data_x)
train_data_x_selected.shape

(1835, 4)

In [497]:
test_data_x_selected = select.transform(test_data_x)

In [498]:
LogisticRegression().fit(train_data_x_selected,train_data_y).score(test_data_x_selected,test_data_y)

0.71641791044776115

In [97]:
rf = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [5,10,50,100,150,500,1000],
    'max_depth': [1,2,5,10,15,50,100]
}
grid_rf = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, verbose=0)
grid_rf.fit(train_data_x_selected, train_data_y)

rf_model = grid_rf.best_estimator_
rf_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Feature selection for RF
- univariate
- SelectFromModel SVM
- RFE(CV)

In [499]:
model = LogisticRegression()
rfe = RFE(model, 9)
fit = rfe.fit(train_data_x, train_data_y)
print("Num Features: "+ str(fit.n_features_))
print("Selected Features: " + str(fit.support_))
print("Feature Ranking: " + str(fit.ranking_))

Num Features: 9
Selected Features: [False False False False  True  True  True  True False  True False False
  True False False  True False False  True  True]
Feature Ranking: [ 4  8 10  6  1  1  1  1 12  1  5  3  1 11  9  1  7  2  1  1]


In [500]:
train_data_x_selected = fit.transform(train_data_x)
test_data_x_selected = fit.transform(test_data_x)

In [501]:
model.fit(train_data_x_selected,train_data_y).score(test_data_x_selected,test_data_y)

0.70149253731343286

In [39]:
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select_rf = SelectPercentile(percentile=i)
    select_rf.fit(train_data_x, train_data_y)
    
    train_data_x_selected = select_rf.transform(train_data_x)
    test_data_x_selected = select_rf.transform(test_data_x)
    
    mask = select_rf.get_support()
#     print(mask)
    rf = RandomForestClassifier(n_estimators = 1000, max_depth=10,random_state=0)
    rf.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal rf: {}".format(rf.score(train_data_x,train_data_y)))
    
    rf.fit(train_data_x_selected,train_data_y)
    print("feature selection rf: {}".format(rf.score(train_data_x_selected,train_data_y)))    



Which percentile : 10
normal rf: 0.9979757085020243
feature selection rf: 0.9402834008097166

Which percentile : 15
normal rf: 0.9979757085020243
feature selection rf: 0.9498987854251012

Which percentile : 20
normal rf: 0.9979757085020243
feature selection rf: 0.9772267206477733

Which percentile : 25
normal rf: 0.9979757085020243
feature selection rf: 0.9832995951417004

Which percentile : 30
normal rf: 0.9979757085020243
feature selection rf: 0.9848178137651822

Which percentile : 35
normal rf: 0.9979757085020243
feature selection rf: 0.9838056680161943

Which percentile : 40
normal rf: 0.9979757085020243
feature selection rf: 0.9812753036437247

Which percentile : 45
normal rf: 0.9979757085020243
feature selection rf: 0.9838056680161943

Which percentile : 50
normal rf: 0.9979757085020243
feature selection rf: 0.9888663967611336

Which percentile : 55
normal rf: 0.9979757085020243
feature selection rf: 0.9888663967611336

Which percentile : 60
normal rf: 0.9979757085020243
feature

In [41]:
# based on the output of the univariate, we can narrow to 60, 80
select_70_rf = SelectPercentile(percentile=100)

In [42]:
select_70_rf.fit(train_data_x, train_data_y)

train_data_x_selected_70_rf = select_70_rf.transform(train_data_x)
test_data_x_selected_70_rf = select_70_rf.transform(test_data_x)

mask = select_70_rf.get_support()        
# print(mask)
rf_70 = RandomForestClassifier(n_estimators = 1000, max_depth=10,random_state=0,warm_start=True)
rf_70.fit(train_data_x_selected_70_rf,train_data_y)

rf_70.score(test_data_x_selected_70_rf,test_data_y)

0.83033362598770855

In [104]:
rf_70.predict(test_data_x_selected_70_rf)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [116]:
select_80_rf.fit(train_data_x, train_data_y)

train_data_x_selected_80_rf = select_80_rf.transform(train_data_x)
test_data_x_selected_80_rf = select_80_rf.transform(test_data_x)

mask = select_80_rf.get_support()        
# print(mask)
rf_80 = RandomForestClassifier(n_estimators = 1000, max_depth=10,random_state=0,warm_start=True)
rf_80.fit(train_data_x_selected_80_rf,train_data_y)

rf_80.score(test_data_x_selected_80_rf,test_data_y)

0.83582089552238803

In [110]:
rf_80.predict(test_data_x_selected_80_rf)[:67]

array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [115]:
Counter(rf_80.predict(test_data_x_selected_80_rf)[:67])

Counter({0: 11, 1: 56})

In [114]:
Counter(rf_70.predict(test_data_x_selected_70_rf)[:67])

Counter({0: 12, 1: 55})

In [123]:
## RFE CV from LR
# for i in [1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20]:
model_rfe = LogisticRegression()
rfe = RFECV(model_rfe, step=1, cv=5)
fit = rfe.fit(train_data_x, train_data_y)
print("Num Features: "+ str(fit.n_features_))
print("Selected Features: " + str(fit.support_))
print("Feature Ranking: " + str(fit.ranking_))

train_data_x_selected_rfe = fit.transform(train_data_x)
test_data_x_selected_rfe = fit.transform(test_data_x)

Num Features: 8
Selected Features: [False False  True  True False False  True  True False False False  True
 False False False  True  True False  True False]
Feature Ranking: [13  5  1  1  6  4  1  1  3  2 11  1 12  9 10  1  1  8  1  7]


In [124]:
rf_model = RandomForestClassifier(n_estimators = 1000, max_depth=10,random_state=0,warm_start=False)

In [125]:
rf_model.fit(train_data_x_selected_rfe,train_data_y).score(test_data_x_selected_rfe,test_data_y)

0.79850746268656714

## SVM

In [40]:
## SVM
# univariate selection
percentile_list = [10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
for i in percentile_list:
    select = SelectPercentile(percentile=i)

    select.fit(train_data_x, train_data_y)

    train_data_x_selected = select.transform(train_data_x)
    test_data_x_selected = select.transform(test_data_x)

    mask = select.get_support()    
#     print(mask)
    svm = SVC(probability=True)
    svm.fit(train_data_x,train_data_y)
    
    print("\nWhich percentile : " + str(i))
    print("normal svm: {}".format(svm.score(train_data_x,train_data_y)))

    svm.fit(train_data_x_selected,train_data_y)
    print("feature selection svm: {}".format(svm.score(train_data_x_selected,train_data_y)))
#     print(svm.predict(test_data_x_selected)[:67])
#     print(svm.predict_proba(test_data_x_selected)[0:10])

    
# 2o percentile is the best FE for logistics regression


Which percentile : 10
normal svm: 0.8674089068825911
feature selection svm: 0.8102226720647774

Which percentile : 15
normal svm: 0.8674089068825911
feature selection svm: 0.8076923076923077

Which percentile : 20
normal svm: 0.8674089068825911
feature selection svm: 0.8097165991902834

Which percentile : 25
normal svm: 0.8674089068825911
feature selection svm: 0.8279352226720648

Which percentile : 30
normal svm: 0.8674089068825911
feature selection svm: 0.8269230769230769

Which percentile : 35
normal svm: 0.8674089068825911
feature selection svm: 0.8248987854251012

Which percentile : 40
normal svm: 0.8674089068825911
feature selection svm: 0.819838056680162

Which percentile : 45
normal svm: 0.8674089068825911
feature selection svm: 0.8168016194331984

Which percentile : 50
normal svm: 0.8674089068825911
feature selection svm: 0.8309716599190283

Which percentile : 55
normal svm: 0.8674089068825911
feature selection svm: 0.8289473684210527

Which percentile : 60
normal svm: 0.8674

In [43]:
# based on the output of the univariate, we can narrow to 100
select_100_svm = SelectPercentile(percentile=100)

select_100_svm.fit(train_data_x, train_data_y)

train_data_x_selected_100_svm = select_100_svm.transform(train_data_x)
test_data_x_selected_100_svm = select_100_svm.transform(test_data_x)

mask = select_100_svm.get_support()        
# print(mask)
svm_100 = SVC(probability=True)
svm_100.fit(train_data_x_selected_100_svm,train_data_y)

svm_100.score(test_data_x_selected_100_svm,test_data_y)

svm_100.predict(test_data_x_selected_100_svm)[:67]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [132]:
rf_fi = RandomForestClassifier(n_estimators = 1000, max_depth=10,random_state=0)

In [133]:
rf_fi_values = (
    pd.DataFrame(rf_fi.fit(train_data_x,train_data_y).feature_importances_,index=train_data_x.columns)
    .rename(columns={0:"feature_importance_values"})
    .reset_index()
    .rename(columns={"index":"features"})
    .sort_values(['feature_importance_values'],ascending=False)
    .pipe(lambda x:x.assign(fi_cumsum = x.feature_importance_values.cumsum()))
    .query("fi_cumsum <= 0.95")
)

In [136]:
rf_fi_values.features.unique()

array(['diff_win_rate_home', 'diff_is_playoff', 'diff_win_rate_post',
       'diff_avg_win_score_by', 'diff_total_def_rebounds', 'diff_seed',
       'diff_num_season', 'diff_win_rate_away', 'diff_win_rate_regular',
       'diff_total_assists', 'diff_avg_lose_score_by', 'diff_fgp',
       'diff_total_off_rebounds', 'diff_total_block_opp_FGA_percent',
       'diff_total_steals', 'diff_total_turnover',
       'diff_total_personalfoul'], dtype=object)

In [148]:
svm_train_data_x = train_df[['diff_win_rate_home', 'diff_is_playoff', 'diff_win_rate_post',
       'diff_avg_win_score_by', 'diff_total_def_rebounds', 'diff_seed',
       'diff_num_season', 'diff_win_rate_away', 'diff_win_rate_regular',
       'diff_total_assists', 'diff_avg_lose_score_by', 'diff_fgp',
       'diff_total_off_rebounds', 'diff_total_block_opp_FGA_percent',
       'diff_total_steals', 'diff_total_turnover',
       'diff_total_personalfoul']]
svm_train_data_y = train_df[['outcome']]

svm_test_data_x = test_df[['diff_win_rate_home', 'diff_is_playoff', 'diff_win_rate_post',
       'diff_avg_win_score_by', 'diff_total_def_rebounds', 'diff_seed',
       'diff_num_season', 'diff_win_rate_away', 'diff_win_rate_regular',
       'diff_total_assists', 'diff_avg_lose_score_by', 'diff_fgp',
       'diff_total_off_rebounds', 'diff_total_block_opp_FGA_percent',
       'diff_total_steals', 'diff_total_turnover',
       'diff_total_personalfoul']]
svm_test_data_y = test_df[['outcome']]



In [149]:
svm_fs = SVC()

In [150]:
svm_fs.fit(svm_train_data_x,svm_train_data_y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [151]:
svm_fs.score(svm_test_data_x,svm_test_data_y)

0.89552238805970152

In [168]:
svm_fs_df = pd.DataFrame(svm_100.predict(test_data_x_selected_100_svm)[:67]).rename(columns={0:"svm_100"})

In [169]:
log_rf_fs_df = pd.DataFrame(LogisticRegression().fit(train_data_x_selected,train_data_y).predict(test_data_x_selected)[:67]).rename(columns={0:"log_rf_fs_df"})

In [170]:
rf_70_df = pd.DataFrame(rf_70.predict(test_data_x_selected_70_rf)[:67]).rename(columns={0:"rf_70_fs"})

In [171]:
rf_80_df = pd.DataFrame(rf_80.predict(test_data_x_selected_80_rf)[:67]).rename(columns={0:"rf_80_fs"})

In [172]:
rf_rfe_df = pd.DataFrame(rf_model.fit(train_data_x_selected_rfe,train_data_y).predict(test_data_x_selected_rfe)[:67]).rename(columns={0:"rf_rfe"})

In [173]:
log_85_df = pd.DataFrame(logreg_85.predict(test_data_x_selected_85)[:67]).rename(columns={0:"log_85_fs"})


In [174]:
log_90_df = pd.DataFrame(logreg_90.predict(test_data_x_selected_90)[:67]).rename(columns={0:"log_90_fs"})

In [175]:
(
    svm_fs_df
    .merge(log_rf_fs_df,how='outer', left_index=True, right_index=True)
    .merge(rf_70_df,how='outer',left_index=True, right_index=True)
    .merge(rf_80_df,how='outer', left_index=True, right_index=True)
    .merge(rf_rfe_df,how='outer', left_index=True, right_index=True)
    .merge(log_85_df,how='outer', left_index=True, right_index=True)
    .merge(log_90_df,how='outer', left_index=True, right_index=True)
).to_csv("output/final_results_static_year_improved.csv",index=False)

In [61]:
log_reg_df = pd.DataFrame(logreg_25.predict_proba(test_data_x_selected_25)[:,1]).rename(columns={0:"LR"})

In [62]:
lr_rf_df = pd.DataFrame(LogisticRegression().fit(train_data_x_selected,train_data_y).predict_proba(test_data_x_selected)[:,1]).rename(columns={0:"LR_rf"})

In [63]:
rf_df = pd.DataFrame(rf_70.predict_proba(test_data_x_selected_70_rf)[:,1]).rename(columns={0:"rf"})

In [64]:
svm_df = pd.DataFrame(svm_100.predict_proba(test_data_x_selected_100_svm)[:,1]).rename(columns={0:"svm"})

In [74]:
results_output_df = (
    log_reg_df
    .merge(lr_rf_df,how='outer', left_index=True, right_index=True)
    .merge(rf_df,how='outer',left_index=True, right_index=True)
    .merge(svm_df,how='outer', left_index=True, right_index=True)
)

results_output_df.to_csv("2018_results.csv",index=False)

In [67]:
results_output_df.head()

Unnamed: 0,LR,LR_rf,rf,svm
0,0.507854,0.546515,0.559954,0.850039
1,0.93028,0.945529,0.933397,0.985287
2,0.512841,0.592597,0.533842,0.879097
3,0.702561,0.791228,0.733557,0.909262
4,0.957439,0.957348,0.841198,0.964839


In [72]:
seeding_data_2018_submission = (
    seeding_data_2018
    .pipe(lambda x:x.assign(submission_column = x.Season.astype(str) + "_" + x.WTeamID.astype(str) + "_" + x.LTeamID.astype(str)))
)

In [71]:
seeding_data_2018.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed
0,2018,1437,1,1345,2
1,2018,1437,1,1403,3
2,2018,1437,1,1455,4
3,2018,1437,1,1452,5
4,2018,1437,1,1196,6


In [73]:
seeding_data_2018_submission.head()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,submission_column
0,2018,1437,1,1345,2,2018_1437_1345
1,2018,1437,1,1403,3,2018_1437_1403
2,2018,1437,1,1455,4,2018_1437_1455
3,2018,1437,1,1452,5,2018_1437_1452
4,2018,1437,1,1196,6,2018_1437_1196


In [76]:
test_df.tail()

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,outcome,total_off_rebounds_x,total_def_rebounds_x,total_assists_x,...,diff_num_season,diff_is_playoff,diff_is_champion,diff_fgp,diff_total_block_opp_FGA_percent,diff_win_rate_away,diff_win_rate_home,diff_win_rate_neutral,diff_win_rate_post,diff_win_rate_regular
6675,2018,1411,16,1355,12,4,0,0.372549,0.824908,0.450085,...,-0.472574,-0.21875,0.0,0.017973,-0.026266,0.285714,0.413333,-0.285714,-0.25,0.050728
6676,2018,1411,16,1422,13,3,0,0.586057,0.681874,0.531303,...,-0.333142,-0.25,0.0,-0.003312,0.013855,0.174603,0.364211,0.0,-0.25,-0.126772
6677,2018,1411,16,1285,14,2,0,0.59695,0.706535,0.472081,...,-0.413502,-0.25,0.0,0.029288,0.000413,0.285714,0.346667,-0.2,-0.25,0.028517
6678,2018,1411,16,1252,15,1,0,0.431373,0.717633,0.532995,...,-0.324895,-0.25,0.0,0.013532,-0.013078,0.119048,0.2425,0.0,-0.25,-0.135897
6679,2018,1411,16,1300,16,0,0,0.542484,0.707768,0.57868,...,-0.265823,-0.1875,0.0,0.018323,-0.002119,-0.114286,0.089091,0.0,-0.25,-0.041904
