___
This notebook selects the features that are important before feeding them into the model.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_selection import *
from sklearn import *
from scipy import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, SelectFromModel, SelectPercentile,RFECV
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from aggregate_function import build_features_table, combine_features_table, coach_stats, win_rate_type_of_location



In [2]:
# Input CSV locations, relative to the notebook's working directory.

# Game results: detailed regular-season box scores and compact tournament results.
regularseason_file = 'data/DataFiles/RegularSeasonDetailedResults.csv'
postseason_file = 'data/DataFiles/NCAATourneyCompactResults.csv'

# Team coaches file.
coach_file = 'data/DataFiles/TeamCoaches.csv'

In [3]:
# Build the individual feature sources from the raw CSVs using the project-local
# `aggregate_function` helpers.
# NOTE(review): what each builder computes is not visible in this notebook; the
# class names suggest season box-score aggregates, win rates by game location,
# and coach statistics -- confirm against the aggregate_function package.
initial_features = build_features_table.BuildFeaturesTable(regularseason_file)
win_rate_features = win_rate_type_of_location.WinRateTypeLocation(regularseason_file)
coach_features = coach_stats.CoachStats(coach_file,regularseason_file,postseason_file)

# Combine the three sources; later cells read the combined frame from
# `features.final_table`.
features = combine_features_table.CombineFeaturesTable(initial_features,win_rate_features,coach_features)

## Data Transformation for recency data
- Each statistic is blended with the same team's previous season using a flat weighting of:
    - 85% to current year
    - 15% to previous year

In [5]:
# Take the combined feature frame from the builder object.
features_table = features.final_table

# Preview the first rows to sanity-check the columns before transforming them.
features_table.head()

Unnamed: 0,Season,TeamID,win_rate,total_score,total_opponent_score,fgp,fg3p,ftp,total_rebounds,total_off_rebounds,...,avg_win_score_by,win_rate_away,win_rate_home,win_rate_neutral,num_season,is_playoff,is_champion,win_rate_post,win_rate_regular,win_rate_overall
0,2014,1101,0.095238,1326.0,1651.0,0.405508,0.373333,0.746067,595.0,168.0,...,3.5,0.0,0.125,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2015,1101,0.25,1708.0,2012.0,0.404858,0.378026,0.727924,781.0,231.0,...,12.142857,0.142857,0.222222,0.666667,2.0,0.0,0.0,0.0,0.0,0.0
2,2016,1101,0.333333,1886.0,2059.0,0.441621,0.363458,0.706985,829.0,221.0,...,7.555556,0.285714,0.333333,0.5,3.0,0.0,0.0,0.0,0.0,0.0
3,2017,1101,0.36,1697.0,1816.0,0.458365,0.371069,0.642241,761.0,189.0,...,4.666667,0.444444,0.3125,0.0,4.0,0.0,0.0,0.0,0.0,0.0
4,2003,1102,0.428571,1603.0,1596.0,0.481149,0.375643,0.651357,588.0,117.0,...,15.583333,0.428571,0.473684,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Statistics that are blended with the previous row's value for the same team.
RECENCY_COLUMNS = [
    'win_rate',
    'total_score',
    'total_opponent_score',
    'fgp',
    'fg3p',
    'ftp',
    'total_rebounds',
    'total_off_rebounds',
    'total_def_rebounds',
    'total_off_rebounds_percent',
    'total_def_rebounds_percent',
    'total_rebound_possession_percent',
    'total_rebound_possessiongain_percent',
    'total_blocks',
    'total_assists',
    'total_steals',
    'total_turnover',
    'total_personalfoul',
    'total_block_opp_FGA_percent',
    'total_assist_per_fgm',
    'total_assist_turnover_ratio',
    'expectation_per_game',
    'avg_lose_score_by',
    'avg_win_score_by',
    'win_rate_away',
    'win_rate_home',
    'win_rate_neutral',
]


def add_recency_weighted_features(table, columns, current_weight=0.85, previous_weight=0.15):
    """Return a copy of ``table`` with previous-row and recency-weighted columns.

    Added columns:

    * ``shifted_team``    -- the ``TeamID`` of the previous row.
    * ``shifted_<name>``  -- the previous row's value of ``<name>``.
    * ``weighted_<name>`` -- ``current_weight * <name> + previous_weight * shifted_<name>``
      when the previous row belongs to the same team and its value is not
      missing; otherwise the current value is used unchanged.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain ``TeamID`` and every column named in ``columns``.
    columns : list of str
        Names of the statistics to shift and blend.
    current_weight, previous_weight : float
        Weights for the current and the previous row (85% / 15% by default).

    Returns
    -------
    pandas.DataFrame
        ``table`` plus the new columns; the input frame is not modified.

    Notes
    -----
    The "previous season" is taken to be the previous row, so rows are assumed
    to be ordered by ``TeamID`` and then ``Season`` -- TODO confirm that the
    upstream table guarantees this ordering.
    """
    previous = table[columns].shift(1)
    shifted_team = table['TeamID'].shift(1)
    # False on each team's first row (and on the very first row, where the
    # shifted TeamID is NaN), so another team's stats never leak into the blend.
    same_team = shifted_team == table['TeamID']

    new_columns = {'shifted_team': shifted_team}
    for name in columns:
        new_columns['shifted_' + name] = previous[name]

    for name in columns:
        has_previous = same_team & previous[name].notna()
        blended = current_weight * table[name] + previous_weight * previous[name]
        # np.where needs both branches: fall back to the current value when
        # there is no usable previous row for this team.
        new_columns['weighted_' + name] = np.where(has_previous, blended, table[name])

    return table.assign(**new_columns)


features_table = features.final_table.pipe(add_recency_weighted_features, RECENCY_COLUMNS)

# Inspect the resulting column dtypes.
features_table.dtypes

Season                                    int64
TeamID                                    int64
win_rate                                float64
total_score                             float64
total_opponent_score                    float64
fgp                                     float64
fg3p                                    float64
ftp                                     float64
total_rebounds                          float64
total_off_rebounds                      float64
total_def_rebounds                      float64
total_off_rebounds_percent              float64
total_def_rebounds_percent              float64
total_rebound_possession_percent        float64
total_rebound_possessiongain_percent    float64
total_blocks                            float64
total_assists                           float64
total_steals                            float64
total_turnover                          float64
total_personalfoul                      float64
total_block_opp_FGA_percent             