## Forward feature selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import TimeSeriesSplit as tscv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs

In [2]:
def result_to_numeric(x):
    """Encode a win/loss label as an integer.

    Parameters
    ----------
    x : str or number
        'W'/'w' for a win, 'L'/'l' for a loss, an already-encoded 1/0, or the
        float sentinel 0.633047 (treated as a win).

    Returns
    -------
    int or None
        1 for a win, 0 for a loss, None for any unrecognised value.
    """
    # String labels: compare case-insensitively.
    if isinstance(x, str):
        label = x.lower()
        if label == 'w':
            return 1
        if label == 'l':
            return 0
        return None

    try:
        value = float(x)
    except (TypeError, ValueError):
        return None

    # NOTE(review): 0.633047 is presumably a standardised encoding of a win in
    # one of the CSVs -- confirm. It is matched with a small tolerance because
    # exact float equality breaks as soon as the file stores more digits.
    if abs(value - 0.633047) < 1e-6 or value == 1.0:
        return 1
    # Already-encoded labels pass through unchanged, so re-running the
    # encoding cell is idempotent instead of turning every label into None.
    if value == 0.0:
        return 0
    return None

In [None]:
# Load the base dataset, discard the PtDiff column and encode the WL label.
df = pd.read_csv('finalDataset.csv', sep=',').drop(columns=['PtDiff'])
df['WL'] = df['WL'].apply(result_to_numeric)

# Columns 3 through 11 are the candidate features; the last column is the label.
# NOTE(review): presumably the first three columns are identifiers -- confirm against the CSV.
# With shuffle=False the rows keep their order: the first 80% train, the last 20% test
# (random_state has no effect without shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df.values[:, 3:12],
    df.values[:, -1],
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
y_train, y_test = y_train.astype('int'), y_test.astype('int')

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)
print(df['WL'].value_counts())

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [3]:
# Base estimator shared by every selector and final fit below: logistic
# regression that tunes its regularisation on 5 time-ordered CV splits.
clf = LogisticRegressionCV(
    cv=tscv(n_splits=5),
    solver='liblinear',
    random_state=42,
)

In [None]:
# Exhaustive search: evaluates every feature subset of size 1 to 9.
efs1 = efs(
    clf,
    min_features=1,
    max_features=9,
    scoring='accuracy',
    print_progress=True,
    cv=tscv(n_splits=5),
)

# Sequential selection (mlxtend defaults: forward, non-floating); 'best'
# keeps the subset size with the highest cross-validated accuracy.
sfs1 = sfs(
    clf,
    k_features='best',
    scoring='accuracy',
    verbose=2,
    cv=tscv(n_splits=5),
)

# Run the exhaustive search first, then the sequential one, on the same training rows.
efs1 = efs1.fit(X_train, y_train)
sfs1 = sfs1.fit(X_train, y_train)

In [None]:
# Column indices of the best subset found by the exhaustive selector efs1.
efs1.best_idx_

In [None]:
# Cross-validated accuracy of the best subset found by the exhaustive selector efs1.
efs1.best_score_

In [None]:
# Column indices chosen by the sequential selector sfs1.
sfs1.k_feature_idx_

In [None]:
# Cross-validated accuracy of the subset chosen by the sequential selector sfs1.
sfs1.k_score_

In [None]:
# Refit the logistic regression on the columns picked by sfs1 and predict the test rows.
sfs1_cols = list(sfs1.k_feature_idx_)
clf.fit(X_train[:, sfs1_cols], y_train)
y_sfs_pred = clf.predict(X_test[:, sfs1_cols])

In [None]:
# Test accuracy of the sfs1 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test, y_pred=y_sfs_pred)

In [None]:
# Refit the logistic regression on the columns picked by efs1 and predict the test rows.
efs1_cols = list(efs1.best_idx_)
clf.fit(X_train[:, efs1_cols], y_train)
y_efs_pred = clf.predict(X_test[:, efs1_cols])

In [None]:
# Test accuracy of the efs1 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test, y_pred=y_efs_pred)

In [None]:
# Baseline: fit on every candidate feature and score on the test rows.
clf.fit(X_train, y_train)
y_all_pred = clf.predict(X_test)
acc(y_true=y_test, y_pred=y_all_pred)

In [4]:
# Load the from-1979 dataset, keep only the stat-difference columns plus the
# label, and encode the WL label numerically.
read_df = (
    pd.read_csv('finalDatasetFrom79.csv', sep=',')
    .drop(columns=['Year', 'TeamA', 'TeamB', 'PtDiff'])
)
read_df['WL'] = read_df['WL'].apply(result_to_numeric)
print(read_df.head())

   ORebDiff  DRebDiff  AsstDiff    PFDiff   STLDiff    TODiff   BlkDiff  \
0 -1.414634  0.353659  1.951220 -3.439024  0.548780  2.317073 -0.146341   
1  1.036585  1.731707  1.768293 -1.109756  0.512195 -0.682927 -0.536585   
2  2.939024 -3.634146 -2.158537 -2.146341  0.134146 -0.292683  0.487805   
3 -1.792683 -1.073171  0.304878 -0.402439  3.195122  4.000000  2.548780   
4  0.170732  2.378049  1.585366 -0.841463 -1.634146  0.121951  2.463415   

   x3PMDiff   FGPDiff   FTPDiff  WL  
0  0.524390  0.015133  0.030245   1  
1  0.402439 -0.005718  0.025008   1  
2  0.634146 -0.018197 -0.034511   1  
3 -0.560976  0.033874  0.013886   1  
4 -0.585366  0.035610  0.002423   1  


In [7]:
# Standardise every column, then put the unscaled 0/1 label back in place.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test-row statistics leak into the features -- confirm this is acceptable.
ss = StandardScaler()
scaled_values = ss.fit_transform(read_df)
df = pd.DataFrame(scaled_values, columns=read_df.columns).assign(WL=read_df['WL'])
print(df.head())

   ORebDiff  DRebDiff  AsstDiff    PFDiff   STLDiff    TODiff   BlkDiff  \
0 -0.679655 -0.093778  0.485930 -1.183177  0.402658  1.621999 -0.181793   
1  0.589625  0.600004  0.411367 -0.216360  0.377265 -0.279793 -0.464532   
2  1.574738 -2.101447 -1.189259 -0.646619  0.114875 -0.032405  0.277659   
3 -0.875415 -0.812118 -0.185140  0.077228  2.239388  2.688858  1.770879   
4  0.141272  0.925406  0.336803 -0.104999 -1.112433  0.230444  1.709029   

   x3PMDiff   FGPDiff   FTPDiff  WL  
0  0.244336  0.359709  0.768017   1  
1  0.165348 -0.754864  0.632071   1  
2  0.315424 -1.421955 -0.912959   1  
3 -0.458650  1.361487  0.343370   1  
4 -0.474447  1.454279  0.045796   1  


In [None]:
# Ordered 75/25 split: every column but the last is a feature, the last is the label.
# shuffle=False keeps row order, so the final quarter of the rows is the test set.
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
    df.values[:, :-1],
    df.values[:, -1],
    test_size=0.25,
    random_state=42,
    shuffle=False,
)
y_train_large, y_test_large = y_train_large.astype('int'), y_test_large.astype('int')

print('Training dataset shape:', X_train_large.shape, y_train_large.shape)

In [8]:
# Use every row for feature selection (no hold-out): all columns but the last
# are features, the last is the label.
# NOTE(review): this rebinds X_train_large / y_train_large; if a train/test
# split cell ran before it, the training set now also contains the test rows.
X_train_large = df.values[:, :-1]
y_train_large = df.values[:, -1].astype('int')

print('Training dataset shape:', X_train_large.shape, y_train_large.shape)

Training dataset shape: (374, 10) (374,)


In [22]:
# Exhaustive search: evaluates every feature subset of size 1 to 10.
efs2 = efs(
    clf,
    min_features=1,
    max_features=10,
    scoring='accuracy',
    print_progress=True,
    cv=tscv(n_splits=5),
)

# Sequential selection (mlxtend defaults: forward, non-floating); 'best'
# keeps the subset size with the highest cross-validated accuracy.
sfs2 = sfs(
    clf,
    k_features='best',
    scoring='accuracy',
    verbose=2,
    cv=tscv(n_splits=5),
)

# Run the exhaustive search first, then the sequential one, on the same rows.
efs2 = efs2.fit(X_train_large, y_train_large)
sfs2 = sfs2.fit(X_train_large, y_train_large)

Features: 638/638[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.0s finished

[2019-10-28 15:15:28] Features: 1/10 -- score: 0.7451612903225806[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s finished

[2019-10-28 15:15:29] Features: 2/10 -- score: 0.7419354838709676[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.8s finished

[2019-10-28 15:15:29] Features: 3/10 -- score: 0.732258064516129[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_

In [17]:
# Cross-validated accuracy of the best subset found by the exhaustive selector efs2.
efs2.best_score_

0.7419354838709677

In [18]:
# Cross-validated accuracy of the subset chosen by the sequential selector sfs2.
sfs2.k_score_

0.7451612903225806

In [19]:
# Column indices of the best subset found by the exhaustive selector efs2.
efs2.best_idx_

(1, 5, 6)

In [20]:
# Column indices chosen by the sequential selector sfs2.
sfs2.k_feature_idx_

(1,)

In [None]:
# Refit the logistic regression on the columns picked by sfs2 and predict the test rows.
sfs2_cols = list(sfs2.k_feature_idx_)
clf.fit(X_train_large[:, sfs2_cols], y_train_large)
y_sfs_pred_large = clf.predict(X_test_large[:, sfs2_cols])

In [None]:
# Test accuracy of the sfs2 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test_large, y_pred=y_sfs_pred_large)

In [None]:
# Refit the logistic regression on the columns picked by efs2 and predict the test rows.
efs2_cols = list(efs2.best_idx_)
clf.fit(X_train_large[:, efs2_cols], y_train_large)
y_efs_pred_large = clf.predict(X_test_large[:, efs2_cols])

In [None]:
# Test accuracy of the efs2 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test_large, y_pred=y_efs_pred_large)

In [None]:
# Baseline: fit on every feature of the large dataset and score on the test rows.
clf.fit(X_train_large, y_train_large)
y_all_pred_large = clf.predict(X_test_large)
acc(y_true=y_test_large, y_pred=y_all_pred_large)

In [49]:
# Load the pruned complete dataset, drop the identifier columns and encode the WL label.
df = (
    pd.read_csv('completeFinalDatasetFrom79_Pruned.csv', sep=',')
    .drop(columns=['Year', 'TeamA', 'TeamB'])
)
df['WL'] = df['WL'].apply(result_to_numeric)
print(df.head())
print(df.columns)

   O_ORebDiff  O_DRebDiff  O_AsstDiff  O_PFDiff  O_STLDiff  O_TODiff  \
0   -0.852744    0.220118    0.725228 -1.624636   0.502332  1.219331   
1    0.624856    1.077818    0.657237 -0.524262   0.468844 -0.359382   
2    1.771649   -2.261899   -0.802283 -1.013957   0.122792 -0.154021   
3   -1.080633   -0.667943    0.113317 -0.190117   2.924691  2.104951   
4    0.102917    1.480102    0.589247 -0.397517  -1.495834  0.064175   

   O_BlkDiff  O_x3PMDiff  O_FGPDiff  O_FTPDiff  D_ORebDiff  D_DRebDiff  \
0  -0.141657    0.239121   0.635920   1.072682    0.663973   -1.368963   
1  -0.519410    0.183512  -0.240265   0.886943    0.567872    0.354402   
2   0.472191    0.289170  -0.764677  -1.223977    0.366933   -1.077103   
3   2.467195   -0.255804   1.423435   0.492501    1.057116   -2.223696   
4   2.384562   -0.266926   1.496380   0.085936    0.838703   -1.424555   

   D_AsstDiff  D_PFDiff  D_BlkDiff  D_x3PMDiff  D_FTPDiff  WL  
0    1.177983 -0.134638  -0.576137    0.362318  -0.717179 

In [36]:
# Load the complete dataset, drop the identifier and point-difference columns
# and encode the WL label.
read_df = (
    pd.read_csv('completeFinalDatasetFrom79.csv', sep=',')
    .drop(columns=['Year', 'TeamA', 'TeamB', 'O_PtDiff', 'D_PtDiff'])
)
read_df['WL'] = read_df['WL'].apply(result_to_numeric)
print(read_df.head())
print(read_df.columns)

   O_ORebDiff  O_DRebDiff  O_AsstDiff  O_PFDiff  O_STLDiff  O_TODiff  \
0   -1.414634    0.353659    1.951220 -3.439024   0.548780  2.317073   
1    1.036585    1.731707    1.768293 -1.109756   0.512195 -0.682927   
2    2.939024   -3.634146   -2.158537 -2.146341   0.134146 -0.292683   
3   -1.792683   -1.073171    0.304878 -0.402439   3.195122  4.000000   
4    0.170732    2.378049    1.585366 -0.841463  -1.634146  0.121951   

   O_BlkDiff  O_x3PMDiff  O_FGPDiff  O_FTPDiff  ...  D_DRebDiff  D_AsstDiff  \
0  -0.146341    0.524390   0.015133   0.030245  ...   -2.402439    3.024390   
1  -0.536585    0.402439  -0.005718   0.025008  ...    0.621951    0.097561   
2   0.487805    0.634146  -0.018197  -0.034511  ...   -1.890244   -3.829268   
3   2.548780   -0.560976   0.033874   0.013886  ...   -3.902439   -0.378049   
4   2.463415   -0.585366   0.035610   0.002423  ...   -2.500000    3.634146   

   D_PFDiff  D_STLDiff  D_TODiff  D_BlkDiff  D_x3PMDiff  D_FGPDiff  D_FTPDiff  \
0 -0.256098

In [37]:
# Standardise every column, then put the unscaled 0/1 label back in place.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test-row statistics leak into the features -- confirm this is acceptable.
ss = StandardScaler()
scaled_values = ss.fit_transform(read_df)
df = pd.DataFrame(scaled_values, columns=read_df.columns).assign(WL=read_df['WL'])
print(df.head())

   O_ORebDiff  O_DRebDiff  O_AsstDiff  O_PFDiff  O_STLDiff  O_TODiff  \
0   -0.679655   -0.093778    0.485930 -1.183177   0.402658  1.621999   
1    0.589625    0.600004    0.411367 -0.216360   0.377265 -0.279793   
2    1.574738   -2.101447   -1.189259 -0.646619   0.114875 -0.032405   
3   -0.875415   -0.812118   -0.185140  0.077228   2.239388  2.688858   
4    0.141272    0.925406    0.336803 -0.104999  -1.112433  0.230444   

   O_BlkDiff  O_x3PMDiff  O_FGPDiff  O_FTPDiff  ...  D_DRebDiff  D_AsstDiff  \
0  -0.181793    0.244336   0.359709   0.768017  ...   -0.804917    1.469098   
1  -0.464532    0.165348  -0.754864   0.632071  ...    0.619116    0.184956   
2   0.277659    0.315424  -1.421955  -0.912959  ...   -0.563750   -1.537935   
3   1.770879   -0.458650   1.361487   0.343370  ...   -1.511191   -0.023717   
4   1.709029   -0.474447   1.454279   0.045796  ...   -0.850853    1.736628   

   D_PFDiff  D_STLDiff  D_TODiff  D_BlkDiff  D_x3PMDiff  D_FGPDiff  D_FTPDiff  \
0 -0.158705

In [50]:
# Ordered 70/30 split: every column but the last is a feature, the last is the label.
# NOTE(review): `df` is assigned by more than one earlier cell (the pruned CSV
# and the standardised frame); this split uses whichever ran last. The recorded
# 17-feature shapes correspond to the pruned CSV.
X_train_complete, X_test_complete, y_train_complete, y_test_complete = train_test_split(
    df.values[:, :-1],
    df.values[:, -1],
    test_size=0.30,
    random_state=42,
    shuffle=False,
)
y_train_complete, y_test_complete = (
    y_train_complete.astype('int'),
    y_test_complete.astype('int'),
)

print('Training dataset shape:', X_train_complete.shape, y_train_complete.shape)
print('Testing dataset shape:', X_test_complete.shape, y_test_complete.shape)
print(df['WL'].value_counts())

Training dataset shape: (261, 17) (261,)
Testing dataset shape: (113, 17) (113,)
1    267
0    107
Name: WL, dtype: int64


In [45]:
# Use every row for feature selection (no hold-out): all columns but the last
# are features, the last is the label.
# NOTE(review): this rebinds X_train_complete / y_train_complete; if the
# train/test split cell ran before it, the training set now also contains the
# test rows.
X_train_complete = df.values[:, :-1]
# Fix: cast this frame's own labels. The original cast `y_train_large`, the
# label vector of a different dataset, and silently discarded df's labels.
y_train_complete = df.values[:, -1].astype('int')

print('Training dataset shape:', X_train_complete.shape, y_train_complete.shape)

Training dataset shape: (374, 17) (374,)


In [51]:
# Sequential selection (mlxtend defaults: forward, non-floating) on the complete
# feature set; 'best' keeps the subset size with the highest cross-validated accuracy.
sfs3 = sfs(
    clf,
    k_features='best',
    scoring='accuracy',
    verbose=2,
    cv=tscv(n_splits=5),
)
sfs3 = sfs3.fit(X_train_complete, y_train_complete)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    1.6s finished

[2019-10-28 15:54:55] Features: 1/17 -- score: 0.7441860465116279[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    1.4s finished

[2019-10-28 15:54:56] Features: 2/17 -- score: 0.7488372093023256[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.3s finished

[2019-10-28 15:54:58] Features: 3/17 -- score: 0.7441860465116279[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [None]:
# Exhaustive search restricted to subsets of 3 to 10 features.
efs3 = efs(
    clf,
    min_features=3,
    max_features=10,
    scoring='accuracy',
    print_progress=True,
    cv=tscv(n_splits=5),
)
efs3 = efs3.fit(X_train_complete, y_train_complete)

In [None]:
# Cross-validated accuracy of the best subset found by the exhaustive selector efs3.
efs3.best_score_

In [52]:
# Cross-validated accuracy of the subset chosen by the sequential selector sfs3.
sfs3.k_score_

0.772093023255814

In [None]:
# Column indices of the best subset found by the exhaustive selector efs3.
efs3.best_idx_

In [53]:
# Column indices chosen by the sequential selector sfs3.
sfs3.k_feature_idx_

(1, 4, 7, 9, 11, 12, 13, 14)

In [54]:
# Translate the column indices chosen by sfs3 into column names of df.
colname = df.columns[list(sfs3.k_feature_idx_)]
print(colname)

Index(['O_DRebDiff', 'O_STLDiff', 'O_x3PMDiff', 'O_FTPDiff', 'D_DRebDiff',
       'D_AsstDiff', 'D_PFDiff', 'D_BlkDiff'],
      dtype='object')


In [55]:
# Refit the logistic regression on the columns picked by sfs3 and predict the test rows.
sfs3_cols = list(sfs3.k_feature_idx_)
clf.fit(X_train_complete[:, sfs3_cols], y_train_complete)
y_sfs_pred_complete = clf.predict(X_test_complete[:, sfs3_cols])

In [56]:
# Test accuracy of the sfs3 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test_complete, y_pred=y_sfs_pred_complete)

0.6991150442477876

In [None]:
# Refit the logistic regression on the columns picked by efs3 and predict the test rows.
efs3_cols = list(efs3.best_idx_)
clf.fit(X_train_complete[:, efs3_cols], y_train_complete)
y_efs_pred_complete = clf.predict(X_test_complete[:, efs3_cols])

In [None]:
# Test accuracy of the efs3 model (arguments in the conventional y_true, y_pred order).
acc(y_true=y_test_complete, y_pred=y_efs_pred_complete)

In [57]:
# Baseline: fit on every feature of the complete dataset and score on the test rows.
clf.fit(X_train_complete, y_train_complete)
y_all_pred_complete = clf.predict(X_test_complete)
acc(y_true=y_test_complete, y_pred=y_all_pred_complete)

0.7079646017699115