In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import chi2_contingency

In [37]:
# Load the input CSV into the working DataFrame.
input_csv = '../Data/final_inning1.csv'
df = pd.read_csv(input_csv)

In [4]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113101 entries, 0 to 113100
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   overs                113101 non-null  int64  
 1   ballnumber           113101 non-null  int64  
 2   total_run            113101 non-null  int64  
 3   isWicketDelivery     113101 non-null  int64  
 4   BattingTeam          113101 non-null  int64  
 5   TossWinner           113101 non-null  int64  
 6   TossDecision_bat     113101 non-null  int64  
 7   TossDecision_field   113101 non-null  int64  
 8   runs scored          113101 non-null  int64  
 9   BowlingTeam          113101 non-null  int64  
 10  bowling_team_points  113101 non-null  int64  
 11  ground_adv           113101 non-null  int64  
 12  batsman_run          113101 non-null  int64  
 13  non_striker_runs     113101 non-null  int64  
 14  wickets_gone         113101 non-null  int64  
 15  wickets_remaining

In [5]:
# Pairwise Pearson correlation (the pandas default) between the columns of df.
corr = df.corr(method='pearson')

In [13]:
# Keep only the columns whose absolute correlation with the target exceeds 0.1.
# The target itself correlates 1.0 with itself, so it is retained as well.
target_corr = corr['batting_team_won'].abs()
df1 = df[target_corr[target_corr > 0.1].index]

In [14]:
# Columns screened with the chi-square test against the target.
cat_cols = [
    'isWicketDelivery',
    'TossWinner',
    'BattingTeam',
    'TossDecision_bat',
    'TossDecision_field',
    'BowlingTeam',
    'overs',
    'ballnumber',
    'ground_adv',
]

In [15]:
# Separate the target column from the predictor columns.
target_name = 'batting_team_won'
y = df[target_name]
X = df.drop(columns=target_name)

In [16]:
# Hold out 33% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [17]:
# Accumulator for the chi-square results (filled column-wise by the loop cell).
chi2_check = dict()

In [18]:
# Chi-square test of independence between the target and each column listed in
# cat_cols, computed on the training split only.
# The result dict is rebuilt from scratch here so that re-running this cell
# does not append duplicate rows to it.
chi2_check = {'Feature': [], 'p-value': []}
for column in cat_cols:
    contingency = pd.crosstab(y_train, X_train[column])
    # chi2_contingency returns (statistic, p-value, dof, expected); only the
    # p-value is recorded.
    p = chi2_contingency(contingency)[1]
    chi2_check['Feature'].append(column)
    chi2_check['p-value'].append(p)

In [19]:
# Tabulate the collected feature names and p-values, one row per feature.
chi2_result = pd.DataFrame(chi2_check)

In [20]:
# Display the per-feature chi-square p-values.
chi2_result

Unnamed: 0,Feature,p-value
0,isWicketDelivery,3.34609e-10
1,TossWinner,3.3137500000000005e-181
2,BattingTeam,0.0
3,TossDecision_bat,0.03728363
4,TossDecision_field,0.03728363
5,BowlingTeam,0.0
6,overs,0.9334406
7,ballnumber,0.1265792
8,ground_adv,7.262138e-133


In [21]:
# Columns from cat_cols to keep; the remaining cat_cols entries are dropped from df.
cols = [
    'isWicketDelivery',
    'TossWinner',
    'TossDecision_bat',
    'TossDecision_field',
    'ground_adv',
]

In [22]:
# Drop the cat_cols entries that were not retained in `cols`.
# Only columns still present in df are dropped, so re-running this cell is a
# no-op instead of raising KeyError on the already-removed columns.
rejected_cat_cols = [
    col for col in cat_cols
    if col not in cols and col in df.columns
]
df = df.drop(columns=rejected_cat_cols)

In [23]:
# List the columns currently in df.
df.columns

Index(['total_run', 'isWicketDelivery', 'TossWinner', 'TossDecision_bat',
       'TossDecision_field', 'runs scored', 'bowling_team_points',
       'ground_adv', 'batsman_run', 'non_striker_runs', 'wickets_gone',
       'wickets_remaining', 'batting_team_won', 'batting_team_points',
       'boundaries_scored', 'runrate'],
      dtype='object')

Numerical Feature Selection

In [24]:
# Rebuild predictors and target from the reduced df.
X, y = df.drop(columns=['batting_team_won']), df['batting_team_won']

In [25]:
# Remove every cat_cols column still present in X, leaving the other columns.
cat_cols_in_X = [name for name in cat_cols if name in X.columns]
X.drop(columns=cat_cols_in_X, inplace=True)

In [26]:
# List the columns left in X after removing the cat_cols columns.
X.columns

Index(['total_run', 'runs scored', 'bowling_team_points', 'batsman_run',
       'non_striker_runs', 'wickets_gone', 'wickets_remaining',
       'batting_team_points', 'boundaries_scored', 'runrate'],
      dtype='object')

In [27]:
# Same 67/33 split and seed as before, now on the reduced X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=1,
)

In [28]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [29]:
# Score every feature with the ANOVA F-test on the training split.
# k='all' keeps all features so the scores can be inspected before choosing.
fs = SelectKBest(f_classif, k='all')
fs.fit(X_train, y_train)

In [30]:
# Print each feature's F-score, labelled by its positional index in the fitted data.
for idx, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (idx, score))

Feature 0: 274.539801
Feature 1: 477.767886
Feature 2: 9.580418
Feature 3: 1167.874467
Feature 4: 1008.979826
Feature 5: 1133.777601
Feature 6: 1133.777601
Feature 7: 27.455768
Feature 8: 794.309028
Feature 9: 720.335824


Features selected based on the F-scores above: Feature 0, Feature 1, Feature 3, Feature 4, Feature 5, Feature 8, Feature 9

In [31]:
# List the training columns in order, to map the feature indices printed above to names.
X_train.columns

Index(['total_run', 'runs scored', 'bowling_team_points', 'batsman_run',
       'non_striker_runs', 'wickets_gone', 'wickets_remaining',
       'batting_team_points', 'boundaries_scored', 'runrate'],
      dtype='object')

Selected columns by name: total_run, runs scored, batsman_run, non_striker_runs, wickets_gone, boundaries_scored, runrate

In [32]:
# Drop the three numerical columns that are not kept in the final feature set.
# Only columns still present in df are dropped, so re-running this cell is a
# no-op instead of raising KeyError on the already-removed columns.
rejected_num_cols = ['bowling_team_points', 'wickets_remaining', 'batting_team_points']
df = df.drop(columns=[col for col in rejected_num_cols if col in df.columns])

In [33]:
# List the columns currently in df.
df.columns

Index(['total_run', 'isWicketDelivery', 'TossWinner', 'TossDecision_bat',
       'TossDecision_field', 'runs scored', 'ground_adv', 'batsman_run',
       'non_striker_runs', 'wickets_gone', 'batting_team_won',
       'boundaries_scored', 'runrate'],
      dtype='object')

In [35]:
# List the columns of df1 (the subset chosen by the 0.1 correlation threshold).
df1.columns

Index(['batsman_run', 'non_striker_runs', 'wickets_gone', 'wickets_remaining',
       'batting_team_won', 'boundaries_scored'],
      dtype='object')

In [38]:
# Write the current df to disk without the index column.
output_csv = '../Data/feature_selected.csv'
df.to_csv(output_csv, index=False)

Selected Features