# **Problem Statement:**

- The major objective of this project is to extract actionable insights from the historical match data and recommend strategic changes that help India win.
- The primary objective is to create Machine Learning models which correctly predict a win for the Indian Cricket Team.
- Once a model is developed, you have to extract actionable insights and recommendations. Also, below are the details of the next 10 matches India is going to play; you have to predict the results of those matches.


In [None]:
!pip install optuna
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display
plt.rcParams['figure.figsize'] = (16,8)
plt.style.use("fivethirtyeight")

from sklearn.model_selection import KFold,StratifiedKFold, train_test_split

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from xgboost import  XGBClassifier


import optuna

In [None]:
# If read.excel doesnt work uncomment and run following code.
!pip install xlrd



# **Loading the data**

### Data Dictionary

In [None]:
# Data dictionary: second header row, columns 1 and 2 of the 'Meta data' sheet.
df_metadata = pd.read_excel(
    '/content/Sports Data.xlsx',
    sheet_name='Meta data',
    header=1,
    usecols=[1, 2],
)
df_metadata

## Data set

In [None]:
# Raw match records, read from the main sheet of the workbook.
df_maindata_excel = pd.read_excel(
    '/content/Sports Data.xlsx',
    sheet_name='Sports data for DSBA',
)
df_maindata_excel.info()

## **Creating a StratifiedKFold `kfold` column in the dataset, which will be used to create train/validation splits in later stages.**

In [None]:
# Every row starts unassigned (-1) and is then given the id of the fold in
# which it serves as validation data. The split is stratified on `Result`.
df_maindata_excel["kfold"] = -1

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = skf.split(df_maindata_excel, df_maindata_excel["Result"])
for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    df_maindata_excel.loc[holdout_rows, "kfold"] = fold_id

# Persist the fold assignment so later stages can reload it unchanged.
df_maindata_excel.to_csv("maindata_folds.csv", index=False)

In [None]:
# Reload the fold-annotated data written by the previous cell and preview it.
df_maindata = pd.read_csv('maindata_folds.csv')
df_maindata.head(5)

In [None]:
# (rows, columns) of the fold-annotated dataset.
df_maindata.shape

In [None]:
# Summary statistics of the numeric columns.
df_maindata.describe()

In [None]:
# Column dtypes and non-null counts (used to spot missing values and object-typed columns).
df_maindata.info()

Observations:

---


- There are missing values. Check them and treat them accordingly.
- Player_highest_wicket and Players_scored_zero are of object type; convert them to int, as they are more useful as integers.


In [None]:
"""Making columns lower case and replacing any spaces with '_'"""
# Two-step form: build the normalised labels first, then assign them back.
normalized_names = df_maindata.columns.str.lower().str.replace(' ', '_')
df_maindata.columns = normalized_names

## **Check unique values and convert required columns into Integer type:**

---


- Check the unique values 
- If all values are int then we change it to integer
- Else we convert any string values to int and then convert all values into int type.

In [None]:
# Dump the distinct values of every column to spot inconsistent encodings.
for column_name, column_values in df_maindata.items():
    print(column_name, '\n', column_values.unique())
    print()

### **Observations:**

---


- Variables like player_highest_wicket, players_scored_zero, match_format and first_selection contain duplicate spellings of the same value.
- E.g. player_highest_wicket has both 'Three' and 3, which mean the same thing. Hence, we will replace 'Three' with 3.
- The same applies to the other variables.
- Finally, convert player_highest_wicket and players_scored_zero to integer.

In [None]:
"""Replacing repeated values"""

# (column, spelling to replace, canonical value) — applied one after another.
value_fixes = [
    ('player_highest_wicket', 'Three', 3),
    ('players_scored_zero', 'Three', 3),
    ('match_format', '20-20', 'T20'),
    ('first_selection', 'Bat', 'Batting'),
]
for fix_col, stale_value, canonical_value in value_fixes:
    current = df_maindata[fix_col]
    df_maindata[fix_col] = current.mask(current == stale_value, canonical_value)


"""Converting player_highest_wicket and players_scored_zero to integer"""
for int_col in ('player_highest_wicket', 'players_scored_zero'):
    df_maindata[int_col] = df_maindata[int_col].astype('int')

# Re-print the distinct values to confirm the clean-up.
for column_name, column_values in df_maindata.items():
    print(column_name, '\n', column_values.unique())
    print()

In [None]:
# Missing-value count per column, largest first, rendered as a colour-graded table.
missing_counts = df_maindata.isnull().sum().to_frame().rename({0: 'Missing Values'}, axis=1)
missing_counts.sort_values(by='Missing Values', ascending=False).style.background_gradient('copper_r')

In [None]:
# Percentage of missing values per column.
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the values match
# the '%age' label. round(3) on the percentage keeps the same resolution as the
# previous round(5) on the fraction, so zero / non-zero tests are unaffected.
percentage = (
    df_maindata.isnull().mean().mul(100).round(3)
    .to_frame()
    .rename({0: '%age of Missing Values'}, axis=1)
    .sort_values(by='%age of Missing Values', ascending=False)
)
display(percentage.style.background_gradient('copper_r'))

# Long-form frame (one row per variable) for plotting.
plot_percentage = percentage.reset_index().rename({"index": "Variables"}, axis=1)
# x / y must be passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`.
ax = sns.barplot(x='%age of Missing Values', y='Variables', data=plot_percentage, palette='copper')
plt.show()

Observations:
- Missing values in Avg_team_Age, Bowlers_in_team, Audience_number,Match_format ,Offshore, Season,First_selection, Match_light_type ,All_rounder_in_team, Opponent,Max_run_given_1over, Extra_bowls_bowled,player_highest_run,Max_run_scored_1over, Min_run_scored_1over Treat them accordingly
- Avg_team_Age has highest number of missing values.
- **Missing values in Opponent cannot be filled with the mode, as that might bias our results towards the most frequent opponent. The best option is to drop those rows.**

In [None]:
# Columns with at least one missing value, split into object-typed
# (categorical) and everything else (numerical).
missing_values_cols = plot_percentage.loc[
    plot_percentage['%age of Missing Values'] != 0, 'Variables'
].tolist()
missing_cat_cols = [name for name in missing_values_cols if df_maindata[name].dtype == 'object']
missing_num_cols = [name for name in missing_values_cols if name not in missing_cat_cols]
missing_cat_cols

In [None]:
# Rows without an opponent are dropped rather than imputed.
df_maindata.dropna(subset=['opponent'], inplace=True)

# 'opponent' is fully handled by the row drop above, so exclude it from the
# mode imputation. The membership guard keeps this cell re-runnable.
if 'opponent' in missing_cat_cols:
    missing_cat_cols.remove('opponent')

# Categorical columns: fill with the most frequent value (mode).
# The result is assigned back instead of using `fillna(inplace=True)` on a
# column selection, which is chained assignment and does not update the frame
# under pandas copy-on-write.
for col in missing_cat_cols:
    df_maindata[col] = df_maindata[col].fillna(df_maindata[col].mode()[0])

# Numerical columns: fill with the median.
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split further down — confirm this leakage is acceptable.
for col in missing_num_cols:
    df_maindata[col] = df_maindata[col].fillna(df_maindata[col].median())

In [None]:
# Re-check the missing-value counts after dropping and imputing.
missing_counts = df_maindata.isnull().sum().to_frame().rename({0: 'Missing Values'}, axis=1)
missing_counts.sort_values(by='Missing Values', ascending=False).style.background_gradient('copper_r')

Observations:
- Dropped the rows with missing values in Opponent Col 
- Imputed categorical missing values using mode
- Imputed numerical missing values using the median

## **Duplicates?**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_maindata.duplicated().sum()

## **Any Constant Features? They show same value or just one value for all the records in the dataset.**

In [None]:
# Checking constant variables
constant_features = [col for col in df_maindata.columns if df_maindata[col].nunique() == 1]
constant_features

In [None]:
# Dropping the constant feature
del df_maindata['wicket_keeper_in_team']

## **Correlation**

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 raises when object columns are present. Compute the matrix
# once and reuse it for both the values and the mask.
corr_matrix = df_maindata.select_dtypes(include='number').corr()

plt.figure(figsize=(12,12))
# Cells with correlation below 0.65 are hidden so only strong positive pairs remain.
# NOTE(review): strong negative correlations are hidden too — use
# corr_matrix.abs() in the mask if those should be shown.
ax = sns.heatmap(corr_matrix, annot = True, fmt='.2f', mask = corr_matrix < .65, square = True, lw=0.2,linecolor='black' , cmap = 'copper_r')
plt.title("Heatmap of Correlation",fontsize = 20)
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(fontsize = 15)
plt.show();

Observations:
- Wicket_keeper_in_team has only one value. Hence we can drop that variable
- Multicollinearity exists between player_highest_wicket and (audience number and extra bowls bowled) at 0.65 as threshold.
- Drop audience number and extra bowls bowled

In [None]:
# Drop the two columns flagged as highly correlated with player_highest_wicket.
# errors='ignore' keeps the cell re-runnable once they are gone.
df_maindata.drop(columns=['audience_number', 'extra_bowls_bowled'], errors='ignore', inplace=True)

## **Plotting Categorical Features**

In [None]:
# Match results by toss decision. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='first_selection', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by season. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='season', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by opponent. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='opponent', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by match format. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='match_format', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by the `offshore` flag. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='offshore', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by light type. `x` is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='match_light_type', hue='result', data=df_maindata)
plt.show()

In [None]:
# Match results by average team age (before outlier treatment). `x` is passed
# by keyword because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='avg_team_age', hue='result', data=df_maindata)
plt.show()

Observation:
- avg_team_age has outliers. The average age of a team cannot be 12 or 70; these values are either data-entry mistakes or outliers.
- India wins most matches when the average team age is 30.
- Day-time matches are beneficial for us.

### **Treating the outliers**
- **Capping and Flooring the outliers**



In [None]:
# Cap and floor avg_team_age at its 1st / 99th percentiles.
percentiles = df_maindata['avg_team_age'].quantile([0.01, 0.99]).values
age_floor, age_cap = percentiles
df_maindata['avg_team_age'] = df_maindata['avg_team_age'].clip(lower=age_floor, upper=age_cap)

In [None]:
# Match results by average team age after capping/flooring. `x` is passed by
# keyword because seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='avg_team_age', hue='result', data=df_maindata)
plt.show()

In [None]:
# One horizontal box plot per non-object column.
for col in df_maindata.columns:
    if df_maindata[col].dtype == 'object':
        continue
    # Create the figure only when something is drawn; previously an empty
    # figure was opened (and never shown or closed) for every object column.
    plt.figure(figsize = (12,5))
    # Explicit `x=`: a bare positional Series is interpreted as `data` on seaborn >= 0.12.
    sns.boxplot(x=df_maindata[col])
    plt.show()

## **Let's work with Cardinality**

In [None]:
# For each column: its name, a sample of distinct values and the distinct count.
for column_name in df_maindata.columns:
    distinct_values = df_maindata[column_name].unique()
    distinct_count = df_maindata[column_name].nunique()
    print(column_name)
    print(f"First 5 Unique Values: {distinct_values[:5]}")
    print(f"Number of unique values: {distinct_count}")

    print('\n')

### **Observation:**

---
- We can One Hot Encode variables with 3 unique values
- Drop audience_number as it doesn't contribute much towards the predictions.


In [None]:
# Feature columns = everything except the id, the target and the fold marker;
# split into object-typed (categorical) and the rest (numerical).
useful_cols = [
    name for name in df_maindata.columns
    if name not in ('game_number', 'result', 'kfold')
]
categorical = [name for name in useful_cols if df_maindata[name].dtype == 'object']
numerical = [name for name in useful_cols if name not in categorical]

In [None]:
# Feature columns with at most three distinct values are one-hot encoded below.
ohe_list = [name for name in useful_cols if df_maindata[name].nunique() <= 3]

ohe_list

In [None]:
# Append one indicator column per level of every column in ohe_list,
# named '<column>_<level>'.
for ohe_col in ohe_list:
    indicator_frame = pd.get_dummies(df_maindata[ohe_col], prefix=ohe_col)
    for indicator_name in indicator_frame.columns:
        df_maindata[indicator_name] = indicator_frame[indicator_name]

In [None]:
# Remove the source columns that were one-hot encoded, plus the match id.
df_maindata.drop(columns=[*ohe_list, 'game_number'], inplace=True)

In [None]:
# Binary target: 1 for 'Win', 0 for anything else.
df_maindata['result'] = (df_maindata['result'] == 'Win').astype('int64')

In [None]:
# Recompute the feature lists now that the one-hot columns exist and the
# encoded source columns are gone.
useful_cols = [
    name for name in df_maindata.columns
    if name not in ('game_number', 'result', 'kfold')
]
categorical = [name for name in useful_cols if df_maindata[name].dtype == 'object']
numerical = [name for name in useful_cols if name not in categorical]

In [None]:
# Larger tick/axis labels for the faceted scatter plot below.
sns.set(rc={'xtick.labelsize': 16, 'ytick.labelsize': 16, 'axes.labelsize': 16})
# Scatter of highest individual score vs. most runs conceded in an over,
# faceted by the bowling-first indicator (columns) and max wickets in an over (rows).
sns.relplot(
    data=df_maindata,
    x="player_highest_run",
    y="max_run_given_1over",
    col="first_selection_Bowling",
    row="max_wicket_taken_1over",
    hue='result',
)
plt.show()

In [None]:
# Same cardinality report as before, now on the encoded frame.
for column_name in df_maindata.columns:
    distinct_values = df_maindata[column_name].unique()
    distinct_count = df_maindata[column_name].nunique()
    print(column_name)
    print(f"First 5 Unique Values: {distinct_values[:5]}")
    print(f"Number of unique values: {distinct_count}")

    print('\n')

In [None]:
# Hold out 15% of the rows (stratified on the target) as the final test set;
# `df` keeps the remaining rows together with their precomputed `kfold` ids.
df, df_test1 = train_test_split(
    df_maindata,
    test_size=0.15,
    random_state=7,
    stratify=df_maindata['result'],
)

## **Hyperparameter tuning the first model (XGBoost)**

In [None]:
def run(trial):
    """Optuna objective for XGBoost: mean validation ROC-AUC over the 5 folds.

    Reads the notebook globals `df` (training frame with `kfold` and `result`
    columns) and `useful_cols` (feature column names).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC-AUC averaged over all five validation folds. (The previous version
        returned only the score of the last fold.)
    """
    # Sample the hyperparameters once per trial; they are shared by all folds.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 9000),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        # suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_float('gamma', 0, 0.001),
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.result
        yvalid = xvalid.result

        xtrain = xtrain[useful_cols]
        xvalid = xvalid[useful_cols]

        # Fit the vectorizer on the training fold only, then apply it to validation.
        dv = DictVectorizer(sparse=False)
        xtrain = dv.fit_transform(xtrain.to_dict(orient='records'))
        xvalid = dv.transform(xvalid.to_dict(orient='records'))

        # early_stopping_rounds is a constructor argument; passing it to fit()
        # was removed in XGBoost 2.0.
        model_xgb = XGBClassifier(**params, random_state=7, early_stopping_rounds=300)
        # NOTE(review): the validation fold drives early stopping and is also
        # the fold being scored, so the AUC is somewhat optimistic.
        model_xgb.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], verbose=500)

        preds_valid = model_xgb.predict_proba(xvalid)[:, 1]
        fold_scores.append(metrics.roc_auc_score(yvalid, preds_valid))

    return float(np.mean(fold_scores))


# Seed the sampler so the hyperparameter search is reproducible on re-run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=7),
)
study.optimize(run, n_trials=20)

study.best_params

In [None]:

# 5-fold training of the tuned XGBoost model. Collects out-of-fold validation
# probabilities, per-fold test probabilities and per-fold ROC-AUC.
df_test = df_test1[useful_cols]
final_test_predictions = []
final_valid_predictions = {}
decisions_valid = []
decisions_test = []
scores = []

# Hyperparameters found by the Optuna study above. Previously they were
# fetched but never passed to the model.
best_params = study.best_params

for fold in range(5):
    valid_mask = df.kfold == fold
    # Key out-of-fold predictions by the original row labels of `df`.
    # After reset_index every fold is numbered 0..n-1, so the folds would
    # overwrite each other in the dict.
    valid_ids = df.index[valid_mask].tolist()

    xtrain = df[~valid_mask].reset_index(drop=True)
    xvalid = df[valid_mask].reset_index(drop=True)
    xtest = df_test1.copy()

    ytrain = xtrain.result
    yvalid = xvalid.result

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]
    xtest = xtest[useful_cols]

    # Fit the vectorizer on the training fold only.
    dv = DictVectorizer(sparse=False)
    xtrain = dv.fit_transform(xtrain.to_dict(orient='records'))
    xvalid = dv.transform(xvalid.to_dict(orient='records'))
    xtest = dv.transform(xtest.to_dict(orient='records'))

    # early_stopping_rounds is a constructor argument; passing it to fit()
    # was removed in XGBoost 2.0.
    model = XGBClassifier(**best_params, random_state=7, early_stopping_rounds=300)
    model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], verbose=500)
    preds_valid = model.predict_proba(xvalid)[:, 1]

    # Positive-class probability for every test row. The previous `[:1]`
    # sliced rows and returned only the first row's two probabilities.
    test_preds = model.predict_proba(xtest)[:, 1]

    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    roc_auc = metrics.roc_auc_score(yvalid, preds_valid)
    print(fold, roc_auc)
    scores.append(roc_auc)

print(np.mean(scores), np.std(scores))

## **Confusion Matrix and Classification reports for XGB model.**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate on the held-out test set with `model` and `dv` as left by the last
# cross-validation fold. The encoded matrix goes into its own variable:
# overwriting `df_test` with an array made this cell fail when run a second time.
xtest_encoded = dv.transform(df_test1[useful_cols].to_dict(orient='records'))
test_labels = model.predict(xtest_encoded)  # predict once, reuse for both reports
print(confusion_matrix(df_test1.result, test_labels))
print(classification_report(df_test1.result, test_labels))

In [None]:
import xgboost as xgb
# The model was trained on DictVectorizer output, whose columns follow
# dv.feature_names_ (sorted, with string columns expanded to 'name=value'),
# not the order of useful_cols. Label the booster features accordingly.
model.get_booster().feature_names = list(dv.feature_names_)
xgb.plot_importance(model.get_booster())

In [None]:
from sklearn.ensemble import  RandomForestClassifier

## **Hyperparameter tuning Random Forest**

In [None]:
def run(trial):
    """Optuna objective for Random Forest: mean validation ROC-AUC over the 5 folds.

    Reads the notebook globals `df` (training frame with `kfold` and `result`
    columns) and `useful_cols` (feature column names).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC-AUC averaged over all five validation folds. (The previous version
        returned only the score of the last fold.)
    """
    # Sample once per trial. suggest_float(log=True) replaces the deprecated
    # suggest_loguniform; max_depth is still sampled as a float and truncated,
    # so study.best_params['max_depth'] remains a float.
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 2, 7000),
        'max_depth': int(trial.suggest_float('max_depth', 1, 32, log=True)),
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.result
        yvalid = xvalid.result

        xtrain = xtrain[useful_cols]
        xvalid = xvalid[useful_cols]

        # Fit the vectorizer on the training fold only, then apply it to validation.
        dv = DictVectorizer(sparse=False)
        xtrain = dv.fit_transform(xtrain.to_dict(orient='records'))
        xvalid = dv.transform(xvalid.to_dict(orient='records'))

        model_rf = RandomForestClassifier(**params, random_state=7)
        model_rf.fit(xtrain, ytrain)

        preds_valid = model_rf.predict_proba(xvalid)[:, 1]
        fold_scores.append(metrics.roc_auc_score(yvalid, preds_valid))

    return float(np.mean(fold_scores))


# Seed the sampler so the hyperparameter search is reproducible on re-run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=7),
)
study.optimize(run, n_trials=20)

study.best_params

In [None]:

# 5-fold training of the tuned Random Forest. Collects out-of-fold validation
# probabilities, per-fold test probabilities and per-fold ROC-AUC.
df_test = df_test1[useful_cols]
final_test_predictions = []
final_valid_predictions = {}
decisions_valid = []
decisions_test = []
scores = []

# Use the result of the Random Forest study above instead of values pasted in
# from an earlier run. max_depth may be stored as a float, hence the int() cast.
best_params = study.best_params
rf_n_estimators = int(best_params['n_estimators'])
rf_max_depth = int(best_params['max_depth'])

for fold in range(5):
    valid_mask = df.kfold == fold
    # Key out-of-fold predictions by the original row labels of `df`.
    # After reset_index every fold is numbered 0..n-1, so the folds would
    # overwrite each other in the dict.
    valid_ids = df.index[valid_mask].tolist()

    xtrain = df[~valid_mask].reset_index(drop=True)
    xvalid = df[valid_mask].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.result
    yvalid = xvalid.result

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    # Fit the vectorizer on the training fold only.
    dv = DictVectorizer(sparse=False)
    xtrain = dv.fit_transform(xtrain.to_dict(orient='records'))
    xvalid = dv.transform(xvalid.to_dict(orient='records'))
    xtest = dv.transform(xtest.to_dict(orient='records'))

    model = RandomForestClassifier(
        random_state=7,
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict_proba(xvalid)[:, 1]

    test_preds = model.predict_proba(xtest)[:, 1]

    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    roc_auc = metrics.roc_auc_score(yvalid, preds_valid)
    print(fold, roc_auc)
    scores.append(roc_auc)

print(np.mean(scores), np.std(scores))

In [None]:
import pandas as pd
%matplotlib inline
#do code to support model
#"data" is the X dataframe and model is the SKlearn object

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(df_maindata.columns, model.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

(importances.sort_values(by='Gini-importance', ascending = False)).plot(kind='barh')