In [None]:
# This is one of my first notebooks so feel free to give suggestions in the comments

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

### Load the ODI cricket CSV into a DataFrame

DATASET_PATH = "../input/cricket-mach-dataset/ODI_Cricket Match Dataset.csv"
df = pd.read_csv(DATASET_PATH)

## Overview

- Details of dataset
- Data Cleaning
- Some Data Visualizations and Observations
- Preparing data for training the model
- Defining accuracy function to calculate all metric values

### Regression Models

- Linear Regression
- Decision Tree Regressor
- Random Forest
- Gradient Boost
- Ada Boost
- Cat Boost

In [None]:
# Names of the model inputs; used further down to label the feature-importance charts.
# Presumably listed in the same order as the columns selected for X -- verify if X changes.
feature_names = np.array([
    "venue", "bat_team", "bowl_team", "batsman", "bowler",
    "runs", "wickets", "overs", "striker", "non-striker",
])

df.head(8)

# Details of the dataset

- Striker - The greater of the two batsmen's current scores (runs scored by the striker and by the non-striker)
- Non-Striker - The lesser of the two batsmen's current scores (runs scored by the striker and by the non-striker)
- Runs_last_5 - Runs scored in last 5 overs
- Wickets_last_5 - Wickets taken down in the last 5 overs

In [None]:
### number of rows and cols

df.shape

In [None]:
### info about dataset

df.info()

# Data Cleaning 

### Checking if any null values present in the dataset taken

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
### Getting results scores of all matches played so far..

# Keep only the last recorded row of every match id. drop_duplicates(keep="last")
# does this in one pass over the frame, instead of re-filtering the whole
# DataFrame once per match in a Python loop. The original row labels are kept.
matches = df.drop_duplicates(subset="mid", keep="last")

matches.describe()

### Observations

- On average, 250 runs were scored in an ODI innings
- On average, 8 wickets were taken by the bowling team in a match

In [None]:
df.dtypes

# Data Visualizations

Plot for a particular match say between England and Ireland

In [None]:
stadium = "Civil Service Cricket Club, Stormont"
team1 = "England"
team2 = "Ireland"

# Rows where team1 batted against team2 at the chosen venue.
innings = df[(df["bat_team"]==team1) & (df["bowl_team"]==team2) & (df["venue"]== stadium)]

# Plot against a fresh 0..n-1 index: the filtered frame keeps the row labels of the
# full dataset, so without the reset the x-axis would show global row numbers
# rather than the count of rows (balls) within this innings.
innings.reset_index(drop=True).plot(xlabel="Number of balls")


In [None]:
# Line chart: runs on the y-axis against overs on the x-axis for the selected innings
innings.plot.line(x="overs", y="runs", alpha=0.5)

In [None]:
# selecting from stadium 

def getStadiumRuns(stadiumName, data=None):
    """Return the `total` of every match played at ``stadiumName``.

    Parameters
    ----------
    stadiumName : value compared against the ``venue`` column.
    data : DataFrame, optional
        Frame with ``venue``, ``mid`` and ``total`` columns. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    numpy.ndarray
        One total per match id, in order of first appearance; empty when the
        venue has no rows.
    """
    frame = df if data is None else data
    at_venue = frame.loc[frame["venue"] == stadiumName]
    # De-duplicate on the match id rather than on the total itself: two different
    # matches that ended on the same total must both be counted.
    # Assumes `total` is constant across all rows of one match -- TODO confirm.
    return at_venue.drop_duplicates(subset="mid")["total"].to_numpy()

getStadiumRuns("Civil Service Cricket Club, Stormont")

In [None]:
sns.boxplot(x=getStadiumRuns("Civil Service Cricket Club, Stormont"))

In [None]:
getStadiumRuns("M Chinnaswamy Stadium")

In [None]:
sns.boxplot(x=getStadiumRuns("M Chinnaswamy Stadium"))

### Observations

- The runs scored at Chinnaswamy Stadium are much higher than the runs scored at Civil Service Cricket Club, Stormont
- 75% of the matches at Chinnaswamy have a score greater than 280, whereas for the other venue it is greater than 220
- Two matches at Chinnaswamy have scores significantly lower than the others (the two dots on the left)

In [None]:
#stadium vs totals scatter plot
from matplotlib.pyplot import figure

# Tall, narrow canvas (10 x 30); presumably so the many venue labels on the y-axis stay readable.
fig, ax = plt.subplots(figsize=(10,30))

stadium = matches["venue"]
total = matches["total"]

ax.set(xlabel='total', ylabel='venue')

ax.scatter(total, stadium)


In [None]:
#teams vs totals

matches

In [None]:
# Scatter of each match's total (x) against the batting team (y).
fig, ax = plt.subplots(figsize=(20,20))

teams = matches["bat_team"]
total = matches["total"]

ax.set(xlabel='total', ylabel='bat_team')

ax.scatter(total, teams)

### Observations

- From both these graphs it can be observed that highest score was by England in Trent Bridge stadium

In [None]:
## Proof 
### Getting the highest score

# idxmax() returns an index *label*, so the row has to be fetched with .loc.
# .iloc would interpret the label as a position, which only happens to match
# while the frame still has its default 0..n-1 index.
max_runs_index = df["total"].idxmax()
max_runs = df.loc[max_runs_index,:]
max_runs

## Heatmaps

#### Correlations between the various features of the dataset

In [None]:
#heatmap

### checking correlations between various features

plt.figure(figsize=(15,10))
# Correlate numeric columns only: at this point the frame still holds text columns
# (venue, teams, players, ...), and recent pandas versions raise on DataFrame.corr()
# when non-numeric columns are present instead of silently skipping them.
correlations = df.select_dtypes(include="number").corr()
sns.heatmap(correlations, cmap="BrBG",annot=True)


### Observations

- Overs and runs are obviously positively correlated (strongly, in fact)
- Runs scored in the last 5 overs seem to be positively correlated with the number of wickets taken; this is consistent with runs and wickets also being positively correlated

In [None]:
#histogram - distribution of the totals across all matches

total = matches["total"]
bins = list(range(50, 501, 50))  # bin edges 50, 100, ..., 500

plt.hist(total, bins=bins)
plt.title('Matches vs total')
plt.xlabel('Total scores')
plt.ylabel('Number of matches in the range')
plt.tight_layout()
plt.show()


### Observations
- For most ODI matches, the first-innings score falls in the 250 - 300 range, followed by the 200 - 250 range

# Convert Textual data into Numeric data for Prediction

- As we can see, a few variables are stored as text; to train our models, we need to convert them to numeric values

- These include
    - venue 
    - bat_team
    - bowl_team
    - batsman
    - bowler

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per text column, kept in a dict so that every fitted text -> integer
# mapping stays available afterwards (e.g. label_encoders['venue'].transform([...])
# to encode a new sample, or .inverse_transform([...]) to decode one).
# Re-fitting a single shared encoder would overwrite all but the last mapping.
label_encoders = {}
for column in ['bat_team', 'venue', 'bowl_team', 'batsman', 'bowler']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Kept for backward compatibility: `le` used to end up fitted on 'bowler'.
le = label_encoders['bowler']

In [None]:
# errors='ignore' keeps this cell re-runnable: once 'date' is gone, a second
# execution would otherwise fail with KeyError.
df = df.drop(columns=['date'], errors='ignore')
df.head()

# Data Splitting 

### Separating labels and features

- Input Vector 
    - venue
    - bat_team
    - bowl_team
    - batsman
    - bowler
    - runs
    - wickets
    - overs
    - striker
    - non-striker
    
    
- Output is the total score

In [None]:
# Columns are picked by position. Presumably positions 1-8, 11, 12 are the ten inputs
# listed above and position 13 is the 'total' target -- verify against df.columns
# if the CSV layout ever changes.
feature_positions = [1,2,3,4,5,6,7,8,11,12]
target_position = 13

X = df.iloc[:, feature_positions].to_numpy()
y = df.iloc[:, target_position].to_numpy()

df.iloc[315031,:]

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split --
# and therefore every metric reported below -- identical on each Restart & Run All.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Scaling the Independent Variables (Features)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply that
# same mapping to both the training and the test split.
mm = MinMaxScaler()
mm.fit(X_train)
X_train = mm.transform(X_train)
X_test = mm.transform(X_test)

# Accuracy

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss


def accuracy(y_test,y_pred):
    """Print regression error metrics for predictions against the true values.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    Prints the mean absolute error, root mean squared error, root mean squared
    log error and R2 score. Returns ``None``.
    """
    mae = mean_absolute_error(y_test,y_pred)
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    # mean_squared_log_error rejects negative values with ValueError; a regressor
    # can produce them, so report NaN for that metric instead of aborting the report.
    if np.min(y_test) < 0 or np.min(y_pred) < 0:
        rmsle = float("nan")
    else:
        rmsle = np.sqrt(mean_squared_log_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

    # The labels name what is actually printed: the square-rooted errors.
    print("Mean absolute error : ",mae)
    print("Root mean squared error : ",rmse)
    print("Root mean squared log error : ",rmsle)
    print("r2 score : ",r2)


# Test Cases

### Test Case 1

    India vs West Indies 2017 match at Sir Vivian Richards Stadium ODI 3-5

    Batsman - Yuvraj Singh
    Bowler - AR Nurse

    Runs so far - 99
    Wickets - 2
    Overs - 25.2
    
    [116,8,19,929,45,99,2,25.2,44,39]
    
    Actual Outcome : 251
    
### Test Case 2
    
    Sri Lanka vs Bangladesh 2017 match at Sinhalese Sports Club Ground ODI 3-3

    Batsman - WU Tharanga
    Bowler - Mustafizur Rahman

    Runs so far - 41
    Wickets - 0
    Overs - 5.3
    
    [115,17,3,915,446,41,0,5.3,24,13]
    
    Actual Outcome : 280
    
### Test Case 3

    India vs England 2017 match at Barabati Stadium ODI 2-3
    
    Batsman - Yuvraj Singh
    Bowler - BA Stokes
    
    Runs so far - 258
    Wickets - 3
    Overs - 39.3
    
    [5,8,6,929,87,258,3,39.3,143,84]
    
    Actual Outcome : 381

# Linear Regression 

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training split; fit() returns the
# estimator itself, which is then displayed as the cell output.
lin = LinearRegression().fit(X_train,y_train)
lin

## Accuracy Test - Linear Regression

In [None]:
## Score the held-out test split with the fitted linear regression model

y_pred = lin.predict(X_test)
accuracy(y_test=y_test, y_pred=y_pred)

## Predicting Scores - Linear Regression

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251); row is scaled before predicting
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = lin.predict(scaled_case)

print("Prediction Score : ",test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280); row is scaled before predicting
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = lin.predict(scaled_case)

print("Prediction Score : ",test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381); row is scaled before predicting
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = lin.predict(scaled_case)

print("Prediction Score : ",test_case3)

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# random_state pins the feature permutation used when several splits tie,
# so the fitted tree (and its metrics) is the same on every run.
dtreg = DecisionTreeRegressor(random_state=42)
dtreg = dtreg.fit(X_train,y_train)

## Accuracy Test - Decision Tree Regressor

In [None]:
## Score the held-out test split with the fitted decision tree

dtreg_pred = dtreg.predict(X_test)
accuracy(y_test=y_test, y_pred=dtreg_pred)

## Predicting Score - Decision Tree Regressor

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251)
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = dtreg.predict(scaled_case)
print("Prediction_score : ",test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280)
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = dtreg.predict(scaled_case)
print("Prediction_score : ",test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381)
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = dtreg.predict(scaled_case)
print("Prediction_score : ",test_case3)

# Ensembling Models - Bagging and Boosting


# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# The forest bootstraps rows and permutes features at random; fixing random_state
# makes the fitted model, its metrics and its feature importances reproducible.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train,y_train)


## Accuracy Test - Random Forest Regressor

In [None]:
# Testing the dataset on trained model generated by Random Forest Regressor

reg_pred = reg.predict(X_test)
# accuracy() expects (true values, predictions). R2 is not symmetric in its
# arguments, so passing them the other way round reports a wrong score.
accuracy(y_test,reg_pred)

## Weightage of various features - Random Forest Regressor

In [None]:
# Order the features by ascending importance, so the most important one ends up
# as the top bar of the horizontal chart.
rf_importances = reg.feature_importances_
sorted_feature_importance = rf_importances.argsort()
plt.barh(
    feature_names[sorted_feature_importance],
    rf_importances[sorted_feature_importance],
    color='turquoise',
)
plt.xlabel("Random Forest Regressor Feature Importance")

## Predicting scores - Random Forest Regression

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251)
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = reg.predict(scaled_case)
print("Prediction score:", test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280)
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = reg.predict(scaled_case)
print("Prediction score:", test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381)
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = reg.predict(scaled_case)
print("Prediction score:", test_case3)

# Gradient Boost Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# random_state fixes the random choices made while growing the trees, so the
# reported metrics and importances do not change between runs.
gradreg = GradientBoostingRegressor(max_depth=10,n_estimators=100,learning_rate=0.5,random_state=42)
# fit() returns the estimator itself: gradientBoostModel and gradreg are the same object.
gradientBoostModel = gradreg.fit(X_train,y_train)

## Accuracy Test - Gradient Boost Regressor

In [None]:
# Score the held-out test split with the fitted gradient boosting model
gradreg_pred = gradientBoostModel.predict(X_test)
accuracy(y_test=y_test, y_pred=gradreg_pred)

## Weightage of various features - Gradient Boost Regressor

In [None]:
# Order the features by ascending importance, so the most important one ends up
# as the top bar of the horizontal chart.
gb_importances = gradreg.feature_importances_
sorted_feature_importance = gb_importances.argsort()
plt.barh(
    feature_names[sorted_feature_importance],
    gb_importances[sorted_feature_importance],
    color='turquoise',
)
plt.xlabel("Gradient Boost Regressor Feature Importance")

## Predicting scores - Gradient Boost Regressor

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251)
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = gradreg.predict(scaled_case)
print("Prediction_score : ",test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280)
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = gradreg.predict(scaled_case)
print("Prediction score:", test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381)
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = gradreg.predict(scaled_case)
print("Prediction score:", test_case3)

# Adaptive Boosting - AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost resamples the training rows at every boosting round; fixing
# random_state makes the fitted ensemble and its metrics reproducible.
adareg = AdaBoostRegressor(n_estimators=200,learning_rate=1.0,random_state=42)
# fit() returns the estimator itself: adaBoostModel and adareg are the same object.
adaBoostModel = adareg.fit(X_train,y_train)

## Accuracy Test - AdaBoost Regressor

In [None]:
# Score the held-out test split with the fitted AdaBoost model
adareg_pred = adaBoostModel.predict(X_test)
accuracy(y_test=y_test, y_pred=adareg_pred)

## Weightage of various features - AdaBoost Regressor

In [None]:
# Order the features by ascending importance, so the most important one ends up
# as the top bar of the horizontal chart.
ada_importances = adareg.feature_importances_
sorted_feature_importance = ada_importances.argsort()
plt.barh(
    feature_names[sorted_feature_importance],
    ada_importances[sorted_feature_importance],
    color='turquoise',
)
plt.xlabel("Ada Boost Regressor Feature Importance")

## Predicting scores - AdaBoost Regression

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251)
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = adareg.predict(scaled_case)
print("Prediction_score : ",test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280)
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = adareg.predict(scaled_case)
print("Prediction score:", test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381)
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = adareg.predict(scaled_case)
print("Prediction score:", test_case3)

# Categorical Boosting - CatBoost

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostRegressor

# 300 boosting rounds with a learning rate of 1.0.
catreg = CatBoostRegressor(n_estimators=300, learning_rate=1.0)
# fit() returns the estimator itself, so both names refer to the same fitted model.
catBoostModel = catreg.fit(X=X_train, y=y_train)

## Accuracy Test - CatBoost Regressor

In [None]:
# Only the R2 score is reported for CatBoost (the other metrics are not computed here).
catreg_pred = catBoostModel.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=catreg_pred)
r2

## Weightage of various features - CatBoost Regressor

In [None]:
# Order the features by ascending importance, so the most important one ends up
# as the top bar of the horizontal chart.
cat_importances = catreg.feature_importances_
sorted_feature_importance = cat_importances.argsort()
plt.barh(
    feature_names[sorted_feature_importance],
    cat_importances[sorted_feature_importance],
    color='turquoise',
)
plt.xlabel("Cat Boost Regressor Feature Importance")

## Predicting scores -  CatBoost Regression

### Test Cases

- Let us examine by taking three test cases

In [None]:
# Test case 1 -- India vs West Indies (actual total: 251)
scaled_case = mm.transform([[116,8,19,929,45,99,2,25.2,44,39]])
test_case1 = catreg.predict(scaled_case)
print("Prediction_score : ",test_case1)

In [None]:
# Test case 2 -- Sri Lanka vs Bangladesh (actual total: 280)
scaled_case = mm.transform([[115,17,3,915,446,41,0,5.3,24,13]])
test_case2 = catreg.predict(scaled_case)
print("Prediction score:", test_case2)

In [None]:
# Test case 3 -- India vs England (actual total: 381)
scaled_case = mm.transform([[5,8,6,929,87,258,3,39.3,143,84]])
test_case3 = catreg.predict(scaled_case)
print("Prediction score:", test_case3)