In [159]:
import pandas as pd
import numpy as np

# Model: linear regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Model: Naive Bayes
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Model: SVM
from sklearn.svm import SVC

In [110]:
df = pd.read_csv('cricket_strike_rate_dataset.csv')
df.info()

print("\nFirst few rows of Dataset")
# df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Runs             1000 non-null   int64  
 1   Balls Faced      1000 non-null   int64  
 2   4s               1000 non-null   int64  
 3   6s               1000 non-null   int64  
 4   Opposition Team  1000 non-null   object 
 5   Strike Rate      900 non-null    float64
 6   Unnamed: 6       0 non-null      float64
 7   Unnamed: 7       0 non-null      float64
 8   Unnamed: 8       0 non-null      float64
 9   Unnamed: 9       0 non-null      float64
 10  Unnamed: 10      0 non-null      float64
 11  Unnamed: 11      0 non-null      float64
 12  Unnamed: 12      0 non-null      float64
 13  Unnamed: 13      0 non-null      float64
 14  Unnamed: 14      1 non-null      object 
dtypes: float64(9), int64(4), object(2)
memory usage: 117.3+ KB

First few rows of Dataset


# 1. Data preprocessing
- Inputs: ['Runs','Balls Faced','4s','6s'], 'Opposition Team' dropped because type is object.
- Output: ['Strike Rate']
    - Remove invalid values: ['NaN','NA',float('inf')]

# 1.1 Fix null values
- The max Strike Rate is infinity, so we can't fill null values with average.
- So, I have filled the null values with the min strike rate (0.0).

In [126]:
# 1. data preprocessing
df_filtered = df[(df['Strike Rate'] != float('inf')) & (df['Strike Rate'] != str('NA')) & (df['Strike Rate'] != str('NaN'))]
x_initial = df_filtered[['Runs','Balls Faced','4s','6s']]
Y_initial = df_filtered['Strike Rate']

# check null values
print(f"Null values in input columns:\n{x_initial.isnull().sum()}")
print(f"\nNull values in output (class label): {Y_initial.isnull().sum()}")

# verify
Y_initial.isnull().sum()

print(f"\nData Types of input columns:\n{x_initial.dtypes}")
print(f"\nData Type of output column: {Y_initial.dtypes}")

Null values in input columns:
Runs           0
Balls Faced    0
4s             0
6s             0
dtype: int64

Null values in output (class label): 100

Data Types of input columns:
Runs           int64
Balls Faced    int64
4s             int64
6s             int64
dtype: object

Data Type of output column: float64


# 1.2 - 1.5 further preprocessing
- Remove all rows having Strike Rate = infinity.
- Convert all data types of y to numeric.
- Check for duplicates. There are no duplicates.
- Convert to numpy array for easier learning by the ML Model.

In [133]:
# 1.3 convert data type of output column to numeric
# Y_initial = pd.to_numeric(Y_initial)

# check for duplicates
print(f"\nNumber of duplicate rows: {df_filtered.duplicated().sum()}")

# 1.5 convert to numpy array
x = x_initial.to_numpy()
Y = Y_initial.to_numpy()

# 1.4 check for isnan values in `Y`
x = x[~np.isnan(Y)]
Y = Y[~np.isnan(Y)]

print("\nShape of Input array:",x.shape)
print("\nShape of Output array:",Y.shape)


Number of duplicate rows: 0

Shape of Input array: (896, 4)

Shape of Output array: (896,)


# 1.5 Split dataset
- Training: 80%
- Testing: 20%

# 1.6 Model: Linear Regression

In [134]:
# split dataset
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2)

model_linear_regression = LinearRegression()
model_linear_regression.fit(x_train, Y_train)

# train model
Y_pred = model_linear_regression.predict(x_test)

mse = mean_squared_error(Y_test,Y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Model Coefficients: {model_linear_regression.coef_}")
print(f"Model Intercept: {model_linear_regression.intercept_}")

Mean Squared Error: 223097.28037437197
Model Coefficients: [ 3.72471797 -5.80020644 -1.92561706 -1.82851865]
Model Intercept: 584.0866439219453


# 3.b Model: Naive Bayes Classifier

In [150]:
# data obtain
df_filtered = df[(df['Strike Rate'] != float('inf')) & (df['Strike Rate'] != str('NA')) & (df['Strike Rate'] != str('NaN'))]
x_initial = df_filtered[['Runs','Balls Faced','4s','6s','Strike Rate']]
Y_initial = df_filtered['Opposition Team']

In [151]:
# 1. data preprocessing
# check null values
print(f"Null values in input columns:\n{x_initial.isnull().sum()}")
print(f"\nNull values in output (class label): {Y_initial.isnull().sum()}")

# verify
Y_initial.isnull().sum()

print(f"\nData Types of input columns:\n{x_initial.dtypes}")
print(f"\nData Type of output column: {Y_initial.dtypes}")

Null values in input columns:
Runs             0
Balls Faced      0
4s               0
6s               0
Strike Rate    100
dtype: int64

Null values in output (class label): 0

Data Types of input columns:
Runs             int64
Balls Faced      int64
4s               int64
6s               int64
Strike Rate    float64
dtype: object

Data Type of output column: object


# Encode the output value: Label Encoder

In [152]:
Y_initial.unique()

array(['England', 'India', 'Australia', 'Pakistan', 'South Africa'],
      dtype=object)

In [153]:
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y_initial)

# Convert to numpy arrays
x = x_initial.to_numpy()
Y = Y_encoded

# check for isnan values in `Y`
x = x[~np.isnan(Y)]

# Train model: Naive Bayes

In [161]:
# Split the data
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2)
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

model_naive_bayes = GaussianNB()
model_naive_bayes.fit(x_train, Y_train)

# Make predictions on the test set
Y_pred = model_naive_bayes.predict(x_test)

# Evaluate the model
accuracy_naivebayes = accuracy_score(Y_test, Y_pred)
print(f"Accuracy of Naive Bayes Classifier: {accuracy_naivebayes}")

Accuracy of Naive Bayes Classifier: 0.225


# Model: Support Vector Machines (SVM)

In [162]:
# train
model_svm = SVC()
model_svm.fit(x_train, Y_train)

# predict
Y_pred = model_svm.predict(x_test)

accuracy_svm = accuracy_score(Y_test, Y_pred)
print(f"Accuracy of SVM Classifier: {accuracy_svm}")

Accuracy of SVM Classifier: 0.24


In [164]:
# Compare accuracy
print(f"Difference in Accuracy: {accuracy_svm-accuracy_naivebayes}")

Difference in Accuracy: 0.014999999999999986
