# Importing and removing unwanted columns

In [1]:
import numpy as np                              # Importing numpy library
import pandas as pd                             # Importing the pandas library

In [2]:
match = pd.read_csv('ipl2017.csv')              # Reading the csv dataset
match.head()                                    # Viewing first five rows of the dataset

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [3]:
match = match.drop(['mid', 'date', 'venue'], axis = 1)        # Dropped 3 unwanted columns
match.head()                                                  # Viewed first five rows of the dataset

Unnamed: 0,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


# Machine Learning Workflow

## 1--) Extraction 

In [4]:
y = match['total']                                      # Created a target variable
y.head(10)                                              # Viewed first 10 elements of the target variable

0    222
1    222
2    222
3    222
4    222
5    222
6    222
7    222
8    222
9    222
Name: total, dtype: int64

In [5]:
X = match.drop('total',axis=1)                          # Created a feature variable
X.head(10)                                              # Viewed first 10 elements of the feature variable

Unnamed: 0,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker
0,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0
1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0
2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0
3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0
4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0
5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.5,2,0,0,0
6,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,3,0,0.6,3,0,0,0
7,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,3,0,1.1,3,0,0,0
8,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,7,0,1.2,7,0,4,0
9,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,11,0,1.3,11,0,8,0


# Checking for the rules

In [6]:
X.info()                                                 # Checked summary of the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bat_team        76014 non-null  object 
 1   bowl_team       76014 non-null  object 
 2   batsman         76014 non-null  object 
 3   bowler          76014 non-null  object 
 4   runs            76014 non-null  int64  
 5   wickets         76014 non-null  int64  
 6   overs           76014 non-null  float64
 7   runs_last_5     76014 non-null  int64  
 8   wickets_last_5  76014 non-null  int64  
 9   striker         76014 non-null  int64  
 10  non-striker     76014 non-null  int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 6.4+ MB


In [7]:
X.shape                                          # Checking whether our features are of form rows and columns

(76014, 11)

In [8]:
type(X)                                          # Checking whether our features are of type array/DataFrame

pandas.core.frame.DataFrame

In [9]:
X.isnull().sum().sum()                           # Checking whether null values are present in our features

0

In [10]:
# Converting the categorical data into numeric data

In [11]:
from sklearn.preprocessing import LabelEncoder                # Importing necessary Library
labeled = LabelEncoder()                                      # Instantiated the LabelEncoder

In [12]:
X['bat_team'] = labeled.fit_transform(X['bat_team'])          # Applied fit_transform method
X['bat_team'].head(10)                                        # Viewed first ten rows

0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: bat_team, dtype: int32

In [13]:
X['bowl_team'] = labeled.fit_transform(X['bowl_team'])
X['bowl_team'].head(10)

0    12
1    12
2    12
3    12
4    12
5    12
6    12
7    12
8    12
9    12
Name: bowl_team, dtype: int32

In [14]:
X['bowler'] = labeled.fit_transform(X['bowler'])
X['bowler'].head(10)

0    201
1    201
2    201
3    201
4    201
5    201
6    201
7    328
8    328
9    328
Name: bowler, dtype: int32

In [15]:
X['batsman'] = labeled.fit_transform(X['batsman'])
X['batsman'].head(10)

0    328
1     61
2     61
3     61
4     61
5     61
6     61
7     61
8     61
9     61
Name: batsman, dtype: int32

In [16]:
X.head(10)

Unnamed: 0,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker
0,6,12,328,201,1,0,0.1,1,0,0,0
1,6,12,61,201,1,0,0.2,1,0,0,0
2,6,12,61,201,2,0,0.2,2,0,0,0
3,6,12,61,201,2,0,0.3,2,0,0,0
4,6,12,61,201,2,0,0.4,2,0,0,0
5,6,12,61,201,2,0,0.5,2,0,0,0
6,6,12,61,201,3,0,0.6,3,0,0,0
7,6,12,61,328,3,0,1.1,3,0,0,0
8,6,12,61,328,7,0,1.2,7,0,4,0
9,6,12,61,328,11,0,1.3,11,0,8,0


# 2--) Splitting and Scaling

In [17]:
from sklearn.model_selection import train_test_split                    # Imported the necessary function

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)    # Splitted into training and testing data

In [19]:
X.head()

Unnamed: 0,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker
0,6,12,328,201,1,0,0.1,1,0,0,0
1,6,12,61,201,1,0,0.2,1,0,0,0
2,6,12,61,201,2,0,0.2,2,0,0,0
3,6,12,61,201,2,0,0.3,2,0,0,0
4,6,12,61,201,2,0,0.4,2,0,0,0


In [20]:
from sklearn.preprocessing import StandardScaler                # Imported the StandardScaler

In [21]:
scaler = StandardScaler()                                       # Instantiated the StandardScaler

In [22]:
X_train = scaler.fit_transform(X_train)                         # fitted and transformed our training data

In [23]:
X_test = scaler.transform(X_test)                               # Transformed our testing data

# 3--) Training based on training data 

In [24]:
from sklearn.neighbors import KNe              # Importing RandomForestRegressor

In [25]:
model=RandomForestRegressor()                                   # Instantiating RandomForestRegressor

In [26]:
model.fit(X_train,y_train)                                      # Fitting on our training data

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

# Testing model based on testing data 

In [27]:
model.score(X_test,y_test)                                      # Checking the accuracy of the model

0.9056668735859524

In [28]:
X_dataframe = X_train.tolist()                                  # Converting numpy array to list

In [29]:
X_dataframe = pd.DataFrame(X_train)                             # Coverting list to dataframe

In [30]:
feature_important = model.feature_importances_
feature_important
total = sum(feature_important)
new = [value * 100 / total for value in feature_important]
new = np.round(new,2)
keys = list(X_dataframe.columns)
feature_importances = pd.DataFrame()
feature_importances['Features'] = keys
feature_importances['Importance (%)'] = new
feature_importances = feature_importances.sort_values(['Importance (%)'],ascending=False).reset_index(drop=True)
feature_importances

Unnamed: 0,Features,Importance (%)
0,7,18.84
1,4,15.8
2,5,15.7
3,6,8.03
4,1,7.52
5,2,7.51
6,9,7.49
7,0,7.37
8,3,7.03
9,10,3.53


# Predicting on new model

In [31]:
dict2={0:[1,2,4,13,12,5,3,4,2,6],1:[4,3,11,14,1,10,1,5,13,2],2:[12,43,78,99,2,71,33,123,172,111],3:[21,88,65,69,33,34,100,200,126,220],4:[69,59,12,10,22,24,59,120,90,87],5:[1,3,5,7,9,0,2,4,6,8],6:[0.2,0.4,19.1,15.1,12.3,16.5,12.1,9.5,2.1,1.5],7:[55,10,21,43,22,33,44,55,11,10],8:[2,1,0,3,1,3,4,1,3,4],9:[20,0,11,33,14,99,12,23,4,55],10:[44,122,32,0,54,21,2,4,55,60]}

In [32]:
data2=pd.DataFrame(dict2)            

In [33]:
model.predict(data2)                                  # Predicting output on new dataset

array([248.72, 234.89, 248.33, 249.75, 248.33, 247.88, 249.64, 248.33,
       243.07, 251.1 ])