# Data Preprocessing

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
# Importing Dataset
# Load the manually compiled IPL 2024 match data; each match is stored as two
# consecutive rows, one from each team's point of view (see the preview below).
# NOTE: this is a Colab-specific absolute path - adjust it when running elsewhere.
data = pd.read_csv('/content/IPL_team_prediction_data.csv')
data.head()

Unnamed: 0,current_team,choose,venue,scored,conceived,result
0,RCB,bat,away,173,176,loss
1,CSK,bowl,home,176,173,won
2,PBKS,bowl,home,177,174,won
3,DC,bat,away,174,177,loss
4,SRH,bowl,away,204,208,loss


In [None]:
# List every distinct team in the dataset, in order of first appearance
all_teams = data['current_team'].drop_duplicates().tolist()
all_teams

['RCB', 'CSK', 'PBKS', 'DC', 'SRH', 'KKR', 'RR', 'LSG', 'MI', 'GT']

So some teams have indeed played fewer matches than others; this is with respect to the IPL 2024 data up to **24th May 2024.**

As the data was created by me manually, it doesn't have any outliers or missing values. So let's move on to the next steps of data preprocessing.

## Feature engineering

### Aggregating single match results in single row

Creating an additional dataframe that shows the aggregate info of both teams of a match in a single row.
The 'choose' column is dropped here, as we won't need it from now on.

In [None]:
# Rows (i, i + 1) of `data` describe the same match from each team's side.
# Build one record per complete pair; a trailing unpaired row is ignored.
rows = [
    {
        'current_team': data.iloc[i]['current_team'],
        'opp_team': data.iloc[i + 1]['current_team'],
        'venue': data.iloc[i]['venue'],
        'scored': data.iloc[i]['scored'],
        'conceived': data.iloc[i]['conceived'],
        'result': data.iloc[i]['result'],
    }
    for i in range(0, len(data) - 1, 2)
]

data_dub = pd.DataFrame(rows)
print(data_dub.head())

  current_team opp_team venue  scored  conceived result
0          RCB      CSK  away     173        176   loss
1         PBKS       DC  home     177        174    won
2          SRH      KKR  away     204        208   loss
3           RR      LSG  away     193        173    won
4           MI       GT  away     162        168   loss


This would naturally halve the length of our original dataframe

In [None]:
# Sanity check: the paired frame should hold half as many rows as the raw one
print(f'Length of original dataframe: {len(data)}')
print(f'Length of new dataframe: {len(data_dub)}')

Length of original dataframe: 138
Length of new dataframe: 69


### Interchanging the team names

Now, let's logically interchange the teams so that we create a more robust ML model. This gives us a reliable prediction even when the user swaps the order of the team names in the input.

In [None]:
# For every match emit two rows: the original one, and its mirror image seen
# from the opponent's side (teams swapped, venue flipped, runs swapped, result flipped).
robust_columns = ['current_team', 'opp_team', 'venue', 'scored', 'conceived', 'result']
rows = []
for _, match in data_dub.iterrows():
  mirrored_venue = 'away' if match['venue'] == 'home' else 'home'
  mirrored_result = 'loss' if match['result'] == 'won' else 'won'
  rows.extend([
      # row exactly as it appears in data_dub
      [match[column] for column in robust_columns],
      # same match from the opponent's point of view
      [match['opp_team'], match['current_team'], mirrored_venue,
       match['conceived'], match['scored'], mirrored_result],
  ])

data_robust = pd.DataFrame(rows, columns=robust_columns)
print(data_robust.head())

  current_team opp_team venue  scored  conceived result
0          RCB      CSK  away     173        176   loss
1          CSK      RCB  home     176        173    won
2         PBKS       DC  home     177        174    won
3           DC     PBKS  away     174        177   loss
4          SRH      KKR  away     204        208   loss


Now this new robust dataframe must have same no. of rows as original one.

In [None]:
# Sanity check: mirroring every match should restore the original row count
print(f'Length of original dataframe: {len(data)}')
print(f'Length of new robust dataframe: {len(data_robust)}')

Length of original dataframe: 138
Length of new robust dataframe: 138


### Getting avg NRR for each team as per venue

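This dataset will be used in the last step, where we adjust our predicted output (the winning percentage) with this NRR value. Note that the NRR below is computed per team over all of its matches (it is not split by venue) and assumes that every innings lasted the full 20 overs.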

In [None]:
# Per-team totals over all matches in data_robust.
matches_by_team = data_robust.groupby(['current_team'])
team_total_runs = matches_by_team['scored'].sum().reset_index()
team_total_conceded = matches_by_team['conceived'].sum().reset_index()

# Overs are approximated as 20 per match for both batting and bowling
# (i.e. every innings is treated as a full 20 overs).
team_overs_bowled = (
    matches_by_team.size()
    .reset_index(name='matches_played')
    .assign(
        overs_bowled=lambda frame: frame['matches_played'] * 20,
        overs_faced=lambda frame: frame['matches_played'] * 20,
    )
)

In [None]:
# Combine runs scored, runs conceded and overs into one row per team
overs_columns = ['current_team', 'matches_played', 'overs_bowled', 'overs_faced']
team_nrr_stats = (
    team_total_runs
    .merge(team_total_conceded, on='current_team')
    .merge(team_overs_bowled[overs_columns], on='current_team')
)

# NRR = (runs scored per over faced) - (runs conceded per over bowled)
run_rate_for = team_nrr_stats['scored'] / team_nrr_stats['overs_faced']
run_rate_against = team_nrr_stats['conceived'] / team_nrr_stats['overs_bowled']
team_nrr_stats['nrr'] = run_rate_for - run_rate_against

In [None]:
# Rank the teams from best to worst NRR (display only; team_nrr_stats itself is not reordered)
team_nrr_stats.sort_values("nrr", ascending=False)

Unnamed: 0,current_team,scored,conceived,matches_played,overs_bowled,overs_faced,nrr
3,KKR,2553,2294,13,260,260,0.996154
0,CSK,2526,2415,14,280,280,0.396429
7,RCB,2930,2820,15,300,300,0.366667
8,RR,2508,2482,14,280,280,0.092857
9,SRH,2764,2763,14,280,280,0.003571
4,LSG,2483,2521,14,280,280,-0.135714
2,GT,2040,2101,12,240,240,-0.254167
5,MI,2568,2660,14,280,280,-0.328571
6,PBKS,2487,2614,14,280,280,-0.453571
1,DC,2573,2762,14,280,280,-0.675


In [None]:
# Inspect the NRR row of a single team (KKR)
team_nrr_stats[team_nrr_stats['current_team'].eq('KKR')]

Unnamed: 0,current_team,scored,conceived,matches_played,overs_bowled,overs_faced,nrr
3,KKR,2553,2294,13,260,260,0.996154


## Encoding Categorical features

Label Encoding: **Venue**

OneHotEncoding: **Current Team**, **Opp Team**

Regular value replacement method: **Result**

In [None]:
# NOTE: this cell overwrites data_robust's 'venue' and 'result' columns in place,
# so it is meant to run once on a freshly built data_robust.

# Result -> 1 (won) / 0 (loss). `Series.map` is used instead of `Series.replace`:
# replacing strings with ints relies on pandas silently downcasting the object
# column, which is deprecated (FutureWarning since pandas 2.2). The mapping is
# validated *before* the encoder below is refitted, so an accidental re-run on
# already-encoded data fails here without corrupting `le`.
result_codes = data_robust['result'].map({'won': 1, 'loss': 0})
if result_codes.isna().any():
  raise ValueError("Unexpected value in 'result'; expected only 'won' or 'loss' "
                   "(has this cell already been run on data_robust?)")

# Venue -> integer label. LabelEncoder sorts its classes, so 'away' -> 0 and 'home' -> 1.
le = LabelEncoder()
data_robust['venue'] = le.fit_transform(data_robust['venue'])
data_robust['result'] = result_codes.astype('int64')

# One-hot encode both team columns; the remaining columns pass through unchanged.
encoded_columns = pd.get_dummies(data=data_robust, columns=['current_team','opp_team'],dtype=int)
encoded_columns.columns

Index(['venue', 'scored', 'conceived', 'result', 'current_team_CSK',
       'current_team_DC', 'current_team_GT', 'current_team_KKR',
       'current_team_LSG', 'current_team_MI', 'current_team_PBKS',
       'current_team_RCB', 'current_team_RR', 'current_team_SRH',
       'opp_team_CSK', 'opp_team_DC', 'opp_team_GT', 'opp_team_KKR',
       'opp_team_LSG', 'opp_team_MI', 'opp_team_PBKS', 'opp_team_RCB',
       'opp_team_RR', 'opp_team_SRH'],
      dtype='object')

In [None]:
# Preview the one-hot encoded training frame
encoded_columns.head()

Unnamed: 0,venue,scored,conceived,result,current_team_CSK,current_team_DC,current_team_GT,current_team_KKR,current_team_LSG,current_team_MI,...,opp_team_CSK,opp_team_DC,opp_team_GT,opp_team_KKR,opp_team_LSG,opp_team_MI,opp_team_PBKS,opp_team_RCB,opp_team_RR,opp_team_SRH
0,0,173,176,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,176,173,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,177,174,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,174,177,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,204,208,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Keep only the model inputs and the label, in a fixed order:
# current-team dummies, opponent-team dummies, venue, result.
# Selecting the wanted columns already leaves out 'scored' and 'conceived', so no
# separate in-place drop is needed - which also keeps this cell safe to re-run
# (an in-place drop raises KeyError once the columns are gone).
model_columns = ['current_team_CSK',
       'current_team_DC', 'current_team_GT', 'current_team_KKR',
       'current_team_LSG', 'current_team_MI', 'current_team_PBKS',
       'current_team_RCB', 'current_team_RR', 'current_team_SRH',
       'opp_team_CSK', 'opp_team_DC', 'opp_team_GT', 'opp_team_KKR',
       'opp_team_LSG', 'opp_team_MI', 'opp_team_PBKS', 'opp_team_RCB',
       'opp_team_RR', 'opp_team_SRH','venue', 'result']
encoded_columns = encoded_columns[model_columns]

encoded_columns.head()

Unnamed: 0,current_team_CSK,current_team_DC,current_team_GT,current_team_KKR,current_team_LSG,current_team_MI,current_team_PBKS,current_team_RCB,current_team_RR,current_team_SRH,...,opp_team_GT,opp_team_KKR,opp_team_LSG,opp_team_MI,opp_team_PBKS,opp_team_RCB,opp_team_RR,opp_team_SRH,venue,result
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


Now this is our Final dataframe which is ready for fitting into our ML model



# Training the model

We won't split our dataset here because this won't be the final step of the model preparation.
We would be coupling this with a NRR formula

In [None]:
# Features are every column except the last; the target is the last column ('result')
feature_frame = encoded_columns.iloc[:, :-1]
target_series = encoded_columns.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
# Standardise the input features; the fitted scaler `sc` is reused later
# for the test set and for single predictions.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_scaled = sc.transform(X)

In [None]:
# Fit a logistic regression (default hyper-parameters) on the full scaled training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_scaled, y)
classifier

# Calculating Accuracy of the model

The accuracy of the model is tested on a random set of matches taken from the second half of the tournament, i.e. the most recently played matches. Why?

That's because the NRR is calculated from the match data we fed to the model, so it reflects each team's current performance. If only the first few matches' data had been available, the model would instead have predicted well on those initial matches.

In [None]:
# Load the prepared test dataset: matches from the 2nd half of the tournament,
# chosen in accordance with the reasoning given in the markdown above.
# This would ensure proper testing of the model.
# NOTE: this is a Colab-specific absolute path - adjust it when running elsewhere.
data_test = pd.read_csv('/content/IPL_team_predictor_test.csv')
data_test.head()

Unnamed: 0,current_team,opp_team,venue,result
0,CSK,SRH,away,loss
1,RCB,RR,away,loss
2,MI,RR,away,loss
3,DC,GT,home,won
4,KKR,PBKS,home,loss


In [None]:
# Number of matches in the test dataset
print(f'Length of Test Dataset:  {len(data_test)}')

Length of Test Dataset:  20


Encoding the categorical variables the same way as in the training dataset. Here we use manual one-hot encoding, because we don't want the encoding to depend on the diversity of teams in our test dataset (which we know is lower).

In [None]:
# Encode the test set exactly like the training data: reuse the fitted venue
# LabelEncoder and the same won/loss -> 1/0 mapping.
# NOTE: this mutates data_test in place, so run it once per freshly loaded data_test.
data_test['venue'] = le.transform(data_test['venue'])
# `map` instead of `replace`: avoids pandas' deprecated silent object->int
# downcasting and lets unexpected labels be detected explicitly.
test_result_codes = data_test['result'].map({'won': 1, 'loss': 0})
if test_result_codes.isna().any():
  raise ValueError("Unexpected value in test 'result'; expected only 'won' or 'loss'")
data_test['result'] = test_result_codes.astype('int64')

# Fixed one-hot layout, independent of which teams happen to appear in the test set.
# All column names are derived from this single team list so they cannot drift apart.
known_teams = ['CSK', 'DC', 'GT', 'KKR', 'LSG', 'MI', 'PBKS', 'RCB', 'RR', 'SRH']
team_to_column = {team: f'current_team_{team}' for team in known_teams}
opp_team_to_column = {team: f'opp_team_{team}' for team in known_teams}
dummy_columns = [*team_to_column.values(), *opp_team_to_column.values(), 'venue', 'result']

# Fail loudly on a team without a one-hot column; silently skipping it would
# leave an all-zero encoding for that row.
unknown_teams = (set(data_test['current_team']) | set(data_test['opp_team'])) - set(known_teams)
if unknown_teams:
  raise ValueError('Test data contains teams without a one-hot column: {}'.format(
      sorted(map(str, unknown_teams))))

# Build the encoded frame on data_test's own index so the venue/result
# assignments below align row-for-row.
encoded_df = pd.DataFrame(0, index=data_test.index, columns=dummy_columns)
for team in known_teams:
  encoded_df[team_to_column[team]] = (data_test['current_team'] == team).astype('int64')
  encoded_df[opp_team_to_column[team]] = (data_test['opp_team'] == team).astype('int64')

encoded_df['venue'] = data_test['venue']
encoded_df['result'] = data_test['result']
print(encoded_df.head())

   current_team_CSK  current_team_DC  current_team_GT  current_team_KKR  \
0                 1                0                0                 0   
1                 0                0                0                 0   
2                 0                0                0                 0   
3                 0                1                0                 0   
4                 0                0                0                 1   

   current_team_LSG  current_team_MI  current_team_PBKS  current_team_RCB  \
0                 0                0                  0                 0   
1                 0                0                  0                 1   
2                 0                1                  0                 0   
3                 0                0                  0                 0   
4                 0                0                  0                 0   

   current_team_RR  current_team_SRH  ...  opp_team_GT  opp_team_KKR  \
0             

In [None]:
# Put the test columns in the same order as the training frame:
# current-team dummies, opponent-team dummies, venue, result.
team_codes = ['CSK', 'DC', 'GT', 'KKR', 'LSG', 'MI', 'PBKS', 'RCB', 'RR', 'SRH']
ordered_columns = (
    [f'current_team_{code}' for code in team_codes]
    + [f'opp_team_{code}' for code in team_codes]
    + ['venue', 'result']
)
encoded_df = encoded_df.loc[:, ordered_columns]
print(encoded_df.columns)

Index(['current_team_CSK', 'current_team_DC', 'current_team_GT',
       'current_team_KKR', 'current_team_LSG', 'current_team_MI',
       'current_team_PBKS', 'current_team_RCB', 'current_team_RR',
       'current_team_SRH', 'opp_team_CSK', 'opp_team_DC', 'opp_team_GT',
       'opp_team_KKR', 'opp_team_LSG', 'opp_team_MI', 'opp_team_PBKS',
       'opp_team_RCB', 'opp_team_RR', 'opp_team_SRH', 'venue', 'result'],
      dtype='object')


Calculating Win percentage: It is done in two stages. First is calculated based on our logistic regression output (which, in intuitive terms predicts on the basis of past records).

Second, we adjust this probability using our formula.

**Difference in %** = `(Opposition team NRR - Current team NRR)/Current team NRR`

Basically, this is a comparative measure of performance between the two teams.

Now, we scale our predicted probability by this percentage, i.e. `final = predicted + predicted × (Difference in % / 100)`, to get the final winning percentage.

In [None]:
# Feature columns in the exact order the scaler and classifier were fitted on:
# every training column except the trailing 'result' label. Selecting them by
# name (rather than dropping the last position of each row) keeps this cell
# correct when it is re-run after extra columns such as 'win_percentage' have
# been appended to encoded_df.
feature_columns = list(encoded_columns.columns[:-1])
current_team_columns = [col for col in feature_columns if col.startswith('current_team_')]
opp_team_columns = [col for col in feature_columns if col.startswith('opp_team_')]

# Per-team NRR lookup (team_nrr_stats holds one row per team).
nrr_by_team = team_nrr_stats.set_index('current_team')['nrr']


def _one_hot_team(row, columns):
  """Return the team name whose one-hot column among `columns` is set in `row`.

  Raises ValueError when zero or several of the columns are active, since the
  row would then not identify a single team.
  """
  active = [col for col in columns if row[col] == 1]
  if len(active) != 1:
    raise ValueError('Expected exactly one active team column, found: {}'.format(active))
  return active[0].split('_')[-1]


# Stage 1: logistic-regression probability that the current team wins (class 1),
# computed for all test rows in one call.
test_features = encoded_df[feature_columns].to_numpy(dtype=float)
base_probabilities = classifier.predict_proba(sc.transform(test_features))[:, 1]

# Stage 2: adjust each probability by the relative NRR difference of the two teams.
win_percentage = list()
for (_, row), win_probability in zip(encoded_df.iterrows(), base_probabilities):
  team1_nrr = nrr_by_team[_one_hot_team(row, current_team_columns)]
  team2_nrr = nrr_by_team[_one_hot_team(row, opp_team_columns)]

  # NOTE(review): formula kept exactly as documented in the markdown above.
  # It divides by the current team's NRR, so the adjustment changes sign for a
  # negative NRR and becomes very large when that NRR is close to zero; the
  # result is not clipped to [0, 1]. Confirm this is the intended behaviour.
  diff_percent = (team2_nrr - team1_nrr)/team1_nrr
  adjusted_pred_prob = win_probability + (win_probability * (diff_percent / 100))
  # Store a plain float (not a 1-element array) so the column is numeric.
  win_percentage.append(float(adjusted_pred_prob))

In [None]:
# Attach the NRR-adjusted win probabilities (one entry per test row) as a new column
encoded_df['win_percentage'] = win_percentage

In [None]:
# Preview the test frame together with the new 'win_percentage' column
encoded_df.head()

Unnamed: 0,current_team_CSK,current_team_DC,current_team_GT,current_team_KKR,current_team_LSG,current_team_MI,current_team_PBKS,current_team_RCB,current_team_RR,current_team_SRH,...,opp_team_KKR,opp_team_LSG,opp_team_MI,opp_team_PBKS,opp_team_RCB,opp_team_RR,opp_team_SRH,venue,result,win_percentage
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,[0.3505665688208955]
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,[0.2942280216500937]
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,[0.17447939041230273]
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,[0.6849243583452609]
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,[0.856985478287633]


## Getting Accuracy

Rounding off the win_percentage value to nearest whole number to compare it against actual value

In [None]:
def _to_win_label(probability):
  """Map an adjusted win probability to a class label: 1 if >= 0.5, otherwise 0."""
  if probability >= 0.5:
    return 1
  return 0


# Threshold the adjusted probability at 0.5 to obtain the predicted outcome
encoded_df['win_prediction'] = encoded_df['win_percentage'].apply(_to_win_label)
encoded_df.head()

Unnamed: 0,current_team_CSK,current_team_DC,current_team_GT,current_team_KKR,current_team_LSG,current_team_MI,current_team_PBKS,current_team_RCB,current_team_RR,current_team_SRH,...,opp_team_LSG,opp_team_MI,opp_team_PBKS,opp_team_RCB,opp_team_RR,opp_team_SRH,venue,result,win_percentage,win_prediction
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,[0.3505665688208955],0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,[0.2942280216500937],0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,[0.17447939041230273],0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,[0.6849243583452609],1
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,[0.856985478287633],1


In [None]:
# Flag each test row as correctly (True) or incorrectly (False) predicted
is_correct = encoded_df['result'].eq(encoded_df['win_prediction'])
encoded_df['prediction_accuracy'] = is_correct
print(encoded_df['prediction_accuracy'].value_counts())

prediction_accuracy
True     15
False     5
Name: count, dtype: int64


Getting the Accuracy of the model on the test dataset

In [None]:
# Accuracy = share of correctly predicted rows. The mean of the boolean column is
# used instead of value_counts()[True], which raises KeyError when no prediction
# is correct (there is then no True entry to look up).
accuracy_percent = encoded_df['prediction_accuracy'].mean() * 100
print('Accuracy= {}%'.format(accuracy_percent))

Accuracy= 75.0%


So that's Great Accuracy for a Machine learning model predicting Sports outcome which is highly randomized 😲

# Predicting single results

Let's predict the result of the next match, SRH vs RR on 24th May, whose result even I don't know. It would be interesting to see the model's answer on this.

As this model doesn't support neutral-venue input (it might be supported in a future release 😉), let's treat the venue of this match, Chepauk, as a home ground for SRH, since they are comparatively nearer to it.

Enter the values in `sample_input` as: team1, team2, venue of team1 (which can only be `home` or `away`). In case of a neutral venue, favour the team whose home state is geographically nearest to it.

In [None]:
# sample_input = [team1, team2, venue of team1 ('home' or 'away')]
sample_input = ['SRH','RR','home']
# Encode the venue with the LabelEncoder fitted on the training data
venue_encoded = le.transform([sample_input[2]])[0]

In [None]:
# Team dummy columns of the training frame (everything except 'venue' and 'result')
dummy_columns = [col for col in encoded_columns.columns if col not in ['venue','result']]

current_column = f'current_team_{sample_input[0]}'
opp_column = f'opp_team_{sample_input[1]}'
# Fail loudly on an unknown team name. Without this check, assigning to a
# non-existent column would create a brand-new column that does not belong to
# the model's feature layout, and the prediction would silently be made on a
# wrong encoding.
missing_columns = [col for col in (current_column, opp_column) if col not in dummy_columns]
if missing_columns:
  raise ValueError('Unknown team in sample_input; no such feature column(s): {}'.format(missing_columns))

# Single-row frame: all zeros except the two selected team dummies
encoded_input_df = pd.DataFrame(0, index=[0], columns=dummy_columns)
encoded_input_df[current_column] = 1
encoded_input_df[opp_column] = 1

# 'venue' goes last, matching the feature order used for training
encoded_input_df['venue'] = venue_encoded
dummy_columns.append('venue')
encoded_input_df = encoded_input_df[dummy_columns]

# Scale with the scaler fitted on the training features
input_data = encoded_input_df.values
input_data = sc.transform(input_data)

In [None]:
# Stage 1: model probability that team 1 (sample_input[0]) wins
win_probability = classifier.predict_proba(input_data)[:, 1]

# Stage 2: look up both teams' NRR and apply the relative-difference adjustment
team_names = team_nrr_stats['current_team']
team1_nrr = team_nrr_stats.loc[team_names.eq(sample_input[0]), "nrr"].to_numpy()[0]
team2_nrr = team_nrr_stats.loc[team_names.eq(sample_input[1]), "nrr"].to_numpy()[0]
nrr_gap = team2_nrr - team1_nrr
diff_percent = nrr_gap/team1_nrr
adjusted_pred_prob = win_probability + (win_probability * (diff_percent / 100))

In [None]:
# adjusted_pred_prob is the (1-element) probability that team 1 wins.
team1_win_prob = float(np.ravel(adjusted_pred_prob)[0])
if team1_win_prob >= 0.5:
  winner, winner_prob = sample_input[0], team1_win_prob
else:
  winner, winner_prob = sample_input[1], 1 - team1_win_prob

# Scale to percent *before* rounding: round(p, 4) * 100 reintroduces binary
# floating-point noise (e.g. 56.779999999999994 instead of 56.78).
# NOTE(review): the NRR adjustment is not bounded, so this value is not
# guaranteed to lie within 0-100.
print('{} Wins this match with {}% chance!!'.format(winner, round(winner_prob * 100, 2)))

SRH Wins this match with 62.21% chance!!


Oh, interesting... It predicts that Hyderabad will proceed to the finals!! 🥳

Well, we'll see tomorrow whether it indeed gets it right.

📢 **Update**: SRH actually wins today's match. Hence our model passes its first real life test 🎉