In [26]:
# Install LIBRARIES if not present
#! pip install requirements.txt

In [27]:
import pandas as pd
import lxml
import numpy as np

# Data Collection

## A. Scraping Player Statistics

### 1) Batting Statistics

In [28]:
batting =pd.DataFrame()
for i in range(1,10):
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};orderby=runs;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=batting'
    temp = pd.read_html(url)[2]
    batting=pd.concat([batting,temp])
    

batting=batting.drop(columns=['Span','Inns','NO','HS','BF',0])


batting=batting.rename(columns={'Runs' : 'Total Runs Scored',
                        'Ave' : 'Batting Avg',
                        'SR' : 'Batting Strike Rate',
                        '100': 'Total Centuries',
                        '50':'Total Half Centuries'
    })

batting.shape

(158, 11)

### 2) Bowling Statistics

In [29]:
bowling =pd.DataFrame()
for i in range(1,10):
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};orderby=wickets;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=bowling'
    temp = pd.read_html(url)[2]
    bowling=pd.concat([bowling,temp])


bowling=bowling.drop(columns=['Overs','Span','Inns','Mdns','Runs','4','5',0])
bowling=bowling.rename(columns={'Wkts': 'Total Wickets Taken',
                    'Ave' : 'Bowling Avg',
                    'BBI':'Best Bowling Figures',
                    'Econ':'Best Economy Rate',
                    'SR':'Bowling Strike Rate'

})
bowling.dropna(axis=1,how='all',inplace=True)
bowling.shape

(158, 7)

### 3) Fielding Statistics

In [30]:
fielding=pd.DataFrame()

for i in range(1,10):
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};orderby=dismissals;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=fielding'
    temp= pd.read_html(url)[2]
    fielding=pd.concat([fielding,temp])


fielding=fielding.drop(columns=['Span','Inns','Ct Wk','Ct Fi','MD','D/I',0])
fielding=fielding.rename(columns={'Mat':'Total Matches Played','Ct':'Total Catches Taken','St':'Total Stumpings Made','Dis':'Total Dismissals Made'})
fielding.dropna(axis=1,how='all',inplace=True)
fielding.shape

(158, 5)

In [31]:
players = pd.merge(batting, bowling, on='Player', how='inner')
players = pd.merge(players,fielding, on='Player', how='inner')
players.dropna(axis=1,how='all',inplace=True)


players = players.drop(columns=['Mat_x','Mat_y','0'])

##Adding Player Id
players['Player ID']=players.index
players = players[['Player ID'] + [col for col in players.columns if col != 'Player ID']]
players.shape

(278, 18)

## B. Adding Playing Styles

### 1) Type of Batter (by hand)

In [32]:
batting_hand=pd.DataFrame()
for i in range(1,5):
    ##Right Hand Batsmen
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?batting_hand=1;class=2;filter=advanced;page={i};orderby=runs;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=batting'
    temp= pd.read_html(url)[2]
    temp['Batting Hand']='Right Hand'
    batting_hand=pd.concat([batting_hand,temp])
    ##Left Hand Batsmen
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?batting_hand=2;class=2;filter=advanced;page={i};orderby=runs;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=batting'
    temp= pd.read_html(url)[2]
    temp['Batting Hand']='Left Hand'
    batting_hand=pd.concat([batting_hand,temp])


### 2) Type of Bowler (by arm)

In [33]:
bowling_arm=pd.DataFrame()
for i in range(1,5):
    ## Right Arm Bowler
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_hand=1;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Arm']='Right Arm'
    bowling_arm=pd.concat([bowling_arm,temp])

    ## Left Arm Bowler
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_hand=2;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Arm']='Left Arm'
    bowling_arm=pd.concat([bowling_arm,temp])

    #Unknoown Arm Bowler
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_hand=3;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Arm']='Unknown Arm'
    bowling_arm=pd.concat([bowling_arm,temp])



### 3) Bowling Style

In [34]:
bowling_style=pd.DataFrame()

for i in range(1,5):
    ## Pace Bowler
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_pacespin=1;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Style']='Pace Bowler'
    bowling_style=pd.concat([bowling_style,temp])

    ##Spin Bowler
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_pacespin=2;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Style']='Spin Bowler'
    bowling_style=pd.concat([bowling_style,temp])
    ##Mixture
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?bowling_pacespin=3;class=2;filter=advanced;page={i};orderby=matches;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround'
    temp= pd.read_html(url)[2]
    temp['Bowling Style']='Mixed'
    bowling_style=pd.concat([bowling_style,temp])

In [35]:

style=pd.merge(batting_hand,bowling_arm,on='Player',how='inner')
style=pd.merge(style,bowling_style,on='Player',how='inner')
style=style[['Player','Batting Hand','Bowling Arm','Bowling Style']]
style.head()

Unnamed: 0,Player,Batting Hand,Bowling Arm,Bowling Style
0,V Kohli,Right Hand,Right Arm,Pace Bowler
1,MS Dhoni,Right Hand,Right Arm,Pace Bowler
2,RG Sharma,Right Hand,Right Arm,Spin Bowler
3,SR Tendulkar,Right Hand,Right Arm,Spin Bowler
4,V Sehwag,Right Hand,Right Arm,Spin Bowler


In [36]:
players=pd.merge(players,style,on='Player',how='inner')
players=players[players['Player'].isna()==False]
players.head()

Unnamed: 0,Player ID,Player,Total Runs Scored,Batting Avg,Batting Strike Rate,Total Centuries,Total Half Centuries,4s,6s,Total Wickets Taken,...,Bowling Avg,Best Economy Rate,Bowling Strike Rate,Total Matches Played,Total Dismissals Made,Total Catches Taken,Total Stumpings Made,Batting Hand,Bowling Arm,Bowling Style
0,0,V Kohli,13437,58.16,93.69,48,69,1255,148,4,...,166.75,6.21,161.0,286.0,150.0,150.0,0.0,Right Hand,Right Arm,Pace Bowler
1,1,MS Dhoni,10599,50.23,87.13,9,73,809,222,1,...,31.0,5.16,36.0,347.0,438.0,318.0,120.0,Right Hand,Right Arm,Pace Bowler
2,2,RG Sharma,10423,49.16,91.4,31,53,961,309,8,...,64.37,5.21,74.1,256.0,92.0,92.0,0.0,Right Hand,Right Arm,Spin Bowler
3,3,SR Tendulkar,9855,46.7,85.75,25,52,1162,84,76,...,41.09,5.32,46.2,234.0,65.0,65.0,0.0,Right Hand,Right Arm,Spin Bowler
4,4,Yuvraj Singh,8609,36.47,87.43,14,52,896,153,110,...,38.42,5.08,45.3,301.0,93.0,93.0,0.0,Left Hand,Left Arm,Spin Bowler


In [37]:
players.to_csv('Players.csv')  

## B. Scraping Match Conditions

### 1) Day Vs Night Match

In [38]:
matches1=pd.DataFrame()

for i in range(1,10):
    url= f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};floodlit=1;orderby=start;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=team;view=results'
    temp= pd.read_html(url)[2]
    temp['Day/Night Match']='Day'
    matches1=pd.concat([matches1,temp])
    ##For night matches
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};floodlit=2;orderby=start;orderbyad=reverse;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=team;view=results'
    temp=pd.read_html(url)[2]
    temp['Day/Night Match']='Night'
    matches1=pd.concat([matches1,temp])


matches1=matches1[matches1['Team'].isna()==False]
matches1=matches1.dropna(axis=1)
matches1.head()

Unnamed: 0,Team,Result,Margin,Toss,Bat,Opposition,Ground,Start Date,Day/Night Match
0,India,lost,32 runs,won,2nd,v Pakistan,Hobart,21 Jan 2000,Day
1,India,lost,4 wickets,won,1st,v Australia,Perth,30 Jan 2000,Day
2,India,won,3 wickets,lost,2nd,v South Africa,Kochi,9 Mar 2000,Day
3,India,won,6 wickets,lost,2nd,v South Africa,Jamshedpur,12 Mar 2000,Day
4,India,lost,2 wickets,lost,1st,v South Africa,Faridabad,15 Mar 2000,Day


### 2) Home VS Away Vs Neutral Match

In [39]:
matches2=pd.DataFrame()

for i in range(1,10):
    ##Home Matches
    url= f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};home_or_away=1;orderby=start;orderbyad=reverse;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=team;view=results'
    temp = pd.read_html(url)[2]
    temp['Home/Away/Neutral']='Home'
    matches2=pd.concat([matches2,temp])
    ##Away Matches
    url = f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};home_or_away=2;orderby=start;orderbyad=reverse;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=team;view=results'
    temp = pd.read_html(url)[2]
    temp['Home/Away/Neutral']='Away'
    matches2=pd.concat([matches2,temp])
    #Neutral Matches
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};home_or_away=3;orderby=start;orderbyad=reverse;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=team;view=results'
    temp = pd.read_html(url)[2]
    temp['Home/Away/Neutral']='Neutral'
    matches2=pd.concat([matches2,temp])

## Conacatenating all 
matches2=matches2.dropna(axis=1,how='all')
matches2=matches2[matches2['Team'].isna()==False]
matches2.head()

Unnamed: 0,Team,Result,Margin,BR,Toss,Bat,Opposition,Ground,Start Date,Home/Away/Neutral,0
0,India,won,4 wickets,12.0,won,2nd,v New Zealand,Dharamsala,22 Oct 2023,Home,
1,India,won,7 wickets,51.0,lost,2nd,v Bangladesh,Pune,19 Oct 2023,Home,
2,India,won,7 wickets,117.0,won,2nd,v Pakistan,Ahmedabad,14 Oct 2023,Home,
3,India,won,8 wickets,90.0,lost,2nd,v Afghanistan,Delhi,11 Oct 2023,Home,
4,India,won,6 wickets,52.0,lost,2nd,v Australia,Chennai,8 Oct 2023,Home,


In [40]:
matches=pd.merge(matches1,matches2,on=['Team','Opposition','Ground','Start Date','Toss','Result','Bat'],how='inner')
matches=matches.drop(columns=['Margin_x','Margin_y','Team','BR'])

##Adding Match Id
matches['Match ID']=matches.index
matches = matches[['Match ID'] + [col for col in matches.columns if col != 'Match ID']]

# Create the 'Toss Decision' column based on 'Bat'
matches['Toss Decision'] = matches['Bat'].apply(lambda x: 'Bat' if x == '1st' else 'Field')
matches.drop(columns=['Bat',0],inplace=True)
matches.head()

Unnamed: 0,Match ID,Result,Toss,Opposition,Ground,Start Date,Day/Night Match,Home/Away/Neutral,Toss Decision
0,0,lost,won,v Pakistan,Hobart,21 Jan 2000,Day,Neutral,Field
1,1,lost,won,v Australia,Perth,30 Jan 2000,Day,Away,Bat
2,2,won,lost,v South Africa,Kochi,9 Mar 2000,Day,Home,Field
3,3,won,lost,v South Africa,Jamshedpur,12 Mar 2000,Day,Home,Field
4,4,lost,lost,v South Africa,Faridabad,15 Mar 2000,Day,Home,Bat


## C. Scraping Player Participation in Each Match

In [41]:


plays = pd.DataFrame()
for i in range(1,36):
    url=f'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;filter=advanced;page={i};orderby=start;page={i};size=200;spanmax1=31+Dec+2023;spanmin1=01+Jan+2000;spanval1=span;team=6;template=results;type=allround;view=match'
    temp= pd.read_html(url)[2]
    plays=pd.concat([plays,temp])

plays.dropna(axis=1,how='all',inplace=True)
plays.rename(columns={ "Bat1": "Runs Scored in Match",
                      "Wkts": "Wickets Taken in Match",
                      "Conc":"Runs Conceded in Match",
                      "Ct": "Catches Taken in Match",
                      "St": "Stumpings Made in Match"
},inplace=True)
# Create the 'Not Out' column

def remove_asterisk(score):
    return score.rstrip('*')

def is_not_out(score):
    return score.endswith('*')

plays['Not Out'] = plays['Runs Scored in Match'].str.endswith('*')

# Create the 'Did Not Bat' column
plays['Did Not Bat'] = (plays['Runs Scored in Match'] == 'DNB')

# Remove asterisk from 'Runs Scored in Match'
plays['Runs Scored in Match'] = plays['Runs Scored in Match'].apply(remove_asterisk)

# Convert 'Runs Scored' to None if 'Did Not Bat' is True
plays['Runs Scored in Match'] = plays.apply(lambda row: np.nan if row['Did Not Bat'] else row['Runs Scored in Match'], axis=1)

plays.head()


Unnamed: 0,Player,Runs Scored in Match,Wickets Taken in Match,Runs Conceded in Match,Catches Taken in Match,Stumpings Made in Match,Opposition,Ground,Start Date,Not Out,Did Not Bat
0,AB Agarkar,2,2,39,0,0,v Pakistan,Brisbane,10 Jan 2000,False,False
1,SS Dighe,6,-,-,1,0,v Pakistan,Brisbane,10 Jan 2000,False,False
2,R Dravid,8,-,-,1,0,v Pakistan,Brisbane,10 Jan 2000,False,False
3,SC Ganguly,61,1,26,0,0,v Pakistan,Brisbane,10 Jan 2000,False,False
4,HH Kanitkar,0,-,-,0,0,v Pakistan,Brisbane,10 Jan 2000,False,False


## D. Merging all Dataframes

In [42]:
temp = plays.join(players.set_index("Player"),on="Player",how='inner')
df = temp.join(matches.set_index(["Opposition","Ground","Start Date"]),on=["Opposition","Ground","Start Date"])
df=df.set_index(["Match ID","Player ID"])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Runs Scored in Match,Wickets Taken in Match,Runs Conceded in Match,Catches Taken in Match,Stumpings Made in Match,Opposition,Ground,Start Date,Not Out,...,Total Catches Taken,Total Stumpings Made,Batting Hand,Bowling Arm,Bowling Style,Result,Toss,Day/Night Match,Home/Away/Neutral,Toss Decision
Match ID,Player ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
648,25,AB Agarkar,2.0,2,39,0,0,v Pakistan,Brisbane,10 Jan 2000,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,won,Night,Neutral,Bat
647,25,AB Agarkar,6.0,1,47,1,0,v Australia,Melbourne,12 Jan 2000,True,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,lost,Night,Away,Field
643,25,AB Agarkar,0.0,0,60,0,0,v Pakistan,Perth,28 Jan 2000,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,lost,Night,Neutral,Field
1,25,AB Agarkar,,1,39,1,0,v Australia,Perth,30 Jan 2000,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,won,Day,Away,Bat
2,25,AB Agarkar,,0,55,1,0,v South Africa,Kochi,9 Mar 2000,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,won,lost,Day,Home,Field


In [43]:
df.columns

Index(['Player', 'Runs Scored in Match', 'Wickets Taken in Match',
       'Runs Conceded in Match', 'Catches Taken in Match',
       'Stumpings Made in Match', 'Opposition', 'Ground', 'Start Date',
       'Not Out', 'Did Not Bat', 'Total Runs Scored', 'Batting Avg',
       'Batting Strike Rate', 'Total Centuries', 'Total Half Centuries', '4s',
       '6s', 'Total Wickets Taken', 'Best Bowling Figures', 'Bowling Avg',
       'Best Economy Rate', 'Bowling Strike Rate', 'Total Matches Played',
       'Total Dismissals Made', 'Total Catches Taken', 'Total Stumpings Made',
       'Batting Hand', 'Bowling Arm', 'Bowling Style', 'Result', 'Toss',
       'Day/Night Match', 'Home/Away/Neutral', 'Toss Decision'],
      dtype='object')

# Data Preprocessing

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6847 entries, (648, 25) to (59, 126)
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Player                   6847 non-null   object 
 1   Runs Scored in Match     5151 non-null   object 
 2   Wickets Taken in Match   6847 non-null   object 
 3   Runs Conceded in Match   6847 non-null   object 
 4   Catches Taken in Match   6847 non-null   object 
 5   Stumpings Made in Match  6847 non-null   object 
 6   Opposition               6847 non-null   object 
 7   Ground                   6847 non-null   object 
 8   Start Date               6847 non-null   object 
 9   Not Out                  6847 non-null   bool   
 10  Did Not Bat              6847 non-null   bool   
 11  Total Runs Scored        6847 non-null   object 
 12  Batting Avg              6847 non-null   object 
 13  Batting Strike Rate      6847 non-null   object 
 14  Total Centu

In [45]:
df.describe()

Unnamed: 0,Total Matches Played,Total Dismissals Made,Total Catches Taken,Total Stumpings Made
count,6847.0,6847.0,6847.0,6847.0
mean,150.022346,71.047028,64.268293,6.778735
std,99.02858,94.207929,71.022703,26.290434
min,1.0,0.0,0.0,0.0
25%,66.0,17.0,17.0,0.0
50%,125.0,43.0,43.0,0.0
75%,232.0,90.0,90.0,0.0
max,347.0,438.0,318.0,120.0


In [46]:
## Checking for NULLS
df.isna().sum()

Player                        0
Runs Scored in Match       1696
Wickets Taken in Match        0
Runs Conceded in Match        0
Catches Taken in Match        0
Stumpings Made in Match       0
Opposition                    0
Ground                        0
Start Date                    0
Not Out                       0
Did Not Bat                   0
Total Runs Scored             0
Batting Avg                   0
Batting Strike Rate           0
Total Centuries               0
Total Half Centuries          0
4s                            0
6s                            0
Total Wickets Taken           0
Best Bowling Figures          0
Bowling Avg                   0
Best Economy Rate             0
Bowling Strike Rate           0
Total Matches Played          0
Total Dismissals Made         0
Total Catches Taken           0
Total Stumpings Made          0
Batting Hand                  0
Bowling Arm                   0
Bowling Style                 0
Result                        0
Toss    

## Changing variables to appropriate format

In [47]:
##Converting to DateTime
df["Start Date"] = pd.to_datetime(df["Start Date"])
add_num = ["Total Runs Scored","Batting Avg","Batting Strike Rate","Total Centuries","Total Half Centuries","4s","6s","Total Wickets Taken","Best Bowling Figures","Bowling Avg","Best Economy Rate","Total Matches Played","Total Dismissals Made","Total Catches Taken","Total Stumpings Made"]

# Replace "-" with NaN
df = df.replace('-', np.nan)

#Conerting to numeric datatype
add_num = ["Total Runs Scored","Batting Avg","Batting Strike Rate","Total Centuries","Total Half Centuries","4s","6s","Total Wickets Taken","Bowling Avg","Best Economy Rate","Total Matches Played","Total Dismissals Made","Total Catches Taken","Total Stumpings Made"]
for col in add_num:
    df[col]= pd.to_numeric(df[col])

# Remove "v " from the beginning of the 'opposition' column
df['Opposition'] = df['Opposition'].str.replace('^v ', '', regex=True)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Player,Runs Scored in Match,Wickets Taken in Match,Runs Conceded in Match,Catches Taken in Match,Stumpings Made in Match,Opposition,Ground,Start Date,Not Out,...,Total Catches Taken,Total Stumpings Made,Batting Hand,Bowling Arm,Bowling Style,Result,Toss,Day/Night Match,Home/Away/Neutral,Toss Decision
Match ID,Player ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
648,25,AB Agarkar,2.0,2,39,0,0,Pakistan,Brisbane,2000-01-10,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,won,Night,Neutral,Bat
647,25,AB Agarkar,6.0,1,47,1,0,Australia,Melbourne,2000-01-12,True,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,lost,Night,Away,Field
643,25,AB Agarkar,0.0,0,60,0,0,Pakistan,Perth,2000-01-28,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,lost,Night,Neutral,Field
1,25,AB Agarkar,,1,39,1,0,Australia,Perth,2000-01-30,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,lost,won,Day,Away,Bat
2,25,AB Agarkar,,0,55,1,0,South Africa,Kochi,2000-03-09,False,...,38.0,0.0,Right Hand,Right Arm,Pace Bowler,won,lost,Day,Home,Field


## Checking for outliers

In [48]:
# Function to calculate the percentage of outliers
def percentage_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = (column < lower_bound) | (column > upper_bound)
    percentage = outliers.sum() / len(column) * 100
    return percentage

# Calculate the percentage of outliers for each numerical column
outlier_percentages = df.select_dtypes(include=[np.number]).apply(percentage_outliers)

print(outlier_percentages)



Total Runs Scored         0.000000
Batting Avg               0.000000
Batting Strike Rate       4.410691
Total Centuries           7.915876
Total Half Centuries      0.000000
4s                        0.000000
6s                        3.738864
Total Wickets Taken       0.000000
Bowling Avg               9.420184
Best Economy Rate         5.418431
Total Matches Played      0.000000
Total Dismissals Made     5.067913
Total Catches Taken       5.067913
Total Stumpings Made     14.152183
dtype: float64


In [49]:
df.shape

(6847, 35)

**NOTE:** We will deal with outliers by applying appropriate data transformation in near future.

# Saving Final Dataframe

In [50]:
df.to_csv("IndianCricketTeam.csv")