In this notebook I am creating data in a format that anyone can easily understand, i.e.:

Here, basically, I am creating each player's performance in each match.

1) Batsman Data - It contains columns like
* id, batsman, batsman_runs, balls_faced, strike_rate, Boundary Fours, Boundary Sixes, innings_number, Team, opposition_team, Ground, date, 50s, 100s


2) Bowlers Data - It contains columns like
* id, bowler, overs, runs_conceded, wickets_taken, economy_rate, innings_number, date, Ground, Team, opposition_team 

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the match-level CSV and preview its first two rows.
matches_csv = "/kaggle/input/ipl-complete-dataset-20082020/IPL Matches 2008-2020.csv"
df_matches = pd.read_csv(matches_csv)
df_matches.head(2)

In [None]:
# Load the ball-by-ball CSV and preview its first two rows.
deliveries_csv = "/kaggle/input/ipl-complete-dataset-20082020/IPL Ball-by-Ball 2008-2020.csv"
df_deliveries = pd.read_csv(deliveries_csv)
df_deliveries.head(2)

In [None]:
# Report (rows, columns) of both raw tables: matches first, then deliveries.
for frame in (df_matches, df_deliveries):
    print(frame.shape)

In [None]:
# Normalise franchise names that appear under more than one spelling:
#   * Delhi Daredevils was renamed Delhi Capitals in 2019, so the old name is
#     replaced with the new one in every row.
#   * Rising Pune Supergiant(s) is written in two ways; use the plural form.
team_name_fixes = [
    ("Delhi Daredevils", "Delhi Capitals"),
    ("Rising Pune Supergiant", "Rising Pune Supergiants"),
]
for old_name, new_name in team_name_fixes:
    df_matches = df_matches.replace(to_replace=old_name, value=new_name)
    df_deliveries = df_deliveries.replace(to_replace=old_name, value=new_name)

In [None]:
# Outer-join the match table with the ball-by-ball table on the match id,
# then discard the two umpire columns.
df_total = (
    df_matches
    .merge(df_deliveries, how='outer', left_on='id', right_on='id')
    .drop(columns=['umpire1', 'umpire2'])
)
df_total.head(2)

**Creating Batsman Data**

In [None]:
# Runs scored off the bat, summed per (match id, batsman).
batsman_runs = df_total.groupby(['id', 'batsman'])['batsman_runs'].sum()
# Series.reset_index() turns the grouped Series back into a flat DataFrame.
df_batsman_runs = batsman_runs.reset_index()
df_batsman_runs.head(2)

In [None]:
# Balls faced per (match id, batsman): count the deliveries that carried no extras.
# NOTE(review): filtering on extra_runs == 0 also drops deliveries such as
# byes/leg-byes, which normally still count as a ball faced - confirm whether an
# extras-type column is available so that only wides are excluded.
dfballs = df_total.loc[df_total['extra_runs'].eq(0)]
batsman_balls_faced = dfballs.groupby(['id', 'batsman'])['batsman_runs'].count()
df_batsman_balls_faced = batsman_balls_faced.rename('balls_faced').reset_index()
df_batsman_balls_faced.head(2)

In [None]:
# Fours per (match id, batsman): deliveries on which exactly 4 runs came off the bat.
df4 = df_total.loc[df_total['batsman_runs'].eq(4)]
batsman_4s = df4.groupby(['id', 'batsman'])['batsman_runs'].count()
df_batsman_4s = batsman_4s.rename('Boundary Fours').reset_index()
df_batsman_4s.head(10)

In [None]:
# Sixes per (match id, batsman): deliveries on which exactly 6 runs came off the bat.
df6 = df_total.loc[df_total['batsman_runs'].eq(6)]
batsman_6s = df6.groupby(['id', 'batsman'])['batsman_runs'].count()
df_batsman_6s = batsman_6s.rename('Boundary Sixes').reset_index()
df_batsman_6s.head(2)

In [None]:
# Strike rate = runs scored per 100 balls faced, rounded to two decimals.
# The inner join keeps only batsmen present in both the runs and balls-faced tables.
df_strike_rate = df_batsman_runs.merge(df_batsman_balls_faced, how='inner', on=['id', 'batsman'])
df_strike_rate['strike_rate'] = (
    df_strike_rate['batsman_runs'] / df_strike_rate['balls_faced'] * 100
).round(2)

# Attach the boundary counts; outer joins keep batsmen who have no row in the
# fours/sixes tables (their counts come through as NaN).
df_strike_rate = df_strike_rate.merge(df_batsman_4s, how='outer', on=['id', 'batsman'])
df_strike_rate = df_strike_rate.merge(df_batsman_6s, how='outer', on=['id', 'batsman'])

# A missing boundary count means none were hit, so store 0.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that is chained assignment, which is not guaranteed to
# modify the parent frame (it never does under pandas Copy-on-Write).
boundary_cols = ["Boundary Fours", "Boundary Sixes"]
df_strike_rate[boundary_cols] = df_strike_rate[boundary_cols].fillna(0)

df_strike_rate.head(5)

In [None]:
# Innings number for each batsman row.
# Joining against the ball-by-ball rows yields one row per delivery, so the
# result is collapsed back to the first row per batsman entry.
df_inning = df_total[['id', 'inning', 'batsman']]
df_innings = (
    df_strike_rate
    .merge(df_inning, how='inner', on=['id', 'batsman'])
    .drop_duplicates(subset=['id', 'batsman', 'balls_faced', 'strike_rate'])
    .rename(columns={'inning': 'innings_number'})
)
df_innings.head(3)

In [None]:
# Batting side of each batsman row, stored as "Team".
# Same join-then-deduplicate pattern: one joined row per delivery, first kept.
df_battingteam = df_total[['id', 'batsman', 'batting_team']]
df_team = (
    df_innings
    .merge(df_battingteam, how='inner', on=['id', 'batsman'])
    .drop_duplicates(subset=['id', 'batsman', 'balls_faced', 'strike_rate', 'innings_number'])
    .rename(columns={'batting_team': 'Team'})
)
df_team.head(3)

In [None]:
# Opposition of each batsman row: the side that was bowling to him.
df_bowlingteam = df_total[['id', 'batsman', 'bowling_team']]
df_opposition = (
    df_team
    .merge(df_bowlingteam, how='inner', on=['id', 'batsman'])
    .drop_duplicates(subset=['id', 'batsman', 'batsman_runs', 'balls_faced',
                             'strike_rate', 'innings_number', 'Team'])
    .rename(columns={'bowling_team': 'opposition_team'})
)
df_opposition.head(3)

In [None]:
# Venue of the match for each batsman row, stored as "Ground".
df_venue = df_total[['id', 'batsman', 'venue']]
df_ground = (
    df_opposition
    .merge(df_venue, how='inner', on=['id', 'batsman'])
    .drop_duplicates(subset=['id', 'batsman', 'batsman_runs', 'balls_faced',
                             'strike_rate', 'innings_number', 'Team', 'opposition_team'])
    .rename(columns={'venue': 'Ground'})
)
df_ground.head(3)

In [None]:
# Match date for each batsman row; df_date ends up as the complete batsman table.
batsman_dates = df_total[['id', 'batsman', 'date']]
df_date = (
    df_ground
    .merge(batsman_dates, how='inner', on=['id', 'batsman'])
    .drop_duplicates(subset=['id', 'batsman', 'batsman_runs', 'balls_faced', 'strike_rate',
                             'innings_number', 'Team', 'opposition_team', 'Ground'])
)
df_date.head(3)

In [None]:
import numpy as np

# Milestone flags for each batsman row:
#   50s  -> 1 when the score is 50 to 99 inclusive, else 0
#   100s -> 1 when the score is 100 or more, else 0
# The upper bound of the fifty range is "< 100"; the previous "< 99" left a
# score of exactly 99 flagged as neither a fifty nor a hundred.
df_date['50s'] = np.where((df_date.batsman_runs >= 50) & (df_date.batsman_runs < 100), 1, 0)
df_date['100s'] = np.where(df_date.batsman_runs >= 100, 1, 0)
df_date.head(2)

In [None]:
# Save the batsman table to CSV. to_csv is called without index=False, so the
# DataFrame index is written as an extra, unnamed leading column.
df_date.to_csv('batsmans_data(08-20).csv') 

**Creating Bowlers Data**

In [None]:
# Overs per (match id, bowler): deliveries without extras, divided by 6.
# NOTE(review): astype(int) truncates, so a partial over is dropped (e.g. 5
# counted balls are reported as 0 overs) - confirm this is intended.
df_overs = df_total.loc[df_total['extra_runs'].eq(0)]

bowlerovers = df_overs.groupby(['id', 'bowler'])['ball'].count() / 6
df_bowlersovers = bowlerovers.astype(int).rename('overs').reset_index()
df_bowlersovers.head()

In [None]:
#Maidens bowled (disabled: every line of this cell is commented out, so it produces no maiden_overs column)
# df_bowler_runs = df_total.groupby(['id','bowler','over'],as_index=False)['batsman_runs','extra_runs'].sum()
# df_bowler_runs['bowler_runs'] =df_bowler_runs['batsman_runs'] + df_bowler_runs['extra_runs'] 
# df_bowler_runs.head()
# df_maiden = df_bowler_runs[df_bowler_runs['bowler_runs'] == 0]
# df_maidens = df_maiden.groupby(['id','bowler'], as_index=False)['bowler_runs'].count()
# df_maidens.rename(columns={'bowler_runs':'maiden_overs'}, inplace = True)
# df_maidens.head() 

# df_maiden_overs = pd.merge(df_bowlersovers, df_maidens, left_on=['id','bowler'], right_on=['id','bowler'],how='outer')
# df_maiden_overs.head()

In [None]:
# Runs conceded per (match id, bowler) = runs off the bat + extras.
# The two columns are selected with a list: indexing a groupby with a bare
# tuple of names (['batsman_runs','extra_runs']) was deprecated and raises an
# error on pandas >= 2.0.
# NOTE(review): this charges every extra to the bowler - confirm whether
# byes/leg-byes should be excluded.
df_runs = df_total.groupby(['id', 'bowler'], as_index=False)[['batsman_runs', 'extra_runs']].sum()
df_runs['runs_conceded'] = df_runs['batsman_runs'] + df_runs['extra_runs']
df_runs = df_runs[['id', 'bowler', 'runs_conceded']]

# Outer join with the overs table so a bowler present in only one of the two
# tables is still kept (with NaN in the missing column).
df_runs = pd.merge(df_bowlersovers, df_runs, left_on=['id', 'bowler'], right_on=['id', 'bowler'], how='outer')
df_runs.head()

In [None]:
# Wickets taken per (match id, bowler).
# Dismissal kinds that are not credited to the bowler.
# NOTE(review): 'retired hurt' / 'obstructing the field' spellings are assumed
# to match the dataset's dismissal_kind values - confirm; unmatched strings are
# simply ignored by isin().
non_bowler_dismissals = ['run out', 'retired hurt', 'obstructing the field']
df_wickets = df_total[(df_total['player_dismissed'] != 'No Wicket')
                      & (~df_total['dismissal_kind'].isin(non_bowler_dismissals))]

# count() skips missing player_dismissed values, so only actual dismissals are tallied.
df_wickets_taken = df_wickets.groupby(['id', 'bowler'], as_index=False)['player_dismissed'].count()
df_wickets_taken = df_wickets_taken.rename(columns={'player_dismissed': 'wickets_taken'})

# Outer join onto the overs/runs table keeps bowlers who took no wicket.
df_wickets_taken = pd.merge(df_runs, df_wickets_taken, left_on=['id', 'bowler'], right_on=['id', 'bowler'], how='outer')

# A bowler with no row in the wickets table took none: record 0 instead of NaN,
# otherwise a later dropna() would discard every wicketless bowling spell.
df_wickets_taken['wickets_taken'] = df_wickets_taken['wickets_taken'].fillna(0).astype(int)
df_wickets_taken.head()

In [None]:
# Economy rate per (match id, bowler) = runs conceded per counted ball, times 6,
# rounded to two decimals. Counted balls are the deliveries without extras.
df_balls = df_total.loc[df_total['extra_runs'].eq(0)]
df_ballsbowled = df_balls.groupby(['id', 'bowler'], as_index=False)['ball'].count()

runs_and_balls = pd.merge(df_runs, df_ballsbowled, how='outer',
                          left_on=['id', 'bowler'], right_on=['id', 'bowler'])
runs_and_balls['economy_rate'] = (
    runs_and_balls['runs_conceded'] / runs_and_balls['ball'] * 6
).round(2)

# Keep only the new metric and attach it to the overs/runs/wickets table.
df_economy = pd.merge(df_wickets_taken, runs_and_balls[['id', 'bowler', 'economy_rate']],
                      how='outer', left_on=['id', 'bowler'], right_on=['id', 'bowler'])
df_economy.head()

In [None]:
# Innings number for each bowler row; the join yields one row per delivery,
# so keep the first row per (match id, bowler).
df_inning = df_total[['id', 'inning', 'bowler']]
df_innings = (
    df_economy
    .merge(df_inning, how='inner', on=['id', 'bowler'])
    .drop_duplicates(subset=['id', 'bowler'])
    .rename(columns={'inning': 'innings_number'})
)
df_innings.head(3)

In [None]:
# Match date for each bowler row (first joined row per match id and bowler).
# Note: this rebinds df_date, which previously held the batsman table.
bowler_dates = df_total[['id', 'bowler', 'date']]
df_date = (
    df_innings
    .merge(bowler_dates, how='inner', on=['id', 'bowler'])
    .drop_duplicates(subset=['id', 'bowler'])
)
df_date.head(3)

In [None]:
# Venue of the match for each bowler row, stored as "Ground".
df_venue = df_total[['id', 'bowler', 'venue']]
df_ground = (
    df_date
    .merge(df_venue, how='inner', on=['id', 'bowler'])
    .drop_duplicates(subset=['id', 'bowler'])
    .rename(columns={'venue': 'Ground'})
)
df_ground.head(3)

In [None]:
# The bowler's own side (the bowling team), stored as "Team".
df_bowlingteam = df_total[['id', 'bowler', 'bowling_team']]
df_team = (
    df_ground
    .merge(df_bowlingteam, how='inner', on=['id', 'bowler'])
    .drop_duplicates(subset=['id', 'bowler'])
    .rename(columns={'bowling_team': 'Team'})
)
df_team.head(3)

In [None]:
# Opposition of each bowler row: the side that was batting against him.
# Despite its name, df_bowlingteam holds the batting_team column here.
df_bowlingteam = df_total[['id', 'bowler', 'batting_team']]
df_opposition = (
    df_team
    .merge(df_bowlingteam, how='inner', on=['id', 'bowler'])
    .drop_duplicates(subset=['id', 'bowler'])
    .rename(columns={'batting_team': 'opposition_team'})
)
df_opposition.head(3)

In [None]:
# Save the bowler table to CSV (the DataFrame index is written as well,
# because index=False is not passed).
df_opposition.to_csv('bowlers_data(08-20).csv')

In [None]:
# Re-load the batsman table from the CSV written earlier in this notebook; the
# in-memory df_date has since been rebound to bowler data.
bat = pd.read_csv('./batsmans_data(08-20).csv')

In [None]:
# Display the re-loaded batsman table.
bat

In [None]:
# Display the bowler table.
df_opposition

In [None]:
# Join the bowler table with the batsman table on match id only, so every
# bowler row of a match is paired with every batsman row of that match.
# Columns present in both tables get the default _x (bowler side) and
# _y (batsman side) suffixes.
# 'Unnamed: 0' is the index column picked up when the batsman CSV was read back.
# drop(columns=...) replaces the positional form drop([...], 1): passing axis
# positionally is no longer accepted by pandas 2.0.
final_data = pd.merge(df_opposition, bat, on='id').drop(columns=['Unnamed: 0'])

In [None]:
# Sum of the per-match strike_rate values of the rows whose batsman is "SK Raina".
bat.loc[bat['batsman'].eq("SK Raina"), 'strike_rate'].sum()

In [None]:
# Display the batsman table again.
bat

In [None]:
# Save the combined bowler/batsman table to CSV (index included, since
# index=False is not passed).
final_data.to_csv('allfts.csv')

In [None]:
# List the column names of the combined table.
final_data.columns

In [None]:
# Number of rows where the bowler-side and batsman-side venue differ.
final_data['Ground_x'].ne(final_data['Ground_y']).sum()

In [None]:
# Number of rows where the bowler's team differs from the batsman's team.
final_data['Team_x'].ne(final_data['Team_y']).sum()

In [None]:
# Keep only the four team columns (bowler side _x, batsman side _y) for the
# consistency check on team names.
team_columns = ['Team_x', 'Team_y', 'opposition_team_x', 'opposition_team_y']
teams = final_data[team_columns]

In [None]:
# Remove the batsman-side venue column, keeping Ground_x.
# Uses columns= instead of the positional axis argument (drop([...], 1)),
# which pandas 2.0 no longer accepts.
final_data = final_data.drop(columns=['Ground_y'])

In [None]:
# Sanity check: the bowler's team (Team_x) should equal either the batsman's
# team or the batsman's opposition. Print every row where it matches neither,
# followed by the number of such rows.
team_is_consistent = (
    teams['Team_x'].eq(teams['Team_y'])
    | teams['Team_x'].eq(teams['opposition_team_y'])
)
c = 0
for row_label, row in teams[~team_is_consistent].iterrows():
    c += 1
    print(row)
print(c)


In [None]:
# Count missing values per column of the combined table.
final_data.isna().sum()

In [None]:
# Count missing values per column of the batsman table.
bat.isna().sum()

In [None]:
# Count missing values per column of the bowler table.
df_opposition.isna().sum()

In [None]:
# Drop, in place, every row that has a missing value in any column.
final_data.dropna(inplace = True)

In [None]:
# Re-count missing values per column after the rows with NaN were dropped.
final_data.isna().sum()

In [None]:
# Display the combined table.
final_data

In [None]:
# Remove the bowler-side team columns; the batsman-side Team_y and
# opposition_team_y remain.
# Uses columns= instead of the positional axis argument (drop([...], 1)),
# which pandas 2.0 no longer accepts.
final_data = final_data.drop(columns=['Team_x', 'opposition_team_x'])

In [None]:
# Display the final combined table.
final_data