In [1]:
import pandas as pd

data = pd.read_csv("F:/Projects/ML/T20 World Cup 2024 Match Analysis/india-usa_innings_data.csv")

print(data.head())

  from pandas.core import (


            batter          bowler non_striker  runs_batter  runs_extras  \
0  Shayan Jahangir  Arshdeep Singh   SR Taylor            0            0   
1         AGS Gous  Arshdeep Singh   SR Taylor            0            0   
2         AGS Gous  Arshdeep Singh   SR Taylor            0            0   
3         AGS Gous  Arshdeep Singh   SR Taylor            0            1   
4         AGS Gous  Arshdeep Singh   SR Taylor            2            0   

   runs_total wickets_0_player_out wickets_0_kind                      team  \
0           0      Shayan Jahangir            lbw  United States of America   
1           0                  NaN            NaN  United States of America   
2           0                  NaN            NaN  United States of America   
3           1                  NaN            NaN  United States of America   
4           2                  NaN            NaN  United States of America   

   over  ...  wickets_0_fielders_0_name review_by review_umpire revi

In [2]:
# checking for missing values in the dataset
missing_values = data.isnull().sum()

# checking data types of the columns
data_types = data.dtypes

missing_values

batter                         0
bowler                         0
non_striker                    0
runs_batter                    0
runs_extras                    0
runs_total                     0
wickets_0_player_out         225
wickets_0_kind               225
team                           0
over                           0
extras_wides                 231
wickets_0_fielders_0_name    228
review_by                    235
review_umpire                235
review_batter                235
review_decision              235
review_type                  235
extras_legbyes               234
wickets_0_fielders_1_name    235
extras_noballs               235
extras_penalty               235
dtype: int64

In [3]:
data_types

batter                        object
bowler                        object
non_striker                   object
runs_batter                    int64
runs_extras                    int64
runs_total                     int64
wickets_0_player_out          object
wickets_0_kind                object
team                          object
over                           int64
extras_wides                 float64
wickets_0_fielders_0_name     object
review_by                     object
review_umpire                 object
review_batter                 object
review_decision               object
review_type                   object
extras_legbyes               float64
wickets_0_fielders_1_name     object
extras_noballs               float64
extras_penalty               float64
dtype: object

In [4]:
# total runs scored by each team
total_runs = data.groupby('team')['runs_total'].sum()

# total wickets taken by each team
total_wickets = data['wickets_0_player_out'].notna().groupby(data['team']).sum()

# total extras
total_extras = data[['team', 'runs_extras', 'extras_wides', 'extras_noballs', 'extras_legbyes', 'extras_penalty']].groupby('team').sum()

# runs scored by each batter
batter_runs = data.groupby('batter')['runs_batter'].sum()

# balls faced by each batter
balls_faced = data.groupby('batter').size()

# strike rate of each batter
strike_rate = (batter_runs / balls_faced) * 100

# boundaries hit by each batter
boundaries = data[(data['runs_batter'] == 4) | (data['runs_batter'] == 6)].groupby(['batter', 'runs_batter']).size().unstack(fill_value=0)

# wickets taken by each bowler
wickets_taken = data['wickets_0_player_out'].notna().groupby(data['bowler']).sum()

# runs conceded by each bowler
runs_conceded = data.groupby('bowler')['runs_total'].sum()

# balls bowled by each bowler
balls_bowled = data.groupby('bowler').size()

# economy rate of each bowler
economy_rate = runs_conceded / (balls_bowled / 6)

# dott balls bowled by each bowler
dot_balls = data[data['runs_total'] == 0].groupby('bowler').size()

# combine all these statistics into dataframes for batters and bowlers
batter_stats = pd.DataFrame({
    'Runs': batter_runs,
    'Balls Faced': balls_faced,
    'Strike Rate': strike_rate,
}).join(boundaries)

bowler_stats = pd.DataFrame({
    'Wickets': wickets_taken,
    'Runs Conceded': runs_conceded,
    'Balls Bowled': balls_bowled,
    'Economy Rate': economy_rate,
    'Dot Balls': dot_balls,
})

In [5]:
total_runs

team
India                       111
United States of America    110
Name: runs_total, dtype: int64

In [6]:
total_wickets

team
India                       3
United States of America    8
Name: wickets_0_player_out, dtype: int64

In [7]:
total_extras

Unnamed: 0_level_0,runs_extras,extras_wides,extras_noballs,extras_legbyes,extras_penalty
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
India,9,2.0,1.0,1.0,5.0
United States of America,8,7.0,0.0,1.0,0.0


In [8]:
import plotly.graph_objects as go

india_runs_progression = data[data['team'] == 'India'].groupby('over')['runs_total'].sum().cumsum()
usa_runs_progression = data[data['team'] == 'United States of America'].groupby('over')['runs_total'].sum().cumsum()

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=india_runs_progression.index,
    y=india_runs_progression.values,
    mode='lines+markers',
    name='India'
))

fig.add_trace(go.Scatter(
    x=usa_runs_progression.index,
    y=usa_runs_progression.values,
    mode='lines+markers',
    name='USA'
))

fig.update_layout(
    title='Runs Progression Over Overs',
    xaxis_title='Overs',
    yaxis_title='Cumulative Runs',
    legend_title='Teams',
    template='plotly_white'
)

fig.show()

In [9]:
india_wickets = data[(data['team'] == 'India') & data['wickets_0_player_out'].notna()].groupby('over').size()
usa_wickets = data[(data['team'] == 'United States of America') & data['wickets_0_player_out'].notna()].groupby('over').size()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=india_wickets.index,
    y=india_wickets.values,
    name='India',
    marker_color='blue',
    opacity=0.7
))

fig.add_trace(go.Bar(
    x=usa_wickets.index,
    y=usa_wickets.values,
    name='USA',
    marker_color='red',
    opacity=0.7
))

fig.update_layout(
    title='Wickets Timeline',
    xaxis_title='Overs',
    yaxis_title='Number of Wickets',
    barmode='group',
    template='plotly_white',
    legend_title='Teams'
)

fig.show()

In [10]:
import plotly.express as px

fig = px.bar(
    batter_stats,
    x=batter_stats.index,
    y='Runs',
    title='Run Distribution by Batters',
    labels={'x': 'Batter', 'Runs': 'Runs Scored'},
    template='plotly_white'
)

fig.update_layout(
    xaxis_title='Batter',
    yaxis_title='Runs Scored',
    xaxis=dict(tickangle=90)
)

fig.show()

In [11]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=bowler_stats['Economy Rate'],
    y=bowler_stats['Wickets'],
    mode='markers+text',
    text=bowler_stats.index,
    textposition='top center',
    textfont=dict(
        family="sans serif",
        size=12,
        color="black"
    ),
    marker=dict(color='red', size=10),
    name='Bowlers'
))

fig.update_layout(
    title='Bowling Performance',
    xaxis_title='Economy Rate',
    yaxis_title='Wickets Taken',
    template='plotly_white',
    autosize=False,
    width=800,
    height=600
)

fig.show()

In [21]:
# separate data for India and USA
india_partnership_data = data[data['team'] == 'India'].groupby(['over', 'batter', 'non_striker'])['runs_total'].sum().reset_index()
usa_partnership_data = data[data['team'] == 'United States of America'].groupby(['over', 'batter', 'non_striker'])['runs_total'].sum().reset_index()

# create pivot tables for better visualization
india_partnership_pivot = india_partnership_data.pivot(index='over', columns=['batter', 'non_striker'], values='runs_total').fillna(0)
usa_partnership_pivot = usa_partnership_data.pivot(index='over', columns=['batter', 'non_striker'], values='runs_total').fillna(0)


# convert the pivot table to a long format
india_partnership_long = india_partnership_pivot.reset_index().melt(id_vars='over', var_name=['batter', 'non_striker'], value_name='runs_total')

# create a stacked bar chart
fig = go.Figure()

# add bars for each partnership
for (batter, non_striker) in india_partnership_pivot.columns:
    partnership_data = india_partnership_long[(india_partnership_long['batter'] == batter) & (india_partnership_long['non_striker'] == non_striker)]
    fig.add_trace(go.Bar(
        x=partnership_data['over'],
        y=partnership_data['runs_total'],
        name=f'{batter} & {non_striker}'
    ))

fig.update_layout(
    title='Partnership Contributions - India',
    xaxis_title='Over',
    yaxis_title='Runs',
    barmode='stack',
    template='plotly_white',
    legend_title='Partnership',
    legend=dict(
        x=1.05,
        y=1,
        traceorder='normal',
        font=dict(size=10)
    ),
    autosize=False,
    width=900,
    height=600
)

fig.show()

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['over']"

In [18]:
# Reset index and check for 'over' column
india_partnership_pivot = india_partnership_pivot.reset_index()
print(india_partnership_pivot.columns)  # Check if 'over' appears in the columns

# Continue if 'over' is found in the columns
if 'over' in india_partnership_pivot.columns:
    india_partnership_long = india_partnership_pivot.melt(id_vars='over', var_name=['batter', 'non_striker'], value_name='runs_total')
else:
    print("The column 'over' is not in the DataFrame after reset_index.")

# Create a stacked bar chart if melt was successful
if 'india_partnership_long' in locals():  # Check if the variable exists
    fig = go.Figure()

    # Add bars for each partnership
    for (batter, non_striker) in india_partnership_pivot.columns[1:]:  # Skipping the 'over' column
        partnership_data = india_partnership_long[
            (india_partnership_long['batter'] == batter) & 
            (india_partnership_long['non_striker'] == non_striker)
        ]
        fig.add_trace(go.Bar(
            x=partnership_data['over'],
            y=partnership_data['runs_total'],
            name=f'{batter} & {non_striker}'
        ))

    fig.update_layout(
        title='Partnership Contributions - India',
        xaxis_title='Over',
        yaxis_title='Runs',
        barmode='stack',
        template='plotly_white',
        legend_title='Partnership',
        legend=dict(
            x=1.05,
            y=1,
            traceorder='normal',
            font=dict(size=10)
        ),
        autosize=False,
        width=900,
        height=600
    )

    fig.show()
else:
    print("Unable to melt DataFrame as 'over' column is missing.")


MultiIndex([(  'level_0',          ''),
            (    'index',          ''),
            (     'over',          ''),
            ('RG Sharma',   'V Kohli'),
            (  'RR Pant', 'RG Sharma'),
            (  'V Kohli', 'RG Sharma'),
            ('RG Sharma',   'RR Pant'),
            (  'RR Pant',  'SA Yadav'),
            ( 'SA Yadav',   'RR Pant'),
            (   'S Dube',  'SA Yadav'),
            ( 'SA Yadav',    'S Dube')],
           names=['batter', 'non_striker'])


KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['over']"