# Badminton Data Analysis

Because of the recently proposed change of the scoring system from a 3x21 to a 5x11 rally point system, I would like to investigate how match results would have changed if the new system had been implemented earlier. In this notebook, I use a dataset of badminton matches from 88 different BWF World Tour competitions between 2018 and 2021, taken from the Kaggle dataset linked below.

Data Source : https://www.kaggle.com/sanderp/badminton-bwf-world-tour

In [None]:
import pandas as pd 
import numpy as np
import missingno as msno
import plotly.express as px 

In [None]:
def filter_retired(df):
    '''
    Keep only the matches that were played to completion.

    Rows whose ``retired`` flag equals False are selected from a copy of
    ``df`` and returned with a fresh 0..n-1 index. The frame passed in is
    not modified.
    '''

    #rows of matches that were not retired
    completed = df['retired'] == False

    #reset_index() stores the old labels in an 'index' column, which is removed again
    return df.copy()[completed].reset_index().drop(columns='index')

In [None]:
def extract_team_nationality(df):
    '''
    Collapse the per-player nationality columns into one column per team.

    If any pair in ``df`` has two different nationalities, a message is
    printed and ``df`` is returned unchanged. Otherwise the nationality of
    player one is stored as the team nationality and the four per-player
    columns are dropped. ``df`` is modified in place and also returned.
    '''

    #rows where the two players of a team do not share a nationality
    team_one_mixed = df['team_one_player_one_nationality'] != df['team_one_player_two_nationality']
    team_two_mixed = df['team_two_player_one_nationality'] != df['team_two_player_two_nationality']

    if (team_one_mixed | team_two_mixed).any():
        print('There player pair with different nationality !')
        return df

    #both players share the nationality, so player one represents the team
    df.loc[:,'team_one_nationalities'] = df['team_one_player_one_nationality']
    df.loc[:,'team_two_nationalities'] = df['team_two_player_one_nationality']

    per_player_columns = ['team_' + team + '_player_' + player + '_nationality'
                          for team in ('one', 'two')
                          for player in ('one', 'two')]
    df.drop(columns=per_player_columns, inplace=True)

    return df

In [None]:
def player_pair_summary(df):
    '''
    Combine the two player columns of each team into one pair column.

    Adds ``team_one_players`` and ``team_two_players`` holding
    "<player one>/<player two>" and drops the four single-player columns.
    ``df`` is modified in place and also returned.
    '''

    #one "player one/player two" column per team
    for team in ('one', 'two'):
        prefix = 'team_' + team
        df.loc[:, prefix + '_players'] = df[prefix + '_player_one'] + '/' + df[prefix + '_player_two']

    #the single-player columns are no longer needed for the analysis
    single_player_columns = ['team_one_player_one', 'team_one_player_two',
                             'team_two_player_one', 'team_two_player_two']
    df.drop(columns=single_player_columns, inplace=True)

    return df

In [None]:
def game_score_summary_21pts(df):
    '''
    Build the per-match score summary for the 3 x 21 points system.

    Adds ``21pts_game_score``, a list of the final game scores (two entries
    for ``nb_sets == 2``, three for ``nb_sets == 3``), and renames ``winner``
    and the ``game_N_score`` columns with a ``21pts_`` prefix.
    ``df`` is modified in place and also returned.
    '''

    two_games = df['nb_sets'] == 2
    three_games = df['nb_sets'] == 3

    #comma-joined scores; the third game is appended only for three-game matches
    first_two = df['game_1_score'] + ',' + df['game_2_score']
    all_three = first_two + ',' + df['game_3_score']

    df.loc[two_games, '21pts_game_score'] = first_two
    df.loc[three_games, '21pts_game_score'] = all_three

    #turn the joined string into a list of game scores
    df['21pts_game_score'] = df['21pts_game_score'].str.split(',')

    #mark the original result columns as belonging to the 21 points system
    new_names = {'winner': '21pts_winner'}
    for game in ('1', '2', '3'):
        new_names['game_' + game + '_score'] = '21pts_game_' + game + '_score'
    df.rename(columns=new_names, inplace=True)

    return df

In [None]:
def simplify_columns(df):
    '''
    Drop the point statistics columns that the analysis does not use.

    Removes the total, most-consecutive and game-point columns of both teams,
    for the whole match and for each of the three games.
    ``df`` is modified in place and also returned.
    '''

    teams = ('team_one', 'team_two')
    stats = ('most_consecutive_points', 'game_points')

    #match-level statistics
    unused = [team + '_total_points' for team in teams]
    unused += [team + '_' + stat for stat in stats for team in teams]

    #the same statistics broken down per game
    unused += [team + '_' + stat + '_game_' + str(game)
               for game in (1, 2, 3)
               for stat in stats
               for team in teams]

    df.drop(columns=unused, inplace=True)

    return df

In [None]:
def visualize_missing_value(df):
    '''
    Draw the missingno bar chart and matrix chart for ``df``.

    Nothing is returned; the function only produces the two plots.
    '''
    #bar chart first, then the matrix view
    for draw in (msno.bar, msno.matrix):
        draw(df)

In [None]:
def interval_score(score_column):
    '''
    Return the first score of a single game that contains '11'.

    ``score_column`` is an iterable of score strings. The first entry that
    contains the substring '11' is returned with quotes, square brackets and
    spaces stripped from both ends; None is returned when no entry matches.
    '''
    matches = (entry.strip("'] [ ") for entry in score_column if '11' in entry)
    return next(matches, None)

In [None]:
def get_winner(game_score):
    '''
    Decide the match winner from a list of 'a-b' game scores.

    A game counts for team one when its score is strictly higher, otherwise
    for team two. Returns 1 when team one won more games, 2 when team two
    won more games and 0 when both won the same number (inconclusive).
    '''
    games_won = {1: 0, 2: 0}

    for game in game_score:
        points = game.split('-')
        first = int(points[0])
        second = int(points[1])
        games_won[1 if first > second else 2] += 1

    if games_won[1] > games_won[2]:
        return 1
    if games_won[2] > games_won[1]:
        return 2
    #same number of games for both teams
    return 0

In [None]:
def game_score_summary_11pts(df):
    '''
    create game score summary on 3 x 11 points based system

    For every game the score returned by ``interval_score`` (the first entry
    of the comma-split ``game_N_scores`` string that contains '11') is stored
    in ``11pts_game_N_score``. The per-game scores are then combined into the
    list column ``11pts_game_score`` and ``get_winner`` is applied to it to
    fill ``11pts_winner``.

    Notes
    -----
    * ``game_N_scores`` is presumably a list-like string of running scores
      ('0-0', '1-0', ...) -- TODO confirm against the dataset.
    * Game 3 is only evaluated for rows with ``nb_sets == 3``; all other rows
      get NaN in ``11pts_game_3_score``.
    * Each row keeps its own result: a game for which ``interval_score``
      returns None yields a missing value in that row and no longer shifts
      the scores of the following rows.
    * The function does not rely on ``Series.append`` (removed in pandas 2.0)
      or ``np.NaN`` (removed in NumPy 2.0), and it works for any index, not
      only a 0..n-1 range.
    * ``df`` is modified in place and also returned.
    '''

    def _interval_scores(raw_games):
        #one interval score per raw score string, in row order
        return [interval_score(raw.split(',')) for raw in raw_games]

    #extract game 1 and game 2 score (played in every match)
    df['11pts_game_1_score'] = _interval_scores(df['game_1_scores'])
    df['11pts_game_2_score'] = _interval_scores(df['game_2_scores'])

    #extract game 3 score, only where a third game was played
    game_3 = [interval_score(raw.split(',')) if sets == 3 else np.nan
              for raw, sets in zip(df['game_3_scores'], df['nb_sets'])]
    #object dtype keeps the column usable for string concatenation even if it is all NaN
    df['11pts_game_3_score'] = pd.Series(game_3, index=df.index, dtype=object)

    #create one column for game score summary
    first_two = df['11pts_game_1_score'] + ',' + df['11pts_game_2_score']
    df.loc[df['nb_sets']==2,'11pts_game_score'] = first_two
    df.loc[df['nb_sets']==3,'11pts_game_score'] = first_two + ',' + df['11pts_game_3_score']
    df['11pts_game_score'] = df['11pts_game_score'].str.split(',')

    #extract the winner of 11 points system
    df['11pts_winner'] = df['11pts_game_score'].apply(get_winner)

    return df

In [None]:
def get_winner_nationality(df,pts_system='21'):
    '''
    Add the nationality of the winning team/individual for one scoring system.

    ``pts_system`` is '21' or '11'. The column ``<pts_system>pts_winner``
    (1 or 2) selects ``team_one_nationalities`` or ``team_two_nationalities``
    and the result is written to ``<pts_system>pts_winner_nationalities``.
    For '11' a winner of 0 (inconclusive) is stored as NaN. Any other
    ``pts_system`` leaves ``df`` untouched. ``df`` is modified in place and
    also returned.
    '''
    if pts_system not in ('21', '11'):
        return df

    winner_col = pts_system + 'pts_winner'
    target_col = pts_system + 'pts_winner_nationalities'

    for team_number, nationality_col in ((1, 'team_one_nationalities'),
                                         (2, 'team_two_nationalities')):
        df.loc[df[winner_col]==team_number, target_col] = df[nationality_col]

    if pts_system == '11':
        #for inconclusive result
        df.loc[df[winner_col]==0, target_col] = np.nan

    return df

In [None]:
def evaluate_scoring_change(df):
    '''
    Label every match with the effect of the scoring system change.

    ``point_change_eval`` becomes 'Unchanged' when both systems give the same
    winner, 'Changed' when they differ and the 11 points winner is not 0, and
    'Inconclusive' whenever the 11 points winner is 0. ``df`` is modified in
    place and also returned.
    '''

    old_winner = df['21pts_winner']
    new_winner = df['11pts_winner']
    undecided = new_winner == 0

    #applied in this order, so 'Inconclusive' overrides an earlier label
    verdicts = (('Unchanged', old_winner == new_winner),
                ('Changed', (old_winner != new_winner) & ~undecided),
                ('Inconclusive', undecided))

    for label, rows in verdicts:
        df.loc[rows, 'point_change_eval'] = label

    return df

In [None]:
def plot_pie(df,chart_title):
    '''
    Show a pie chart of how often each ``point_change_eval`` label occurs.

    Returns whatever ``fig.show()`` returns.
    '''
    #number of matches per evaluation label
    eval_counts = df['point_change_eval'].value_counts()

    figure = px.pie(df,
                    values=eval_counts,
                    names=eval_counts.index,
                    title=chart_title)
    return figure.show()

In [None]:
def plot_stacked_bar(df,by,chart_title,pct=True):
    '''
    plot stacked bar chart by customable group category

    Parameters
    ----------
    df : DataFrame
        Must contain ``point_change_eval`` and the column(s) named in ``by``.
    by : str or list of str
        Passed to ``groupby``; one bar is drawn per group.
    chart_title : str
        Title of the chart.
    pct : bool, default True
        True plots the percentage share of each evaluation within a group
        (rounded to 2 decimals), False plots the raw counts.

    Returns
    -------
    Whatever ``fig.show()`` returns.

    Notes
    -----
    An evaluation category that does not occur in a group (or in the whole
    frame) is counted as 0, so the chart no longer fails with a KeyError and
    the percentages of the other categories are no longer turned into NaN.
    '''
    #stacking order of the evaluation categories
    col_name = ['Unchanged','Inconclusive','Changed']

    #count the evaluations per group; missing combinations become 0
    evaluation = (df.groupby(by)['point_change_eval']
                    .value_counts()
                    .unstack('point_change_eval', fill_value=0)
                    .reindex(columns=col_name, fill_value=0))

    #order of display
    custom_dict = {'HSBC BWF World Tour Finals': 0,
                   'HSBC BWF World Tour Super 1000': 1,
                   'HSBC BWF World Tour Super 750': 2,
                   'HSBC BWF World Tour Super 500': 3,
                   'HSBC BWF World Tour Super 300': 4,
                   'HSBC BWF World Tour Super 100': 5}

    evaluation = evaluation.sort_index(key=lambda x:x.map(custom_dict))

    #share of each category within its group, in percent
    totals = evaluation.sum(axis=1)
    evaluation_pct = evaluation.div(totals, axis=0).mul(100).round(2)

    if pct:
        plotted, value_label = evaluation_pct, "Percentage of Games"
    else:
        plotted, value_label = evaluation, "Number of Games"

    fig = px.bar(plotted,
                 title = chart_title,
                 labels={
                 "tournament_type": "Tournament Type",
                 "value": value_label
                 })
    return fig.show()

In [None]:
def extract_net_win(df):
    '''
    calculate the difference of numbers of win for each match between different nationality
    with changed result and grouped by each nationality

    Only matches labelled 'Changed' whose 21 points and 11 points winner
    nationalities differ are considered.

    Returns
    -------
    DataFrame indexed by nationality (index name 'index'), sorted ascending
    by ``net_win_changed``, with the columns

    * ``21pts_winner_nationalities`` : wins under the 21 points system
    * ``11pts_winner_nationalities`` : wins under the 11 points system
    * ``net_win_changed``            : 11 points wins minus 21 points wins
    * ``total_changed_match``        : sum of both win counts
    * ``net_win_changed_%``          : net win as percentage of the total, 2 decimals

    Notes
    -----
    The two win counts are combined with an inner merge, so a nationality
    that appears as winner under only one of the two systems is left out.
    '''

    #subset the match with changed result between different nationality
    changed_diffnat = df[(df['point_change_eval']=='Changed')&
                         (df['21pts_winner_nationalities']!=df['11pts_winner_nationalities'])]

    def _wins_per_nationality(column):
        #value_counts() names its result differently before and after pandas 2.0,
        #so the count column and the nationality axis are named explicitly
        return (changed_diffnat[column]
                .value_counts()
                .rename(column)
                .rename_axis('index')
                .reset_index())

    #count winner nationality for both scoring systems
    twenty_one_pts = _wins_per_nationality('21pts_winner_nationalities')
    eleven_pts = _wins_per_nationality('11pts_winner_nationalities')

    #merge the two dataset
    net_win = pd.merge(twenty_one_pts, eleven_pts, how='inner', on='index')

    #calculate the net_win_changed parameter
    net_win['net_win_changed'] = net_win['11pts_winner_nationalities'] - net_win['21pts_winner_nationalities']

    #calculate total match involved
    net_win['total_changed_match'] = net_win['11pts_winner_nationalities'] + net_win['21pts_winner_nationalities']

    #calculate the percentage of net_win_changed from total match involved
    net_win['net_win_changed_%'] = round(net_win['net_win_changed']*100/net_win['total_changed_match'],2)

    return net_win.set_index('index').sort_values(by='net_win_changed')

## Men's Singles

In [None]:
#Men's Singles
ms = pd.read_csv('../input/badminton-bwf-world-tour/ms.csv')

In [None]:
#keep completed matches only, then remove the incomplete match discovered in the earlier EDA
#(label 2739 refers to the fresh index returned by filter_retired) and renumber the rows
ms = filter_retired(ms).drop(index=2739).reset_index(drop=True)

#create the 21 points game score summary, then drop the unused columns
ms = simplify_columns(game_score_summary_21pts(ms))

ms

In [None]:
visualize_missing_value(ms)

In [None]:
#extract score for 11 points based system
ms = game_score_summary_11pts(ms)

In [None]:
#get winning team nationality for 21 points 
ms = get_winner_nationality(ms,pts_system='21')

#get winning team nationality for 11 points 
ms = get_winner_nationality(ms,pts_system='11')

In [None]:
#evaluate the result of scoring system change 
ms = evaluate_scoring_change(ms)

In [None]:
plot_pie(ms,chart_title="Proportion of Men's Singles Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=ms,by='tournament_type',pct=True,
                 chart_title="Men's Single Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for Men's Singles
net_win_ms = extract_net_win(ms)
net_win_ms

## Women's Singles

In [None]:
#Women's Singles
ws = pd.read_csv('../input/badminton-bwf-world-tour/ws.csv')

In [None]:
#completed matches only -> 21 points game score summary -> drop unused columns
ws = (filter_retired(ws)
      .pipe(game_score_summary_21pts)
      .pipe(simplify_columns))

ws

In [None]:
visualize_missing_value(ws)

In [None]:
#extract score for 11 points based system
ws = game_score_summary_11pts(ws)

In [None]:
#get winning team nationality for 21 points 
ws = get_winner_nationality(ws,pts_system='21')

#get winning team nationality for 11 points 
ws = get_winner_nationality(ws,pts_system='11')

In [None]:
#evaluate the result of scoring system change 
ws = evaluate_scoring_change(ws)

In [None]:
plot_pie(ws,chart_title="Proportion of Women's Singles Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=ws,by='tournament_type',pct=True,
                 chart_title="Women's Single Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for Women's Singles
net_win_ws = extract_net_win(ws)
net_win_ws

## Men's Doubles

In [None]:
#Men's Doubles
md = pd.read_csv('../input/badminton-bwf-world-tour/md.csv')

In [None]:
#completed matches only -> team nationality -> player pair summary
#-> 21 points game score summary -> drop unused columns
md = (filter_retired(md)
      .pipe(extract_team_nationality)
      .pipe(player_pair_summary)
      .pipe(game_score_summary_21pts)
      .pipe(simplify_columns))

md

In [None]:
visualize_missing_value(md)

In [None]:
#extract score for 11 points based system
md = game_score_summary_11pts(md)

In [None]:
#get winning team nationality for 21 points 
md = get_winner_nationality(md,pts_system='21')

#get winning team nationality for 11 points 
md = get_winner_nationality(md,pts_system='11')

In [None]:
#evaluate the result of scoring system change 
md = evaluate_scoring_change(md)

In [None]:
plot_pie(md,chart_title="Proportion of Men's Doubles Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=md,by='tournament_type',pct=True,
                 chart_title="Men's Double Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for Men's Doubles
net_win_md = extract_net_win(md)
net_win_md

## Women's Doubles

In [None]:
#Women's Doubles
wd = pd.read_csv('../input/badminton-bwf-world-tour/wd.csv')

In [None]:
#completed matches only -> team nationality -> player pair summary
#-> 21 points game score summary -> drop unused columns
wd = (filter_retired(wd)
      .pipe(extract_team_nationality)
      .pipe(player_pair_summary)
      .pipe(game_score_summary_21pts)
      .pipe(simplify_columns))

wd

In [None]:
visualize_missing_value(wd)

In [None]:
#extract score for 11 points based system
wd = game_score_summary_11pts(wd)

In [None]:
#get winning team nationality for 21 points 
wd = get_winner_nationality(wd,pts_system='21')

#get winning team nationality for 11 points 
wd = get_winner_nationality(wd,pts_system='11')

In [None]:
#evaluate the result of scoring system change 
wd = evaluate_scoring_change(wd)

In [None]:
plot_pie(wd,chart_title="Proportion of Women's Doubles Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=wd,by='tournament_type',pct=True,
                 chart_title="Women's Double Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for Women's Doubles
net_win_wd = extract_net_win(wd)
net_win_wd

## Mixed Doubles

In [None]:
#Mixed Doubles
xd = pd.read_csv('../input/badminton-bwf-world-tour/xd.csv')

In [None]:
#keep completed matches only, then remove the incomplete match discovered in the earlier EDA
#(label 2100 refers to the fresh index returned by filter_retired) and renumber the rows
xd = filter_retired(xd).drop(index=2100).reset_index(drop=True)

#team nationality -> player pair summary -> 21 points game score summary -> drop unused columns
xd = (xd.pipe(extract_team_nationality)
        .pipe(player_pair_summary)
        .pipe(game_score_summary_21pts)
        .pipe(simplify_columns))

xd

In [None]:
visualize_missing_value(xd)

In [None]:
#extract score for 11 points based system
xd = game_score_summary_11pts(xd)

In [None]:
#get winning team nationality for 21 points 
xd = get_winner_nationality(xd,pts_system='21')

#get winning team nationality for 11 points 
xd = get_winner_nationality(xd,pts_system='11')

In [None]:
#evaluate the result of scoring system change 
xd = evaluate_scoring_change(xd)

In [None]:
plot_pie(xd,chart_title="Proportion of Mixed Doubles Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=xd,by='tournament_type',pct=True,
                 chart_title="Mixed Double Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for Mixed Doubles
net_win_xd = extract_net_win(xd)
net_win_xd

## Combine All Data

In [None]:
#set the order of columns
cols_order = ['tournament', 'city', 'country', 'date', 'tournament_type',
       'discipline', 'round', '21pts_winner', 'nb_sets', 'retired',
       '21pts_game_1_score', '21pts_game_2_score', '21pts_game_3_score',
       'team_one_players', 'team_two_players', 'team_one_nationalities',
       'team_two_nationalities', 'game_1_scores', 'game_2_scores',
       'game_3_scores', '21pts_game_score', '11pts_game_1_score',
       '11pts_game_2_score', '11pts_game_3_score', '11pts_game_score',
       '11pts_winner', '21pts_winner_nationalities',
       '11pts_winner_nationalities', 'point_change_eval']

In [None]:
#Men's Singles
ms = ms[cols_order]

#Women's Singles
ws = ws[cols_order]

#Men's Doubles
md = md[cols_order]

#Women's Doubles
wd = wd[cols_order]

#Mixed Doubles
xd = xd[cols_order]

In [None]:
#stack all five disciplines; ignore_index=True renumbers the rows 0..n-1
#instead of keeping the repeated per-discipline indexes
badminton_data = pd.concat([ms,ws,md,wd,xd], ignore_index=True)

In [None]:
badminton_data

In [None]:
plot_pie(badminton_data,chart_title="Proportion of All Match Affected by Change in Scoring System")

In [None]:
plot_stacked_bar(df=badminton_data,by='tournament_type',pct=True,
                 chart_title="All Match Evaluation for Scoring System Change in each Tournament Type")

In [None]:
#calculate net_win for all matches
net_win_all = extract_net_win(badminton_data)

#subset data for total match involved more than 100
net_win_all = net_win_all[net_win_all['total_changed_match'] > 100]
net_win_all

In [None]:
fig = px.bar(net_win_all, y='net_win_changed',
             title = 'Nationalities net win',
             labels={"net_win_changed": "Net Win Changed",
                     "index": "Nationalities"})
fig.show()

## Save to csv

In [None]:
badminton_data.to_csv('badminton_data.csv')