In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the three CSV inputs used throughout this notebook (paths are relative to the working directory).
# wc_overall columns (per .info()): edition, year, host_country, first, second, third, teams, total_attendance.
wc_overall = pd.read_csv('WorldCupOverall.csv')
# wc_matches columns (per .info()): year, country, player_id, player_name, home_goals, away_goals, own_goal, penalty.
wc_matches = pd.read_csv('MatchesPlayersGoals.csv')
# qualified_teams is read by later cells through its 'year' and 'country' columns.
qualified_teams = pd.read_csv('QualifiedTeams.csv')

In [3]:
wc_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   edition           21 non-null     int64 
 1   year              21 non-null     int64 
 2   host_country      21 non-null     object
 3   first             21 non-null     object
 4   second            21 non-null     object
 5   third             21 non-null     object
 6   teams             21 non-null     int64 
 7   total_attendance  21 non-null     int64 
dtypes: int64(4), object(4)
memory usage: 1.4+ KB


In [4]:
wc_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2548 entries, 0 to 2547
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         2548 non-null   int64 
 1   country      2548 non-null   object
 2   player_id    2548 non-null   object
 3   player_name  2548 non-null   object
 4   home_goals   2548 non-null   int64 
 5   away_goals   2548 non-null   int64 
 6   own_goal     2548 non-null   int64 
 7   penalty      2548 non-null   int64 
dtypes: int64(5), object(3)
memory usage: 159.4+ KB


World Cup --> Overall Page

In [None]:
wc_overall['edition'].iloc[-1]

In [None]:
#function to return the count of passed series or list
def count(data):
    """Return the number of items in ``data`` (anything that supports ``len``)."""
    return len(data)

In [None]:
#function to return the required dataframe or series
def get_data(dataframe, required_column_list):
    """Select ``required_column_list`` from ``dataframe`` via ``dataframe[...]``.

    A list of labels yields a DataFrame; a single label yields a Series.
    """
    return dataframe[required_column_list]

In [None]:
def get_total_wc_played(dataframe):
    """Return the ``edition`` value of the last row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an ``edition`` column.

    Returns
    -------
    The ``edition`` entry of the final row.

    Raises
    ------
    IndexError
        If ``dataframe`` has no rows.
    """
    # Read from the frame that was passed in, not from the notebook-level
    # ``wc_overall`` global, so the function works for any input frame.
    return dataframe['edition'].iloc[-1]

In [None]:
get_total_wc_played(wc_overall)

In [None]:
#total participating nations list
def get_total_participating_nations(dataframe, filter_year=0):
    """Return the distinct values of the ``country`` column.

    With the default ``filter_year=0`` every row is considered; any other
    value restricts the rows to those whose ``year`` equals ``filter_year``.
    The result is whatever ``Series.unique()`` returns, in order of first
    appearance.
    """
    rows = dataframe if filter_year == 0 else dataframe[dataframe['year'] == filter_year]
    return rows['country'].unique()

In [None]:
get_total_participating_nations(qualified_teams)

In [None]:
#world cup winners and winning frequency
# Count how often each country appears in the 'first' column.
# The output columns are named explicitly instead of renaming the default
# labels of value_counts().reset_index(), because those defaults differ
# between pandas 1.x ('index', 'first') and pandas 2.x ('first', 'count').
bar_data = (
    wc_overall['first']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='no_of_wins')
)
bar_data

In [None]:
# NOTE(review): `config` is defined here but is never passed to `fig.show()`.
config = {'staticPlot': True}

# One bar per country, labelled with its win count.
fig = px.bar(
    bar_data,
    x='country',
    y='no_of_wins',
    text='no_of_wins',
    template='plotly_dark',
)

# Narrow orange bars with the value drawn above each one.
fig.update_traces(
    marker_color='orange',
    width=0.5,
    textposition='outside',
    texttemplate='%{text:.2s}',
)

# The bar labels carry the values, so the y axis is hidden.
fig.update_yaxes(visible=False)

fig.update_layout(
    title_text='Country and World Cup Wins',
    title_x=0.5,
    title_y=0.95,
    xaxis_title=None,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()

In [None]:
def plot_bar(plotlyobj, dataframe, x_axis, y_axis, plot_title):
    """Build and show a bar chart of ``y_axis`` against ``x_axis``.

    ``plotlyobj`` is any object exposing a ``bar`` factory (the notebook
    passes ``plotly.express``). The ``y_axis`` column is also used as the
    text label on each bar. Returns whatever the figure's ``show()`` returns.
    """
    chart = plotlyobj.bar(
        dataframe,
        x=x_axis,
        y=y_axis,
        text=y_axis,
        template='plotly_dark',
    )

    # Narrow orange bars with the value drawn above each one.
    chart.update_traces(width=0.5, textposition='outside', marker_color='orange')

    # Hide the y axis; force the x axis to be treated as categorical.
    chart.update_yaxes(visible=False)
    chart.update_xaxes({'type': 'category'})

    chart.update_layout(
        title_text=plot_title,
        title_x=0.5,
        title_y=0.95,
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title=None,
    )

    return chart.show()

In [None]:
plot_bar(px, bar_data, 'country','no_of_wins', 'Country and World Cup Wins')

In [None]:
def get_grouped_data(dataframe, group_by_col, to_be_grouped):
    """Sum ``to_be_grouped`` within each ``group_by_col`` group.

    ``group_by_col`` may be one column label or a list of labels. The group
    keys are returned as ordinary columns (the index is reset).
    """
    groups = dataframe.groupby(group_by_col)
    totals = groups[to_be_grouped].sum()
    return totals.reset_index()

In [None]:
def get_wc_matches_v1(dataframe,own_goal=False,filter_year = 0 , filter_country = ''):
    """Return the goal rows of ``dataframe`` with a derived ``goals`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``year``, ``country``, ``player_id``, ``player_name``,
        ``home_goals``, ``away_goals`` and ``own_goal``.
    own_goal : bool, default False
        When truthy, keep only rows whose ``own_goal`` column equals 0.
        Note the direction: passing ``True`` *removes* own-goal rows.
    filter_year : int, default 0
        When not 0, keep only rows whose ``year`` equals this value.
    filter_country : str, default ''
        When not empty, keep only rows whose ``country`` equals this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the seven columns above plus
        ``goals = home_goals + away_goals``. The input frame is not modified.
    """
    req_cols = ['year','country', 'player_id', 'player_name', 'home_goals','away_goals','own_goal']

    # ``assign`` builds a new frame, so the derived column is never written
    # onto a column slice of the caller's data (which is what triggered
    # pandas' SettingWithCopyWarning in the item-assignment version).
    matches = dataframe[req_cols].assign(
        goals=dataframe['home_goals'] + dataframe['away_goals']
    )

    if filter_year != 0:
        matches = matches[matches['year'] == filter_year]

    if filter_country != '':
        matches = matches[matches['country'] == filter_country]

    if own_goal:
        matches = matches[matches['own_goal'] == 0]

    return matches

In [None]:
# Every row of wc_matches, with the derived 'goals' column added.
wc_matches_v1= get_wc_matches_v1(wc_matches)
# Passing True for own_goal keeps only the rows where own_goal == 0.
wc_matches_v1_new = get_wc_matches_v1(wc_matches,True)
wc_matches_v1_new

In [None]:
# Sum goals per player (id + name), highest totals first.
player_goals_grouped = get_grouped_data(
    wc_matches_v1,
    group_by_col=['player_id', 'player_name'],
    to_be_grouped='goals',
).sort_values(by='goals', ascending=False)

In [None]:
# Slice ten rows so the chart matches its "Top 10" title (the slice previously took only seven).
plot_bar(px, player_goals_grouped[0:10], 'player_name','goals', 'Top 10 goal scorers of all time')

In [None]:
#countries with highest goals
# Sum goals per country from the frame without own-goal rows, highest totals first.
country_goals_grouped = get_grouped_data(
    wc_matches_v1_new,
    group_by_col='country',
    to_be_grouped='goals',
).sort_values(by='goals', ascending=False)

In [None]:
plot_bar(px, country_goals_grouped[0:10], 'country','goals', 'highest goal scoring countries of all time')

Year Wise Analytics

In [None]:
#getting the list of year to add to the select box
def get_years_list(dataframe):
    """Return the ``year`` column of ``dataframe`` as a plain Python list."""
    years = dataframe['year']
    return years.tolist()

In [None]:
get_years_list(wc_overall)

In [None]:
#function to return host_country, total attendance, first, second and third
filter_year = wc_overall['year'] == 1930
dataframe = wc_overall[filter_year]
dataframe

In [None]:
# Pull individual fields out of the first row of `dataframe`
# (the 1930 selection of wc_overall built in the previous cell).
host_country = dataframe['host_country'].values[0]
# NOTE(review): `tota_attendance` looks like a typo for `total_attendance`.
tota_attendance = dataframe['total_attendance'].values[0]
first = dataframe['first'].values[0]
second = dataframe['second'].values[0]
third = dataframe['third'].values[0]
# Print each extracted field on its own line.
print(host_country)
print(tota_attendance)
print(first)
print(second)
print(third)

In [None]:
def get_yearly_overall_data(dataframe,selected_year,column_name):
    """Return ``column_name`` from the first row whose ``year`` equals ``selected_year``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``year`` column and the requested ``column_name``.
    selected_year
        Value compared against the ``year`` column.
    column_name : str
        Column to read the value from.

    Returns
    -------
    The value of ``column_name`` in the first matching row.

    Raises
    ------
    IndexError
        If no row has ``year == selected_year``.
    """
    # Filter the frame that was passed in, not the notebook-level
    # ``wc_overall`` global, so the function works for any input frame.
    matching = dataframe.loc[dataframe['year'] == selected_year, column_name]
    if matching.empty:
        # Same exception type as indexing an empty array, with a clearer message.
        raise IndexError(f"no row with year == {selected_year!r}")
    return matching.values[0]

In [None]:
get_yearly_overall_data(wc_overall,1930,'host_country')

In [None]:
get_yearly_overall_data(wc_overall,1930,'total_attendance')

In [None]:
#getting the qualified teams list and year 
# Show the rows of qualified_teams for 1930.
# (The bare `qualified_teams` expression that used to open this cell was
# dropped: it was not the last expression, so it displayed nothing.)
filter_year = qualified_teams['year'] == 1930
qualified_teams[filter_year]

In [None]:
participating_teams = get_total_participating_nations(qualified_teams,1930)
participating_teams

In [None]:
count(participating_teams)

In [None]:
#Task Top 5 highest scoring countries for a given year
countries_yearly_filtered=get_wc_matches_v1(wc_matches,own_goal=True,filter_year=1930)
countries_yearly_filtered.head(5)

In [None]:
# Sum goals per country for the filtered year, highest totals first.
countries_filtered_grouped = get_grouped_data(
    countries_yearly_filtered,
    group_by_col='country',
    to_be_grouped='goals',
).sort_values(by='goals', ascending=False)
countries_filtered_grouped

In [None]:
#bar plot of top 5 countries
plot_bar(px,countries_filtered_grouped[0:5],x_axis='country',y_axis='goals',plot_title='Countries with highest goals top-5')

In [None]:
#Task Top 5 highest scoring players for a given year
# Rows for 1930 with own-goal rows removed (own_goal=True keeps own_goal == 0).
players_yearly_filtered=get_wc_matches_v1(wc_matches,own_goal=True,filter_year=1930)
player_filtered_grouped = get_grouped_data(players_yearly_filtered,group_by_col=['player_id','player_name'],to_be_grouped='goals')
player_filtered_grouped = player_filtered_grouped.sort_values(by='goals',ascending=False)
# The title now names players; it previously said "Countries", copied from the country chart.
plot_bar(px,player_filtered_grouped[0:5],x_axis='player_name',y_axis='goals',plot_title='Players with highest goals top-5')

In [None]:
get_total_participating_nations(qualified_teams)

In [None]:
#years in which a given country appears in the qualified-teams data
def get_country_participation_years(dataframe, filter_country):
    """Return the distinct ``year`` values of rows whose ``country`` equals ``filter_country``.

    The result is whatever ``Series.unique()`` returns, in order of first
    appearance; it is empty when the country never appears.
    """
    is_country = dataframe['country'] == filter_country
    return dataframe.loc[is_country, 'year'].unique()

In [None]:
country_participation_years = get_country_participation_years(qualified_teams,'Germany')
country_participation_years

In [None]:
count(country_participation_years)

In [None]:
#how many goal a country has scored in the wc it has participated
df_filtered_by_country = get_wc_matches_v1(wc_matches,filter_country='Germany')
df_filtered_by_country

In [None]:
country_yearly_goals_grouped = get_grouped_data(df_filtered_by_country,group_by_col='year',to_be_grouped='goals')

In [None]:
plot_bar(px,country_yearly_goals_grouped,x_axis='year',y_axis='goals',plot_title='Goals in participated world cups')

In [None]:
#Top 5 players with most goals for a given country
# NOTE(review): this cell reassigns `df_filtered_by_country` and `player_goals_grouped`,
# which earlier cells also define; re-running those earlier plot cells afterwards
# would use the Germany-only data.
df_filtered_by_country = get_wc_matches_v1(wc_matches,own_goal=True,filter_country='Germany')
player_goals_grouped = get_grouped_data(df_filtered_by_country,group_by_col=['player_id','player_name'],to_be_grouped='goals')
player_goals_grouped = player_goals_grouped.sort_values(by='goals',ascending=False)
# The title now describes this chart; it previously reused the per-year goals title.
plot_bar(px,player_goals_grouped[0:5],x_axis='player_name',y_axis='goals',plot_title='Top 5 goal scorers for Germany')

In [None]:
#given a country how many times it won a world cup, first position, second position (ex. Germany)
filter_1 = wc_overall['first'] == 'Germany'
sum(filter_1)

In [None]:
filter_1 = wc_overall['second'] == 'Germany'
sum(filter_1)

In [None]:
#given a country how many times it won a world cup, first position, second position
def country_wc_win_position(dataframe, country_filter, pos):
    """Count the rows in which ``country_filter`` holds finishing position ``pos``.

    ``pos`` 1, 2 and 3 map to the ``first``, ``second`` and ``third`` columns
    respectively. Any other ``pos`` returns ``None``.
    """
    column_for_position = {1: 'first', 2: 'second', 3: 'third'}
    column = column_for_position.get(pos)
    if column is None:
        return None
    # Summing the boolean mask counts the matching rows.
    return sum(dataframe[column] == country_filter)

In [None]:
country_wc_win_position(wc_overall,'Germany',1)

In [None]:
country_wc_win_position(wc_overall,'Germany',2)

In [None]:
country_wc_win_position(wc_overall,'Germany',3)

In [None]:
country_wc_win_position(wc_overall,'Argentina',1)

In [None]:
country_wc_win_position(wc_overall,'Argentina',2)