In [None]:


import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)


In [None]:
# Read the IMO results CSV from the Kaggle input directory into `df`.
df = pd.read_csv("/kaggle/input/imo-scores/imo_results.csv")
# Bare expression: the notebook renders the loaded frame as the cell output.
df

### understanding data and quick overview of dataset

In [None]:
df.info()

### Drop NaN values, fill NaN values, and pattern recognition

In [None]:
## The contestant name columns are not needed for this analysis, so drop both of them.
name_columns = ['firstname', 'lastname']
df = df.drop(columns=name_columns)
df

In [None]:
df['award'].unique()

In [None]:
## to fill nan values first we have to identify range of each medals
import matplotlib.pyplot as plt
import seaborn as sns

### Recognize pattern to Fill NaN values

In [None]:
## let's draw scatter plot for all 3 award categories
def getAwardData(df,a):
    """Return the rows of ``df`` whose ``award`` column equals ``a``."""
    matches_award = df['award'].eq(a)
    return df.loc[matches_award]

# Split the contestants by award so each class can be drawn in its own colour.
# The helper defined above is reused instead of repeating the filter expression.
# These names are kept because later cells read `bronze` and `honor_mention`.
gold = getAwardData(df, "Gold medal")
silver = getAwardData(df, "Silver medal")
bronze = getAwardData(df, "Bronze medal")
honor_mention = getAwardData(df, "Honourable mention")
# Rows with no recorded award; these are the ones to be filled in later.
nan_award = df[pd.isnull(df['award'])]

# let's draw: rank on the x-axis, total score on the y-axis, one colour per class
plt.scatter(gold['rank'],gold['total'],color="g",label="gold")
plt.scatter(silver['rank'],silver['total'],color="y",label="silver")
plt.scatter(bronze['rank'],bronze['total'],color="b",label="bronze")
plt.scatter(honor_mention['rank'],honor_mention['total'],color="orange",label="Honour mention")
# The unlabelled rows are drawn as well, because the observation that follows
# this cell is about where those rows fall relative to the labelled classes.
plt.scatter(nan_award["rank"],nan_award["total"],color="r",label="nan vals")
plt.xlabel("rank")
plt.ylabel("total")
plt.legend()
plt.show()

Most of the NaN values fall within the honourable mention and bronze medal classes, so let's find the boundary between honourable mention and bronze.

To do that, take only the bronze and honourable mention classes into consideration.

In [None]:
# Scatter only the bronze and honourable-mention rows (rank vs total).
fig, ax = plt.subplots()
for frame, colour, label in (
    (bronze, "b", "bronze"),
    (honor_mention, "orange", "Honour mention"),
):
    ax.scatter(frame['rank'], frame['total'], color=colour, label=label)
ax.legend()
plt.show()

In [None]:
# Side-by-side boxplots comparing bronze medals with honourable mentions.
# Subplot positions are given as integers: a string spec such as "121" is
# rejected by current Matplotlib releases.
fig = plt.figure(figsize=(30,10))
award_order = ['Bronze medal', "Honourable mention"]

# Left panel: distribution of the total score per class.
ax_total = fig.add_subplot(1, 2, 1)
sns.boxplot(x="award", y="total", order=award_order, data=df, ax=ax_total)
ax_total.set_title("Based on total values")

# Right panel: distribution of the rank per class.
ax_rank = fig.add_subplot(1, 2, 2)
sns.boxplot(x="award", y="rank", order=award_order, data=df, ax=ax_rank)
ax_rank.set_title("Based on rank values")

## observation :

range of bronze based on total : 8 to 26

range of mention based on total : 3 to 17

avg upper bound val of total : 22




range of bronze based on rank : 50 to 250

range of mention based on rank : 140 to 480

avg upper bound val of rank : 320

In [None]:
# Print the 25th and 75th percentiles of `total` and `rank` for the bronze and
# honourable-mention classes.
quartile_rows = (
    ("total bronze  lower : ", "Bronze medal", "total"),
    ("total mention  lower : ", "Honourable mention", "total"),
    ("rank bronze   lower : ", "Bronze medal", "rank"),
    ("rank mention  lower : ", "Honourable mention", "rank"),
)
for prefix, award_name, column in quartile_rows:
    values = df.loc[df['award'] == award_name, column]
    print(prefix, values.quantile(0.25), "  upper val : ", values.quantile(0.75))

In [None]:
dff = df.copy()

In [None]:
# Label still-unlabelled rows as bronze when total is above 16 and rank is below 273.
bronze_missing = dff['award'].isna()
bronze_like = dff["total"].gt(16) & dff["rank"].lt(273)
dff.loc[bronze_missing & bronze_like, "award"] = "Bronze medal"

In [None]:
# Label the rows that are still unlabelled as honourable mention when total is
# above 9 and rank is below 440.
mention_missing = dff['award'].isna()
mention_like = dff["total"].gt(9) & dff["rank"].lt(440)
dff.loc[mention_missing & mention_like, "award"] = "Honourable mention"

In [None]:
## Remove the records that still contain NaN values.
## Note: dropna() with its defaults drops a row when ANY column is NaN, so this
## removes more than just the rows whose award could not be filled in.
## `df` is rebound here to the cleaned frame; `dff` keeps the filled-in version.
df = dff.dropna()

### Data engineering : remove outliers

In [None]:
# let's draw simple boxplot to identify outliers in dataset

def drawBoxplot(d):
    """Draw side-by-side boxplots of ``total`` and ``rank`` grouped by ``award``.

    Parameters
    ----------
    d : DataFrame
        Must contain the ``award``, ``total`` and ``rank`` columns.
    """
    fig = plt.figure(figsize=(25,10))

    # Subplot positions are integers: a string spec such as "121" is rejected
    # by current Matplotlib releases.
    ax_total = fig.add_subplot(1, 2, 1)
    sns.boxplot(x="award",y="total",data=d,ax=ax_total)
    ax_total.set_title("Boxplot for total")

    ax_rank = fig.add_subplot(1, 2, 2)
    sns.boxplot(x="award",y="rank",data=d,ax=ax_rank)
    ax_rank.set_title("Boxplot for rank")


drawBoxplot(df)

In [None]:
df['total'].hist()

In [None]:
from scipy.stats import zscore

## we are going to use zscore to remove outliers from dataset
## `total` is standardised within each award class (sample standard deviation, ddof=1).
## `assign` builds a new frame rather than writing a column into `df` in place,
## which avoids pandas' SettingWithCopyWarning when `df` was derived from another frame.
df = df.assign(zscore=df.groupby("award")['total'].transform(lambda x : zscore(x,ddof=1)))
df

In [None]:
# Keep the rows whose z-score lies strictly inside (-3, 3); rows strictly
# outside that band are collected separately as outliers.
abs_z = df['zscore'].abs()
main_points = df[abs_z < 3]
outliers = df[abs_z > 3]

drawBoxplot(main_points)
plt.show()


You can observe that most of the outliers are removed 

In [None]:
df = main_points.copy()

## Let's apply statistical analysis to measure the relationship between the categorical features award and country


#### let's apply chi-square test to find whether country and award are dependent or not ?

In [None]:
from scipy.stats import chi2_contingency

source link  : https://www.geeksforgeeks.org/python-pearsons-chi-square-test/

Expected Values Table :

Next, we prepare a similar table of calculated(or expected) values. To do this we need to calculate each item in the new table as :

![image.png](attachment:image.png)

$$E_{ij} = \frac{(\text{total of row } i) \times (\text{total of column } j)}{\text{grand total}}$$

In [None]:
ct = pd.crosstab(df['country'],df['award'])
ct

In [None]:
## Chi-square test of independence on the country x award contingency table `ct`.
stat,p,dof,expected = chi2_contingency(ct)
## Reject H0 (independence) at significance level alpha = 0.05 when p <= alpha.
if p<=0.05:
    print("Both are dependent")
else :
    print("Independent")
## stat : the chi-square test statistic
## dof is the degrees of freedom = ( No. of rows - 1 ) * (No. of cols  - 1 ); the author records 115*3 = 345 for this table
## H0 : country and award are independent. H0 is rejected (the two are treated as dependent) when p <= alpha (0.05).
## If the p-value is strictly greater than alpha, we fail to reject H0, i.e. there is no evidence of dependence.


In [None]:
print(expected)

In [None]:
## The two features depend on each other, which could lead to bad predictions if both were fed to an ML algorithm.
## Here we only want to analyse the data, not predict, so we can ignore that.

In [None]:
print(p)

## Data Visualization

In [None]:

sns.set_style("darkgrid")
def countrywiseTotalaward(award="all",col="country",country="all"):
    """Show a count plot of the module-level ``df`` by country or by award.

    award   : award label to keep when ``col == "country"``; "all" keeps every row.
    col     : "country" plots the ten most frequent countries, "award" plots the
              count of each award. Any other value does nothing.
    country : country code to keep when ``col == "award"``; "all" keeps every row.
    """
    if col == "country":
        rows = df.copy() if award == "all" else df[df['award'] == award]
        # Only the ten countries with the most rows are drawn.
        top_countries = rows["country"].value_counts().iloc[:10].index
        sns.countplot(x="country", data=rows, order=top_countries)
        plt.title(f"countries countplot of {award}")
        plt.show()
        return

    if col == "award":
        rows = df.copy() if country == "all" else df[df['country'] == country]
        sns.countplot(x="award", data=rows)
        plt.title(f"Awards countplot of {country}")
        # Every row except honourable mentions counts towards the medal total.
        medal_rows = rows[rows['award'] != "Honourable mention"]
        print("Total medals : ", len(medal_rows))
        plt.show()
        

In [None]:
countrywiseTotalaward()

In [None]:
df['award'].unique()

In [None]:
countrywiseTotalaward(col="award",country="IND")

In [None]:
countrywiseTotalaward(col="award",country="CHN")

In [None]:
countrywiseTotalaward(col="award",country="USA")

In [None]:
countrywiseTotalaward(col="award",country="RUS")

In [None]:
df['year'] = pd.to_datetime(df['year'],format="%Y")
df['year'].dtype

In [None]:
df

In [None]:
def yearWiseGrowthofCountriesForMedals(country,medal = "all"):
    """Plot how many awards a country won in each year, using the module-level ``df``.

    country : country code compared against ``df['country']``.
    medal   : exact award label to count (e.g. "Gold medal"); the default "all"
              counts every award except "Honourable mention".

    Prints a message and returns without plotting when no rows match.
    """
    country_rows = df[df['country']==country]

    if medal != "all":
        awarded = country_rows[country_rows['award']==medal]
        # The award label is used as-is in the title: labels such as
        # "Gold medal" already end in "medal".
        what = medal
    else:
        awarded = country_rows[country_rows['award']!="Honourable mention"]
        what = "all medals"

    if awarded.empty:
        # Plotting an empty frame raises, so report the situation instead.
        print(f"No {what} rows found for country {country}")
        return

    # value_counts() orders by frequency; sort by year so the line runs chronologically.
    per_year = pd.DataFrame(awarded['year'].value_counts().sort_index())
    per_year.plot(marker=">",figsize=(12,8))
    plt.title(f"Year wise Growth of country {country} for {what}")
    

In [None]:
yearWiseGrowthofCountriesForMedals("IND",medal = "Silver medal")

In [None]:
yearWiseGrowthofCountriesForMedals("CHN",medal = "Gold medal")

In [None]:
df[(df['country']=="IND") & (df['year']=="2008")]

#### Let's see which problems were difficult ?

In [None]:
## Let's see which problem was difficult:
## collect the average mark of each of the six problems into d1.
problem_columns = [f"problem{n}" for n in range(1, 7)]
d1 = {name: df[name].mean() for name in problem_columns}

In [None]:
d1.values()

In [None]:
## Re-order d1 from the highest average mark to the lowest.
ascending = sorted(d1.items(), key=lambda item: item[1])
d1 = dict(reversed(ascending))
d1

In [None]:
# Bar chart of the average mark per problem; bars follow the order of d1.
# Keys and values are materialised as lists so the call does not depend on
# Matplotlib accepting dict views, and the axes are labelled so the figure
# can be read on its own.
plt.bar(x=list(d1.keys()), height=list(d1.values()))
plt.xlabel("problem")
plt.ylabel("average mark")
plt.title("Average mark per problem")
plt.show()

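Reading the chart: a lower average mark indicates a harder problem, and a higher average mark an easier one. The original conclusion drawn here was that problem 1 and problem 4 are more difficult than the others; check this against the chart, because if problem 1 and problem 4 have the tallest bars they are the easiest problems, not the hardest.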