![alt](a.avif)

# OLYMPICS EXPLORATORY DATA ANALYSIS (EDA):

In [None]:
#IMPORTING LIBRARIES:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from warnings import filterwarnings 
filterwarnings('ignore')


In [None]:
# Importing the dataset:
# read the athlete results CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer='athlete_events.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

# COLUMN MEANINGS
![alt](img1.png)

In [None]:
# EXPLORATORY DATA ANALYSIS:

# Print each column's dtype and non-null count, which gives a first look at
# where values are missing.
df.info()

In [None]:
# Statistical summary of the frame (shown as the cell output).
df.describe()

In [None]:
# Count the missing values in every column.
df.isna().sum(axis=0)

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also carries text columns (Team, Medal, Sport, Sex, ... are
# compared against strings elsewhere in this notebook). pandas >= 2.0 no
# longer drops those silently and raises instead, so the numeric columns are
# selected explicitly before correlating.
df.select_dtypes(include='number').corr()

In [None]:
# Which teams have the most medals in total?
# `count()` ignores NaN, so rows without a medal do not contribute.
most_medals = (
    df.groupby(['Team'])['Medal']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# Bar chart of the 15 teams with the highest medal counts.
px.bar(
    most_medals,
    x=most_medals.index,
    y=most_medals.values,
    text=most_medals.values,
    color=most_medals.index,
    labels={'Team':'Country', 'y':'Number of medals'},
    title='Countries with most number of medals',
    template='plotly_dark',
    width=800,
    height=500,
)


In [None]:
# Which teams have the most gold medals?

# Keep only the rows whose medal is gold, then count them per team.
gold = df[df['Medal'].eq('Gold')]
gold_medals = (
    gold.groupby(['Team'])['Medal']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# Bar chart of the 15 teams with the highest gold-medal counts.
px.bar(
    gold_medals,
    x=gold_medals.index,
    y=gold_medals.values,
    text=gold_medals.values,
    color=gold_medals.index,
    labels={'Team':'Country', 'y':'Number of gold medals'},
    title='Countries with most number of gold medals',
    template='plotly_dark',
    width=800,
    height=500,
)

In [None]:
# Distribution of the athletes' ages:
# a 50-bin histogram with a box plot drawn in the margin.
px.histogram(
    df,
    x='Age',
    nbins=50,
    marginal='box',
    opacity=0.8,
    color_discrete_sequence=['#F63366'],
    labels={'Age':'Age of the athletes'},
    title='Distribution of age of the athletes',
    template='plotly_dark',
    width=800,
    height=500,
)

In [None]:
# Correlation between variables:
# ID is only a row identifier, so it is dropped before correlating.
df_corr = df.drop(['ID'], axis=1)

# Only numeric columns can be correlated. The frame still contains text
# columns, and pandas >= 2.0 raises on them instead of skipping them, so the
# numeric columns are selected explicitly.
px.imshow(
    df_corr.select_dtypes(include='number').corr(),
    title='Correlation between variables',
    template='plotly_dark',
    width=800,
    height=500,
)

In [None]:
# Medals awarded per year: the 15 years with the highest medal counts.
year = (
    df.groupby(['Year'])['Medal']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

px.bar(
    year,
    x=year.index,
    y=year.values,
    color=year.index,
    title='Distribution of gained medals in years',
)

In [None]:
# Which sports do athletes shorter than 150 cm take part in, and which sports
# do athletes heavier than 100 kg take part in?
# The two filters are independent; each keeps its 10 most frequent sports.

smaller = df.loc[df['Height'] < 150, 'Sport'].value_counts().head(10)
overweight = df.loc[df['Weight'] > 100, 'Sport'].value_counts().head(10)

# Sports of the athletes below 150 cm.
fig = px.bar(
    smaller,
    x=smaller.index,
    y=smaller.values,
    color=smaller.index,
    labels={'index':'Sports', 'y':'Number of athletes'},
    title='Sports which athletes who are smaller than 150 cm ?',
    width=800,
    height=500,
)
fig.show()

# Sports of the athletes above 100 kg.
fig1 = px.bar(
    overweight,
    x=overweight.index,
    y=overweight.values,
    color=overweight.index,
    labels={'index':'Sports', 'y':'Number of athletes'},
    title='Sports which athletes  who are heavier than 100 kg?',
    width=800,
    height=500,
)
fig1.show()


In [None]:
# Which sports do athletes older than 40 take part in?
# Keep the 10 most frequent sports among rows with Age > 40.
old = df.loc[df['Age'].gt(40), 'Sport'].value_counts().head(10)

# Pie chart of those 10 sports. A bar chart of the same series
# (x=old.index, y=old.values) would be an alternative view.
px.pie(
    old,
    names=old.index,
    values=old.values,
    title='Sports wich athletes who are older than 40 ?',
    template='plotly_dark',
    width=800,
    height=500,
)

In [None]:
# Pairwise relationships between Age, Height and Weight:
# regression fits off the diagonal, kernel density estimates on it.
# NOTE(review): no `hue` is passed, so `palette` presumably has no visible
# effect -- confirm against the seaborn version in use.
sns.pairplot(
    df,
    vars=['Age', 'Height', 'Weight'],
    kind='reg',
    diag_kind='kde',
    palette='husl',
)

In [None]:
# Which athletes have won the most medals?
# Count non-null medals per athlete name and keep the top 15.
atl = (
    df.groupby(['Name'])['Medal']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# Bubble chart: one marker per athlete, sized by medal count.
fig = px.scatter(
    atl,
    x=atl.index,
    y=atl.values,
    size=atl.values,
    size_max=30,
    color=atl.index,
    hover_name="Medal",
    labels={'Name':'Athletes', 'y':'Number of medals'},
    title="Most Medals Gained Athletes",
    template='plotly_dark',
)
fig.show()

In [None]:
# How many entries by women are there per Olympic year?
# Rows are counted, so an athlete entered in several events the same year is
# counted once per event. Only the 15 years with the highest counts are kept.
# This cell must run before the label-encoding cell, because the filter
# compares Sex against the original string value "F".
wmn = df[(df["Sex"] == "F")]["Year"].value_counts().head(15)

# `animation_group="Year"` was removed from this call. No `animation_frame`
# is set, so it had no visual effect, and under pandas >= 2.0 the counts
# column produced by `value_counts()` is named "count", so "Year" is not a
# column of the frame plotly builds from `wmn` and plotly raises.
px.bar(
    wmn,
    x=wmn.index,
    y=wmn.values,
    color=wmn.values,
    text=wmn.values,
    title="Counts of women in olympics for per year",
    labels={"index":"Olympics Date","y":"Number of women athletes","color":"counts"},
    template='plotly_dark',
    color_continuous_scale='aggrnyl',
    width=800,
    height=500,
    opacity=0.8,
)

In [None]:
# In which season have the Olympics been held the most?
# Counts the rows per value of the Season column.
season = df["Season"].value_counts()

px.pie(
    season,
    names=season.index,
    values=season.values,
    title="Seasons of olympics",
    template='plotly_dark',
    width=800,
    height=500,
)

In [None]:
# DATA PREPROCESSING:

# Rename the Team column to Country in place. Cells that still refer to
# 'Team' have to run before this one.
df.rename({'Team':'Country'}, axis='columns', inplace=True)
df.head(n=5)



In [None]:
# Encoding categorical variables:

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Sex column, then overwrite the column with the
# integer codes it assigns.
le = LabelEncoder()
le.fit(df["Sex"])
df["Sex"] = le.transform(df["Sex"])

df.head(n=5)


In [None]:
# Detecting outliers:
# one box plot per numeric column of interest.

vars = df[["Age","Height","Weight","Year"]]

for col_name in vars.columns:
    fig = px.box(
        df,
        y=col_name,
        title=f'Boxplot of {col_name}',
        template='plotly_dark',
        width=800,
        height=500,
    )
    fig.show()


In [None]:
# Counting the outliers in the selected variables:
# a value counts as an outlier here when it lies more than two standard
# deviations away from its column mean.
data = pd.DataFrame(df[["Age","Height","Weight","Year"]])

# Column-wise distance from the mean, compared against 2 * std. Comparisons
# involving NaN are False, so missing values are never counted.
deviation = (data - data.mean()).abs()
outliers_sum = (deviation > 2 * data.std()).sum()

for col_name, n_outliers in outliers_sum.items():
    print(col_name, ":", n_outliers)



In [None]:
# Filling missing values:

from sklearn.impute import SimpleImputer

# Replace NaN with the column median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# The imputer is re-fitted for each column, so every column is filled with
# its own median.
for col_name in ('Age', 'Height', 'Weight'):
    df[col_name] = imputer.fit_transform(df[[col_name]])

# Check which columns still contain missing values.
df.isnull().sum()



In [None]:
# Filling outliers with the mean:
# names of the columns processed by the loop at the end of this cell.

vars=["Age","Height","Weight"]

def fill_outliers_with_mean(df, variable, threshold=1.5):
    """Overwrite the IQR outliers of one column with that column's mean.

    A value is an outlier when it lies below ``Q1 - threshold * IQR`` or
    above ``Q3 + threshold * IQR``. The replacement mean is taken over the
    whole column as it stands before the replacement, outliers included.
    NaN entries fail both comparisons and are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    variable : str
        Name of the column to clean.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    None
        The frame is mutated in place.
    """
    column = df[variable]
    q1, q3 = column.quantile([0.25, 0.75])

    # Width of the tolerated band on each side of the quartiles.
    margin = threshold * (q3 - q1)
    too_low = column < q1 - margin
    too_high = column > q3 + margin

    # Compute the mean before writing, so it reflects the original values.
    replacement = column.mean()
    df.loc[too_low | too_high, variable] = replacement
    
# Clean every listed column in place, one after another.
for col_name in vars:
    fill_outliers_with_mean(df, variable=col_name)


In [None]:
# Visualizing the distribution of the variables after filling outliers:
# the same box plots as before, drawn smaller.

vars = df[["Age","Height","Weight"]]

for col_name in vars.columns:
    fig = px.box(
        df,
        y=col_name,
        title=f'Boxplot of {col_name}',
        template='plotly_dark',
        width=600,
        height=350,
    )
    fig.show()