In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [2]:
# Step 2: Load the Data
# Load the dataset
url = 'https://raw.githubusercontent.com/rashida048/Datasets/master/olympics.csv'
df = pd.read_csv(url)


In [3]:
# Step 3: Understand the Data

# Display the first few rows
print(df.head())

# Get basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


                   0         1     2     3     4      5         6     7     8  \
0                NaN  № Summer  01 !  02 !  03 !  Total  № Winter  01 !  02 !   
1  Afghanistan (AFG)        13     0     0     2      2         0     0     0   
2      Algeria (ALG)        12     5     2     8     15         3     0     0   
3    Argentina (ARG)        23    18    24    28     70        18     0     0   
4      Armenia (ARM)         5     1     2     9     12         6     0     0   

      9     10       11    12    13    14              15  
0  03 !  Total  № Games  01 !  02 !  03 !  Combined total  
1     0      0       13     0     0     2               2  
2     0      0       15     5     2     8              15  
3     0      0       41    18    24    28              70  
4     0      0       11     1     2     9              12  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------ 

In [4]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# For simplicity, we might drop rows with missing values
df.dropna(inplace=True)  # Alternatively, you can use df.fillna() for imputation


0     1
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


In [5]:
# Step 5: Data Visualization
# Distribution of medals by type
medal_count = df['medal'].value_counts()
sns.barplot(x=medal_count.index, y=medal_count.values)
plt.title('Distribution of Medals')
plt.show()

# Number of athletes by country
top_countries = df['country'].value_counts().head(10)
sns.barplot(y=top_countries.index, x=top_countries.values)
plt.title('Top 10 Countries by Number of Athletes')
plt.show()

# Distribution of athletes by age
sns.histplot(df['age'], kde=True, bins=30)
plt.title('Distribution of Athletes\' Ages')
plt.show()

# Distribution of medals by sport
top_sports = df['sport'].value_counts().head(10)
sns.barplot(y=top_sports.index, x=top_sports.values)
plt.title('Top 10 Sports by Number of Medals')
plt.show()


KeyError: 'medal'

In [6]:
# Step 6: Univariate Analysis
# Distribution of ages
sns.histplot(df['age'], kde=True, bins=30)
plt.title('Distribution of Ages')
plt.show()

# Distribution of heights
sns.histplot(df['height'], kde=True, bins=30)
plt.title('Distribution of Heights')
plt.show()

# Distribution of weights
sns.histplot(df['weight'], kde=True, bins=30)
plt.title('Distribution of Weights')
plt.show()


KeyError: 'age'

In [7]:
# Step 7: Bivariate Analysis
# Age vs Height
sns.scatterplot(x='age', y='height', data=df)
plt.title('Age vs Height')
plt.show()

# Age vs Weight
sns.scatterplot(x='age', y='weight', data=df)
plt.title('Age vs Weight')
plt.show()

# Medal count by country
plt.figure(figsize=(12, 8))
sns.countplot(x='country', data=df, order=df['country'].value_counts().index[:10])
plt.title('Medal Count by Country')
plt.xticks(rotation=90)
plt.show()


ValueError: Could not interpret value `age` for parameter `x`

In [None]:
# Step 8: Multivariate Analysis
# Pair plot for selected features
sns.pairplot(df[['age', 'height', 'weight', 'medal']])
plt.show()


In [8]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in Age
sns.boxplot(x=df['age'])
plt.title('Boxplot of Ages')
plt.show()

# Removing outliers from Age
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['age'] < (Q1 - 1.5 * IQR)) | (df['age'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in Height
sns.boxplot(x=df['height'])
plt.title('Boxplot of Heights')
plt.show()


KeyError: 'age'

In [9]:
# Step 10: Feature Engineering
# Create additional features or normalize data if needed
# For example, normalize age
df['age_normalized'] = (df['age'] - df['age'].mean()) / df['age'].std()


KeyError: 'age'

In [10]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Medal distribution
medal_dist = df['medal'].value_counts()
print(f"Medal Distribution:\n{medal_dist}")

# Average age, height, and weight of medal winners
avg_age = df['age'].mean()
avg_height = df['height'].mean()
avg_weight = df['weight'].mean()
print(f"Average Age of Athletes: {avg_age}")
print(f"Average Height of Athletes: {avg_height}")
print(f"Average Weight of Athletes: {avg_weight}")

# Top countries by number of athletes
top_countries = df['country'].value_counts().head(10)
print(f"Top Countries by Number of Athletes:\n{top_countries}")

# Insights from scatter plots
print("The relationship between age, height, and weight of athletes can provide insights into the physical characteristics associated with success in various sports.")


Key Insights:


KeyError: 'medal'

Findings:
1. Medal Distribution: The dataset provides insights into the distribution of medals across different countries and sports.
2. Athlete Characteristics: Analyzing the distribution of ages, heights, and weights offers a view of the physical characteristics of athletes.
3. Top Countries and Sports: Identifying the top countries by number of athletes and the top sports by number of medals provides insights into the global distribution of athletic talent and success.
4. Correlations: Understanding the relationships between age, height, weight, and medal counts can help in analyzing the factors contributing to athletic success.