In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Notebook-wide display configuration: never truncate the column list when a
# frame is rendered, and draw every figure with seaborn's "whitegrid" style.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

# Read the medals table into `df`, the frame every later cell works from.
# The file is decoded as latin1 rather than pandas' default encoding.
df = pd.read_csv(
    'data/Summer-Olympic-medals-1976-to-2008.csv',
    encoding='latin1',
)
df.head()


## 🧹 2. Data Cleaning

In [None]:
# Clean the loaded frame into the analysis-ready `df` used by every later cell.
#
# The steps are written as a single non-mutating chain so the cell can be
# re-run safely: `errors='ignore'` stops the column drop from raising KeyError
# when the columns were already removed by an earlier run.
df = (
    df
    # Columns that are not used by the analysis or the model.
    .drop(columns=['Event_gender', 'Country_Code'], errors='ignore')
    # Remove every row with a missing value. This also removes rows that are
    # entirely empty, so no separate `how='all'` pass is needed.
    .dropna()
    # Cast only after the missing values are gone: NaN cannot be converted to
    # int, so casting first would raise on any partially filled row.
    .astype({'Year': int})
)

df.info()


## 📊 3. Exploratory Data Analysis (EDA)

In [None]:
# 3.1 Top 10 Countries by Medal Count
# Count the rows recorded for each country and keep the ten largest counts.
medals_by_country = df['Country'].value_counts().head(10)

# Draw on the Axes pandas returns instead of going through pyplot's
# "current axes" state.
ax = medals_by_country.plot.bar(figsize=(10, 6), color='gold', rot=45)
ax.set_title("Top 10 Countries by Total Medals")
ax.set_xlabel("Country")
ax.set_ylabel("Medals")
plt.show()


In [None]:
# 3.2 Medal Distribution Over Years
# Rows per year, ordered chronologically by sorting on the year index.
medals_over_years = df['Year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x=medals_over_years.index,
    y=medals_over_years.values,
    marker='o',
    ax=ax,
)
ax.set(
    title="Total Medals Over the Years",
    xlabel="Year",
    ylabel="Number of Medals",
)
ax.grid(True)
plt.show()


In [None]:
# 3.3 Gender Distribution
# Rows per gender value; value_counts() orders them from most to least frequent.
gender_dist = df['Gender'].value_counts()

# Offset only the first (most frequent) wedge. The list is built from the
# number of categories because matplotlib raises ValueError when `explode`
# and the plotted values differ in length, which a hard-coded two-element
# list does whenever the column holds anything other than two categories.
explode = [0.05 if position == 0 else 0 for position in range(len(gender_dist))]

ax = gender_dist.plot.pie(
    autopct='%1.1f%%',
    colors=['#ff9999', '#66b3ff'],
    explode=explode,
    figsize=(6, 6),
)
ax.set_title("Gender Distribution")
# Hide the y-label pandas would otherwise draw beside the pie.
ax.set_ylabel('')
plt.show()


In [None]:
# 3.4 Top 10 Athletes
# Count the rows recorded for each athlete and keep the ten largest counts.
top_athletes = df['Athlete'].value_counts().head(10)

# Tick labels are tilted to 75 degrees via `rot`.
ax = top_athletes.plot.bar(figsize=(10, 6), color='silver', rot=75)
ax.set_title("Top 10 Athletes by Medal Count")
ax.set_xlabel("Athlete")
ax.set_ylabel("Medals")
plt.show()


## 🤖 4. Logistic Regression Model

In [None]:
# Fit and evaluate a logistic regression that predicts the medal type from
# country, sport and gender.

# Encode on a copy so the cleaned `df` keeps its readable string values.
feature_cols = ['Country', 'Sport', 'Gender']
df_encoded = df.copy()

# One encoder per column. A single shared LabelEncoder is refit on every
# fit_transform call, so only the last column's classes would remain
# available for inverse_transform.
encoders = {}
for col in feature_cols:
    encoders[col] = LabelEncoder()
    df_encoded[col] = encoders[col].fit_transform(df_encoded[col])
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# one-hot encoding is usually a better fit for a linear model.

# Target: the medal type, with a distinct class label per medal. Mapping all
# three medals to the same value leaves a single class, and
# LogisticRegression.fit raises ValueError when given only one class.
medal_classes = {'Gold': 1, 'Silver': 2, 'Bronze': 3}
df_encoded['Medal'] = df_encoded['Medal'].map(medal_classes)
# .map() turns any value missing from the dict into NaN; fail here with a
# clear message instead of later inside fit().
assert df_encoded['Medal'].notna().all(), "Medal column has values outside medal_classes"

# Features and Target
X = df_encoded[feature_cols]
y = df_encoded['Medal']

# Train-test split: 30% held out, seeded for reproducibility, and stratified
# so both parts keep the same proportion of each medal class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Logistic Regression (max_iter raised to 1000 to give the solver room to converge)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


## 📌 5. Conclusion & Observations

- **Top Performing Countries**: United States, Russia, China consistently lead in medals.
- **Gender Distribution**: Men account for a larger share of the medals than women, though female representation is rising.
- **Top Athlete**: Michael Phelps stands out with unmatched medal counts.
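- **Model Prediction**: A basic logistic regression model on country, sport, and gender serves as a simple baseline. Because every row in the dataset is a medal winner, the model can at most distinguish between medal types, not predict whether a medal is won; refer to the evaluation output above for its actual accuracy.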

---

📁 Dataset: Summer Olympics 1976–2008  
📈 Powered by Python, pandas, seaborn, and scikit-learn.
