### 🧠 Section 1: Python Refresher – Core Concepts


In [None]:
# 📦 Variables & Data Types
car = "McLaren"
laps = 58
avg_speed = 210.5
pit_stopped = True

print(f"Car: {car}, Laps: {laps}, Avg Speed: {avg_speed} km/h, Pit Stop? {pit_stopped}")

# conditional statements
# Loops


In [37]:
# 1.2 Data Structures

# 🧾 List of F1 teams
# Ordered, mutable collections
teams = ["McLaren", "Mercedes", "Red Bull", "Ferrari", "Alpine"]
print("Teams:", teams)

# list comprehensions
## Concise way to transform or filter lists
upper_teams = [team.upper() for team in teams]
print(upper_teams)

Teams: ['McLaren', 'Mercedes', 'Red Bull', 'Ferrari', 'Alpine']
['MCLAREN', 'MERCEDES', 'RED BULL', 'FERRARI', 'ALPINE']


In [24]:
# tuples
# Immutable sequences, often used for fixed values (like coordinates or paired data).

point = (10, 20)
print("Tuple:",point[0]) 

# Use in ML for (feature, label) pairs:
data = [(0.5, 1), (0.7, 0), (0.2, 1)]

Tuple: 10


In [35]:
# Dictionaries
# Key-value mappings — very common in ML for config, record storage, etc.

# 🗂️ Dictionary mapping drivers to teams
drivers = {
    "Norris": "McLaren",
    "Hamilton": "Mercedes",
    "Verstappen": "Red Bull",
    "Leclerc": "Ferrari"
}
print("Dictionary: ",drivers['Norris'])  # Accessing value by key

# Can also store nested data:
student = {
    "name": "Alice",
    "scores": {"math": 90, "science": 85}
}
print("Nested Structure:" ,student["scores"]["math"])


Dictionary:  McLaren
Nested Structure: 90


In [28]:
# Nested & access patterns
# You often deal with nested data (e.g., JSON, ML outputs, grouped stats).

# List of Dicts
dataset = [
    {"id": 1, "score": 80},
    {"id": 2, "score": 95}
]

# Get all scores
scores = [row["score"] for row in dataset]
print("List of Dict",scores)

# Dict of Lists
class_scores = {
    "Alice": [85, 90],
    "Bob": [70, 75]
}

# Average score for Alice
avg = sum(class_scores["Alice"]) / len(class_scores["Alice"])
print("Dict of Lists",avg) 

# Mixed Nesting
experiment = {
    "run1": {"accuracy": 0.91, "loss": 0.2},
    "run2": {"accuracy": 0.89, "loss": 0.3}
}

# Get best accuracy
best = max([v["accuracy"] for v in experiment.values()])
print("Mixed Nesting",best)

List of Dict [80, 95]
Dict of Lists 87.5
Mixed Nesting 0.91


In [31]:
# sets
# Unordered, unique collections — great for deduplication or fast membership testing.

labels = [1, 0, 1, 0, 1]
unique_labels = set(labels)
print("Sets:",unique_labels)  

# Check if a label exists
print("Label Exists:",1 in unique_labels) 

Sets: {0, 1}
Label Exists: True


In [36]:

# 🔁 Looping through a list
for team in teams:
    print("Team:", team)

# 🔁 Looping through dictionary
for driver, team in drivers.items():
    print(f"{driver} drives for {team}")

Team: McLaren
Team: Mercedes
Team: Red Bull
Team: Ferrari
Team: Alpine
Norris drives for McLaren
Hamilton drives for Mercedes
Verstappen drives for Red Bull
Leclerc drives for Ferrari


In [38]:
# 🧠 Define a function to convert speed from km/h to mph
def kmh_to_mph(speed_kmh):
    return round(speed_kmh * 0.621371, 2)

print("210 km/h is", kmh_to_mph(210), "mph")


# 🎯 Function to describe a driver
def describe_driver(name, team, speed):
    mph = kmh_to_mph(speed)
    return f"{name} drives for {team} at {speed} km/h ({mph} mph)."

print(describe_driver("Norris", "McLaren", 210))


210 km/h is 130.49 mph
Norris drives for McLaren at 210 km/h (130.49 mph).


### 📥 Section 2: Data Handling with Pandas

#### Load the Dataset

In [None]:

# Load required packages
import pandas as pd

# Load the dataset
df = pd.read_csv(r"C:\Users\p.muthusenapathy\VSCode_Projects\Python_Training\datasets\F1 data.csv")
# Use double backslashes or raw string for Windows file paths to avoid unicode errors
# Preview the first few rows
df.head()

#### Inspect Data

In [None]:
# Basic info
df.info()

In [None]:
# Summary statistics for numeric and non-numeric columns
df.describe(include='all')

In [None]:
# List all columns
print("Columns:", df.columns.tolist())

#### Clean Missing Values

In [None]:
# Count missing values
missing = df.isnull().sum()
print("Missing values:\n", missing)

In [None]:
# Option 1: Drop rows with critical missing values
df_cleaned = df.dropna(subset=["points", "milliseconds"])

# Option 2: Fill missing values for less important columns
df_cleaned["fastestLapSpeed"] = df_cleaned["fastestLapSpeed"].fillna(df_cleaned["fastestLapSpeed"].mean())

#### Group By Team or Driver

In [None]:
# Average points per team
team_avg_points = df_cleaned.groupby("Constructor name")["points"].mean().sort_values(ascending=False)
print("Avg Points by Team:\n", team_avg_points)

# Total laps per driver
laps_per_driver = df_cleaned.groupby("Driver's surname")["laps"].sum().sort_values(ascending=False)
print("Total Laps by Driver:\n", laps_per_driver)

In [None]:
import matplotlib.pyplot as plt

# Bar chart for average points
team_avg_points.head(10).plot(kind="bar", figsize=(10, 5), title="Top Teams by Avg Points", ylabel="Points")
plt.grid(True)
plt.show()

### 🧮 Section 3: Exploratory Data Analysis (EDA)

#### Calculate Descriptive Satistics

In [None]:
df_cleaned.describe()

np.float64(1687141.5922702271)

In [69]:
# Median lap time
df_cleaned['milliseconds'].median()

# Standard deviation of lap times
df_cleaned['milliseconds'].std()

np.float64(1687141.5922702274)

#### Visualize Distributions

In [None]:
### Histogram + KDE for Lap Times
# Visualizing the distribution of lap times using a histogram with a kernel density estimate (KDE).


import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.histplot(df_cleaned['milliseconds'].dropna(), bins=30, kde=True, color='skyblue')
plt.title('Distribution of Lap Times')
plt.xlabel('Lap Time (milliseconds)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
### Boxplot for Team Comparison
## Compare lap times across teams using a boxplot.
#This shows median, interquartile range, and outliers per team.



plt.figure(figsize=(12, 6))
sns.boxplot(x='Constructor name', y='milliseconds', data=df_cleaned, palette='Set2')
plt.xticks(rotation=45)
plt.title('Lap Time Distribution by Team')
plt.ylabel('Lap Time (ms)')
plt.show()

In [None]:
## Plot Lap Times Over Race Laps
## This helps analyze performance consistency across the race.

# Ensure lap is numeric
df_cleaned['laps'] = pd.to_numeric(df_cleaned['laps'], errors='coerce')

# Filter McLaren and Mercedes
df_filtered = df_cleaned[df_cleaned['Constructor name'].isin(['McLaren', 'Mercedes'])]

plt.figure(figsize=(14, 6))
sns.lineplot(
    data=df_filtered,
    x='laps',
    y='milliseconds',
    hue='Constructor name',
    estimator='mean',
    ci=None,
    palette='Set2'
)
plt.title('Average Lap Time per Lap: McLaren vs. Mercedes')
plt.xlabel('Lap Number')
plt.ylabel('Lap Time (ms)')
plt.grid(True)
plt.show()

### ⚙️ Section 4: Feature Engineering Basics

What is Feature Engineering?
Feature Engineering is the process of creating, transforming, or selecting the right input variables (features) from raw data that help a machine learning model learn patterns effectively.

Think of it as preparing your dataset to make it more meaningful and useful for predictions.

✅ Good features → Better model accuracy
✅ Simplifies complexity
✅ Reduces noise
✅ Highlights relationships

In [None]:
# 1. Create New Columns
# Example: average_speed from milliseconds and lap distance
# Assume each lap is ~5.8 km (you can change based on actual circuit data):

# Avoid division by zero
df_cleaned = df_cleaned[df_cleaned['milliseconds'] > 0]

# Create new column: average speed (km/h)
df_cleaned['avg_speed_kph'] = (5.8 / (df_cleaned['milliseconds'] / 1000)) * 3600
df_cleaned[['laps', 'Constructor name', 'milliseconds', 'avg_speed_kph']].head()

In [None]:
# 2. Encode Categorical Variables
# 2.1 One-Hot Encode Constructor name (for ML models)
df_encoded = pd.get_dummies(df_cleaned, columns=['Constructor name'], prefix='team')
df_encoded.head()

In [None]:
# 2.2 Label Encode Constructor Names (optional)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Constructor_encoded'] = le.fit_transform(df['Constructor name'].astype(str))
df[['Constructor name', 'Constructor_encoded']].drop_duplicates().head()

In [None]:
# 3. Handle Missing Values

# Check missing values
df.isnull().sum()

# Fill numeric columns with median
df['milliseconds'] = df['milliseconds'].fillna(df['milliseconds'].median())
df['avg_speed_kph'] = df_cleaned['avg_speed_kph'].fillna(df_cleaned['avg_speed_kph'].median())

# For advanced use cases: you can use KNN imputation or domain-specific logic.

In [None]:
# 4. Scale Numeric Features
# Use MinMaxScaler or StandardScaler.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['milliseconds_scaled', 'avg_speed_scaled']] = scaler.fit_transform(df[['milliseconds', 'avg_speed_kph']])
df[['milliseconds', 'milliseconds_scaled', 'avg_speed_kph', 'avg_speed_scaled']].head()

In [None]:
# 5. Feature Correlation
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.heatmap(df[['milliseconds', 'avg_speed_kph', 'points']].corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()