<a href="https://colab.research.google.com/github/ranggaadinugraha/bmw-sales-data-analysis/blob/main/BMW_Sales_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BMW Sales Data Analysis (2010–2024)

This project focuses on exploratory data analysis (EDA) and data visualization of BMW sales data from 2010 to 2024. The objective is to uncover sales trends, pricing patterns, regional performance, and relationships between vehicle specifications and sales volume using Python.

🔷 Import Libraries & Load Dataset

In [None]:
# Import Required Libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence only deprecation-style warnings so cell output stays readable.
# A blanket filterwarnings("ignore") would also hide every other category
# (RuntimeWarning, UserWarning, ...), which can point at real problems.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Set global visualization style
sns.set_theme(style="ticks", palette="deep")

In [None]:
# Load Dataset

# Read the sales records from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="BMW sales data (2010-2024).csv")

print("Dataset successfully loaded!")

# Preview the first five rows
display(df.head(n=5))


🔷 Initial Data Understanding

In [None]:
# Basic Dataset Overview
#
# A notebook cell renders only its final expression, so each intermediate
# summary is passed to display() explicitly; otherwise its result would be
# computed and then discarded without ever being shown.

# Dataset structure (info() prints its report itself)
df.info()

# Missing values check
display(df.isnull().sum())

# Target variable distribution
display(df['Sales_Classification'].value_counts())

# Statistical summary (last expression, rendered as the cell output)
df.describe()


🔷 Feature Engineering – Engine Size Categorization

In [None]:
# Feature Engineering

# Create categorical labels for engine size:
#   [0, 2.0] -> Compact, (2.0, 3.5] -> Mid-Size, above 3.5 -> High Performance
# The top bin is open-ended so that an engine larger than 5.5 still receives
# a label instead of silently turning into NaN.
engine_bins = [0, 2.0, 3.5, np.inf]
engine_labels = ['Compact', 'Mid-Size', 'High Performance']

df['Engine_Category'] = pd.cut(
    df['Engine_Size_L'],
    bins=engine_bins,
    labels=engine_labels,
    include_lowest=True  # make the first bin closed on the left so 0 is included
)

# dropna=False surfaces any rows that could not be binned (missing or negative sizes)
df['Engine_Category'].value_counts(dropna=False)


🔷 Distribution Analysis (Core Numerical Variables)

In [None]:
# Distribution Analysis

# Column -> panel title; one histogram (with KDE overlay) per entry
histogram_titles = {
    'Price_USD': 'Price Distribution (USD)',
    'Mileage_KM': 'Mileage Distribution (KM)',
    'Sales_Volume': 'Sales Volume Distribution',
    'Engine_Size_L': 'Engine Size Distribution (L)',
}

# 2x2 grid, walked row by row
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()

for panel, (column, heading) in zip(axes, histogram_titles.items()):
    sns.histplot(df, x=column, bins=35, kde=True, color="#4C72B0", ax=panel)
    panel.set_title(heading)

fig.tight_layout()
plt.show()


🔷 Categorical Feature Analysis

In [None]:
# Categorical Distribution Analysis

categorical_columns = [
    'Model',
    'Fuel_Type',
    'Transmission',
    'Region',
    'Color'
]

for col in categorical_columns:
    # Most frequent category first
    level_order = df[col].value_counts().index

    fig, ax = plt.subplots(figsize=(10, 4))
    # seaborn >= 0.13 deprecates `palette` without `hue`; mapping the x variable
    # to hue (same order, legend suppressed) gives the same per-bar colouring.
    sns.countplot(
        data=df,
        x=col,
        hue=col,
        order=level_order,
        hue_order=level_order,
        palette="crest",
        legend=False,
        ax=ax
    )
    ax.set_title(f"Distribution of Vehicles by {col}")
    ax.tick_params(axis='x', rotation=40)
    plt.show()


🔷 Price Analysis Across Categories

In [None]:
# Price Comparison Analysis

fig, axes = plt.subplots(2, 2, figsize=(16,12))
axes = axes.flatten()

price_categories = ['Model', 'Region', 'Fuel_Type', 'Transmission']

for ax, col in zip(axes, price_categories):
    # seaborn >= 0.13 deprecates `palette` without `hue`; mapping the x variable
    # to hue (legend suppressed) keeps one colour per box.
    sns.boxplot(
        data=df,
        x=col,
        y='Price_USD',
        hue=col,
        palette="coolwarm",
        legend=False,
        ax=ax
    )
    ax.set_title(f"Price Distribution by {col}")
    ax.tick_params(axis='x', rotation=35)

plt.tight_layout()
plt.show()


🔷 Relationship Analysis (Business Insight Focus)

In [None]:
# Relationship Analysis

# One entry per panel, in row-major order:
# (x column, y column, hue column, palette, title)
scatter_panels = [
    ('Engine_Size_L', 'Price_USD', 'Fuel_Type', "Set3", "Price vs Engine Size"),
    ('Mileage_KM', 'Price_USD', 'Transmission', "Set1", "Price vs Mileage"),
    ('Price_USD', 'Sales_Volume', 'Sales_Classification', "Dark2", "Sales Volume vs Price"),
    ('Engine_Size_L', 'Sales_Volume', 'Engine_Category', "tab10", "Sales Volume vs Engine Size"),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for panel, (x_col, y_col, hue_col, colors, heading) in zip(axes.flat, scatter_panels):
    sns.scatterplot(
        data=df,
        x=x_col,
        y=y_col,
        hue=hue_col,
        palette=colors,
        ax=panel,
    )
    panel.set_title(heading)

fig.tight_layout()
plt.show()


🔷 Time-Series Trend Analysis

In [None]:
# Time-Based Trend Analysis

# Per-year aggregates: mean price, total sales volume, mean engine size
yearly_metrics = (
    df.groupby('Year')
    .agg(
        Price_USD=('Price_USD', 'mean'),
        Sales_Volume=('Sales_Volume', 'sum'),
        Engine_Size_L=('Engine_Size_L', 'mean'),
    )
    .reset_index()
)

# One stacked panel per metric: (column, line colour, title)
trend_panels = [
    ('Price_USD', '#1f77b4', "Average Vehicle Price Over Time"),
    ('Sales_Volume', '#2ca02c', "Total Sales Volume Over Time"),
    ('Engine_Size_L', '#ff7f0e', "Average Engine Size Over Time"),
]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))

for panel, (metric, line_color, heading) in zip(axes, trend_panels):
    sns.lineplot(
        data=yearly_metrics,
        x='Year',
        y=metric,
        marker='o',
        color=line_color,
        ax=panel,
    )
    panel.set_title(heading)

fig.tight_layout()
plt.show()


🔷 Regional Market Insight

In [None]:
# Regional Market Analysis

# Per-region summary, ordered from highest to lowest total sales volume.
# Only the Sales_Volume column is plotted below.
region_summary = df.groupby('Region').agg({
    'Year': 'mean',
    'Price_USD': 'mean',
    'Sales_Volume': 'sum'
}).sort_values('Sales_Volume', ascending=False)

fig, ax = plt.subplots(figsize=(12, 5))
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping the region to hue
# (legend suppressed) keeps one colour per bar.
sns.barplot(
    x=region_summary.index,
    y=region_summary['Sales_Volume'],
    hue=region_summary.index,
    palette="mako",
    legend=False,
    ax=ax
)
ax.set_title("Total Sales Volume by Region")
ax.set_ylabel("Total Units Sold")
plt.show()
