# Exploratory Data Analysis (EDA)

This notebook explores the processed car sales dataset, including synthetic features, to uncover insights and prepare for machine learning modeling.

In [1]:
# Load necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn "whitegrid" style to the plots drawn in this notebook
sns.set(style="whitegrid")

# Read the processed dataset and, in the same expression, discard the
# customer name, phone and car id columns (not needed for this analysis)
file_path = '../data/processed/car_sales_data_with_synthetic_features.csv'
df = pd.read_csv(file_path).drop(columns=['Customer Name', 'Phone', 'Car_id'])

# NOTE(review): the preview shows Engine values such as
# 'DoubleÂ Overhead Camshaft' — looks like a mis-decoded non-breaking space
# in the CSV; confirm before grouping or filtering on that column.

# Preview the first 5 rows
df.head()

Unnamed: 0,Date,Gender,Annual Income,Dealer_Name,Company,Model,Engine,Transmission,Color,Price ($),Dealer_No,Body Style,Dealer_Region,Mileage_km_total,Crash_Test_Score,Family_Size
0,1/2/2022,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,Middletown,68468,4,2
1,1/2/2022,Male,1480000,C & M Motors Inc,Dodge,Durango,DoubleÂ Overhead Camshaft,Auto,Black,19000,60504-7114,SUV,Aurora,64053,3,2
2,1/2/2022,Male,1035000,Capitol KIA,Cadillac,Eldorado,Overhead Camshaft,Manual,Red,31500,38701-8047,Passenger,Greenville,24942,5,3
3,1/2/2022,Male,13500,Chrysler of Tri-Cities,Toyota,Celica,Overhead Camshaft,Manual,Pale White,14000,99301-3882,SUV,Pasco,108673,3,3
4,1/2/2022,Male,1465000,Chrysler Plymouth,Acura,TL,DoubleÂ Overhead Camshaft,Auto,Red,24500,53546-9427,Hatchback,Janesville,51660,4,1


In [None]:
# Check for missing values.
# Printed explicitly: a notebook cell only auto-displays its LAST expression,
# so a bare `df.isnull().sum()` in the middle of the cell would show nothing.
print(df.isnull().sum())

# Mapping from the original column headers to snake_case names.
# Keys are written without surrounding whitespace; headers are stripped before
# the lookup (see below) so a header like 'Dealer_No ' with a trailing space
# is still matched. DataFrame.rename silently ignores keys that are absent,
# so an exact-match key with stray whitespace is easy to break unnoticed.
# 'Phone' is not listed because that column is dropped right after loading,
# and 'brand_model' is not listed because it is created further down.
column_renames = {
    'Date': 'date',
    'Gender': 'gender',
    'Annual Income': 'annual_income',
    'Dealer_Name': 'dealer_name',
    'Company': 'company',
    'Model': 'model',
    'Engine': 'engine',
    'Transmission': 'transmission',
    'Color': 'color',
    'Price ($)': 'price',
    'Dealer_No': 'dealer_no',
    'Body Style': 'body_style',
    'Dealer_Region': 'dealer_region',
    'Mileage_km_total': 'mileage_km_total',
    'Crash_Test_Score': 'crash_test_score',
    'Family_Size': 'family_size',
}

# First normalise header whitespace, then apply the mapping.
# Rebinding `df` (instead of inplace=True) keeps the cell safe to re-run:
# on a second run the headers are already renamed and both steps are no-ops.
df = df.rename(columns=str.strip).rename(columns=column_renames)

# Create a combined identifier for brand + model, e.g. 'Ford-Expedition'
df['brand_model'] = df['company'] + '-' + df['model']

# Show basic info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23906 entries, 0 to 23905
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              23906 non-null  object
 1   gender            23906 non-null  object
 2   annual_income     23906 non-null  int64 
 3   dealer_name       23906 non-null  object
 4   company           23906 non-null  object
 5   model             23906 non-null  object
 6   engine            23906 non-null  object
 7   transmission      23906 non-null  object
 8   color             23906 non-null  object
 9   price             23906 non-null  int64 
 10  dealer_no         23906 non-null  object
 11  body_style        23906 non-null  object
 12  dealer_region     23906 non-null  object
 13  mileage_km_total  23906 non-null  int64 
 14  crash_test_score  23906 non-null  int64 
 15  family_size       23906 non-null  int64 
dtypes: int64(5), object(11)
memory usage: 2.9+ MB


In [4]:
# Columns to store with the pandas 'category' dtype
categorical_columns = [
    'gender', 'dealer_name', 'company', 'model', 'engine',
    'transmission', 'color', 'dealer_no', 'body_style',
    'dealer_region', 'brand_model'
]

# Parse 'date' into datetime and cast every listed column to 'category'
# in a single chained expression rather than column by column
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .astype({name: 'category' for name in categorical_columns})
)

# Check new data types
df.dtypes

date                datetime64[ns]
gender                    category
annual_income                int64
dealer_name               category
company                   category
model                     category
engine                    category
transmission              category
color                     category
price                        int64
dealer_no                 category
body_style                category
dealer_region             category
mileage_km_total             int64
crash_test_score             int64
family_size                  int64
brand_model               category
dtype: object