In [1]:
## Create a synthetic total mileage (km) that looks roughly realistic for each car

def generate_total_mileage(row):
    """Draw a synthetic total mileage (km) for one car record.

    The expected mileage starts at 50,000 km and is shifted by two
    independent adjustments:

    * price: +40,000 below 15,000, -20,000 above 30,000, otherwise 0
    * body style: +20,000 for SUV/Truck/Van, -10,000 for Convertible/Coupe,
      otherwise 0

    A normal draw (standard deviation 5,000) around that expectation is
    rounded to a whole number and floored at 5,000.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'Price ($)' and 'Body Style'.

    Returns
    -------
    int
        Mileage in km, never below 5,000.
    """
    cost = row['Price ($)']
    style = row['Body Style']

    # Price-driven shift of the expected mileage
    if cost < 15000:
        price_shift = 40000
    elif cost > 30000:
        price_shift = -20000
    else:
        price_shift = 0

    # Body-style-driven shift of the expected mileage
    if style in ('SUV', 'Truck', 'Van'):
        body_shift = 20000
    elif style in ('Convertible', 'Coupe'):
        body_shift = -10000
    else:
        body_shift = 0

    expected_km = 50000 + price_shift + body_shift
    sampled_km = round(np.random.normal(loc=expected_km, scale=5000))

    # Enforce the 5,000 km floor
    return sampled_km if sampled_km > 5000 else 5000

In [2]:
## Create a crash test score (1-5 stars, as in real-world ratings), made more realistic by
## distinguishing luxury brands from safety-focused brands.
## Starts from a base score of 3.5, adjusts it by brand and price, then adds random noise.

# Brand groups that receive a bonus on the expected crash-test score
luxury_brands = ['BMW', 'Mercedes', 'Cadillac', 'Acura', 'Lexus', 'Audi']
safe_brands = ['Volvo', 'Subaru', 'Toyota', 'Honda']

def generate_crash_test_score(row):
    """Draw a synthetic crash-test rating (whole stars, 1 to 5) for one car record.

    The expected score starts at 3.5 and is adjusted as follows:

    * +0.7 if the company is in ``safe_brands``
    * +0.5 if the company is in ``luxury_brands``
    * +0.3 if the price is above 30,000, or -0.5 if it is below 15,000

    A normal draw (standard deviation 0.5) around that expectation is
    rounded to the nearest whole star and clipped to the range 1..5.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'Company' and 'Price ($)'.

    Returns
    -------
    int
        Star rating between 1 and 5 inclusive.
    """
    score = 3.5

    if row['Company'] in safe_brands:
        score += 0.7
    if row['Company'] in luxury_brands:
        score += 0.5
    if row['Price ($)'] > 30000:
        score += 0.3
    elif row['Price ($)'] < 15000:
        score -= 0.5

    noisy_score = np.random.normal(loc=score, scale=0.5)
    # Round to the nearest star; plain int() would truncate toward zero and
    # bias every rating about half a star below the intended expectation.
    return int(np.clip(round(noisy_score), 1, 5))

In [3]:
## Generate a random family size influenced by body style, total mileage, and price.

def generate_family_size(row):
    """Draw a synthetic family size (whole number, 1 to 5) for one car record.

    The expected size starts at 2 and is adjusted as follows:

    * +1 for an SUV or Van body style
    * +1 when the total mileage exceeds 70,000 km
    * -1 when the price exceeds 35,000

    A normal draw (standard deviation 1) around that expectation is rounded
    to the nearest whole number and clipped to the range 1..5.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'Body Style', 'Mileage_km_total' and
        'Price ($)'. 'Mileage_km_total' therefore has to exist before this
        function is applied.

    Returns
    -------
    int
        Family size between 1 and 5 inclusive.
    """
    size = 2
    if row['Body Style'] in ['SUV', 'Van']:
        size += 1
    if row['Mileage_km_total'] > 70000:
        size += 1
    if row['Price ($)'] > 35000:
        size -= 1

    noisy_size = np.random.normal(loc=size, scale=1)
    # Round to the nearest whole number; plain int() would truncate toward
    # zero and pull the generated sizes below the intended expectation.
    return int(np.clip(round(noisy_size), 1, 5))

In [4]:
## Reading the dataset
import pandas as pd
import numpy as np  
import os

# Relative path to the raw CSV; it resolves against the notebook's working directory.
file_path = '../data/raw/car_sales_data.csv'
# Load the raw sales records into `df`, the frame every later cell works on.
# NOTE(review): the preview below shows Engine values such as 'DoubleÂ Overhead Camshaft',
# which looks like an encoding artifact in the raw file; confirm and clean if needed.
df = pd.read_csv(file_path)

In [5]:
## Sanity check: preview the first rows to confirm the dataset loaded correctly
df.head()

Unnamed: 0,Car_id,Date,Customer Name,Gender,Annual Income,Dealer_Name,Company,Model,Engine,Transmission,Color,Price ($),Dealer_No,Body Style,Phone,Dealer_Region
0,C_CND_000001,1/2/2022,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown
1,C_CND_000002,1/2/2022,Gia,Male,1480000,C & M Motors Inc,Dodge,Durango,DoubleÂ Overhead Camshaft,Auto,Black,19000,60504-7114,SUV,6848189,Aurora
2,C_CND_000003,1/2/2022,Gianna,Male,1035000,Capitol KIA,Cadillac,Eldorado,Overhead Camshaft,Manual,Red,31500,38701-8047,Passenger,7298798,Greenville
3,C_CND_000004,1/2/2022,Giselle,Male,13500,Chrysler of Tri-Cities,Toyota,Celica,Overhead Camshaft,Manual,Pale White,14000,99301-3882,SUV,6257557,Pasco
4,C_CND_000005,1/2/2022,Grace,Male,1465000,Chrysler Plymouth,Acura,TL,DoubleÂ Overhead Camshaft,Auto,Red,24500,53546-9427,Hatchback,7081483,Janesville


In [6]:
## Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23906 entries, 0 to 23905
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Car_id         23906 non-null  object
 1   Date           23906 non-null  object
 2   Customer Name  23905 non-null  object
 3   Gender         23906 non-null  object
 4   Annual Income  23906 non-null  int64 
 5   Dealer_Name    23906 non-null  object
 6   Company        23906 non-null  object
 7   Model          23906 non-null  object
 8   Engine         23906 non-null  object
 9   Transmission   23906 non-null  object
 10  Color          23906 non-null  object
 11  Price ($)      23906 non-null  int64 
 12  Dealer_No      23906 non-null  object
 13  Body Style     23906 non-null  object
 14  Phone          23906 non-null  int64 
 15  Dealer_Region  23906 non-null  object
dtypes: int64(3), object(13)
memory usage: 2.9+ MB


- It shows we have 16 columns

In [7]:
## Generating the synthetic values with the functions defined above.
## Seed NumPy's global generator first so that a fresh "Restart & Run All"
## reproduces exactly the same synthetic columns.
np.random.seed(42)
## Order matters: generate_family_size reads 'Mileage_km_total', so mileage is created first.
df['Mileage_km_total'] = df.apply(generate_total_mileage, axis=1)
df['Crash_Test_Score'] = df.apply(generate_crash_test_score, axis=1)
df['Family_Size'] = df.apply(generate_family_size, axis=1)

In [8]:
## Re-inspect the schema after adding the synthetic columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23906 entries, 0 to 23905
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Car_id            23906 non-null  object
 1   Date              23906 non-null  object
 2   Customer Name     23905 non-null  object
 3   Gender            23906 non-null  object
 4   Annual Income     23906 non-null  int64 
 5   Dealer_Name       23906 non-null  object
 6   Company           23906 non-null  object
 7   Model             23906 non-null  object
 8   Engine            23906 non-null  object
 9   Transmission      23906 non-null  object
 10  Color             23906 non-null  object
 11  Price ($)         23906 non-null  int64 
 12  Dealer_No         23906 non-null  object
 13  Body Style        23906 non-null  object
 14  Phone             23906 non-null  int64 
 15  Dealer_Region     23906 non-null  object
 16  Mileage_km_total  23906 non-null  int64 
 17  Crash_Test_S

- 19 columns after the creation of the 3 new features

In [9]:
## Preview the first rows together with the newly added synthetic columns
df.head()

Unnamed: 0,Car_id,Date,Customer Name,Gender,Annual Income,Dealer_Name,Company,Model,Engine,Transmission,Color,Price ($),Dealer_No,Body Style,Phone,Dealer_Region,Mileage_km_total,Crash_Test_Score,Family_Size
0,C_CND_000001,1/2/2022,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown,68468,4,2
1,C_CND_000002,1/2/2022,Gia,Male,1480000,C & M Motors Inc,Dodge,Durango,DoubleÂ Overhead Camshaft,Auto,Black,19000,60504-7114,SUV,6848189,Aurora,64053,3,2
2,C_CND_000003,1/2/2022,Gianna,Male,1035000,Capitol KIA,Cadillac,Eldorado,Overhead Camshaft,Manual,Red,31500,38701-8047,Passenger,7298798,Greenville,24942,5,3
3,C_CND_000004,1/2/2022,Giselle,Male,13500,Chrysler of Tri-Cities,Toyota,Celica,Overhead Camshaft,Manual,Pale White,14000,99301-3882,SUV,6257557,Pasco,108673,3,3
4,C_CND_000005,1/2/2022,Grace,Male,1465000,Chrysler Plymouth,Acura,TL,DoubleÂ Overhead Camshaft,Auto,Red,24500,53546-9427,Hatchback,7081483,Janesville,51660,4,1


In [10]:
## Saving the modified DataFrame to a new CSV file
output_file_path = '../data/processed/car_sales_data_with_synthetic_features.csv'
# to_csv does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# index=False keeps the RangeIndex out of the file so no extra unnamed column is written
df.to_csv(output_file_path, index=False)