In [23]:
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
import altair as alt

In [24]:
df = pd.read_csv('vehicles_us.csv')

In [26]:
display(df.dtypes)

price             int64
model_year      float64
model            object
condition        object
cylinders       float64
fuel             object
odometer        float64
transmission     object
type             object
paint_color      object
is_4wd          float64
date_posted      object
days_listed       int64
dtype: object

In [27]:

# Fill missing model_year by grouping by model and using the median year
df['model_year'] = df.groupby('model')['model_year'].transform(lambda x: x.fillna(x.median()))

# cylindres: group by model fill by median cylindres
df['cylinders'] = df.groupby('model')['cylinders'].transform(lambda x: x.fillna(x.median()))

# Function to safely calculate the median, returning a default value if all are NaN
def safe_median(series, default_value=np.nan):
    if series.isnull().all():
        return default_value
    else:
        return series.median()
#odometer: group by model year(or year+model) fill by median(mean) odometer
# Fill missing odometer by grouping by model_year and using the median odometer
df['odometer'] = df.groupby(['model', 'model_year'])['odometer'].transform(lambda x: x.fillna(safe_median(x)))

# If there are still missing values in odometer, fill them by grouping only by model
df['odometer'] = df.groupby('model')['odometer'].transform(lambda x: x.fillna(safe_median(x)))

# If there are still missing values in odometer, fill them by the overall median
global_median_odometer = df['odometer'].median()
df['odometer'] = df['odometer'].fillna(global_median_odometer)

#replace all nan in paimt_color with other.
df['paint_color'] = df['paint_color'].fillna('other')

g = df['is_4wd'].median()
df['is_4wd'] = df['is_4wd'].fillna(g)
df

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,other,1.0,2018-06-23,19
1,25500,2011.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,1.0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,177500.0,automatic,pickup,other,1.0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,1.0,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,1.0,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,1.0,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,1.0,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,1.0,2018-07-02,71


In [28]:
nan_rows =df[df.isna().any(axis=1)]
display(nan_rows)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


In [29]:
#clean  up model year and the odometer column by converting to int.64
df['model_year'] =df['model_year'].astype(int)
df['odometer'] =df['odometer'].astype(int)

#convert date_posted column to datetime format
df['date_posted'] = pd.to_datetime(df['date_posted'])

display(df.dtypes)

price                    int64
model_year               int64
model                   object
condition               object
cylinders              float64
fuel                    object
odometer                 int64
transmission            object
type                    object
paint_color             object
is_4wd                 float64
date_posted     datetime64[ns]
days_listed              int64
dtype: object

In [30]:
# split model column to create the first part of the string as the manufacturer and keep the sec part of the string as model.
df[['manufacturer', 'model']] = df['model'].str.split(' ', n=1, expand=True)
df.head()


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,manufacturer
0,9400,2011,x5,good,6.0,gas,145000,automatic,SUV,other,1.0,2018-06-23,19,bmw
1,25500,2011,f-150,good,6.0,gas,88705,automatic,pickup,white,1.0,2018-10-19,50,ford
2,5500,2013,sonata,like new,4.0,gas,110000,automatic,sedan,red,1.0,2019-02-07,79,hyundai
3,1500,2003,f-150,fair,8.0,gas,177500,automatic,pickup,other,1.0,2019-03-22,9,ford
4,14900,2017,200,excellent,4.0,gas,80903,automatic,sedan,black,1.0,2019-04-02,28,chrysler


In [31]:
#move the manufacturer column to be close to the front of model to keep unison.
desired_order =  ['price', 'model_year', 'manufacturer', 'model', 'date_posted', 'days_listed', 'condition', 'cylinders', 'fuel', 'odometer', 'transmission', 'type', 'paint_color', 'is_4wd']
df = df[desired_order]
df.head()

Unnamed: 0,price,model_year,manufacturer,model,date_posted,days_listed,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd
0,9400,2011,bmw,x5,2018-06-23,19,good,6.0,gas,145000,automatic,SUV,other,1.0
1,25500,2011,ford,f-150,2018-10-19,50,good,6.0,gas,88705,automatic,pickup,white,1.0
2,5500,2013,hyundai,sonata,2019-02-07,79,like new,4.0,gas,110000,automatic,sedan,red,1.0
3,1500,2003,ford,f-150,2019-03-22,9,fair,8.0,gas,177500,automatic,pickup,other,1.0
4,14900,2017,chrysler,200,2019-04-02,28,excellent,4.0,gas,80903,automatic,sedan,black,1.0


In [32]:
#the oldest listing in the dataset:
df['days_listed'].max()

np.int64(271)

In [33]:
#The newest listing ion the dataset:
df['days_listed'].min()

np.int64(0)

In [34]:
#Highest price listed for a vehicle
df['price'].max()

np.int64(375000)

In [35]:
#lowest to highest milage listed:
lowest_mls, highest_mls = int(df['odometer'].min()), int(df['odometer'].max())

In [36]:
print(highest_mls)

990000


In [39]:
#How old is the car?
df.loc[:, 'age'] = 2024 - df.loc[:, 'model_year']
df.head()

Unnamed: 0,price,model_year,manufacturer,model,date_posted,days_listed,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,age
0,9400,2011,bmw,x5,2018-06-23,19,good,6.0,gas,145000,automatic,SUV,other,1.0,13
1,25500,2011,ford,f-150,2018-10-19,50,good,6.0,gas,88705,automatic,pickup,white,1.0,13
2,5500,2013,hyundai,sonata,2019-02-07,79,like new,4.0,gas,110000,automatic,sedan,red,1.0,11
3,1500,2003,ford,f-150,2019-03-22,9,fair,8.0,gas,177500,automatic,pickup,other,1.0,21
4,14900,2017,chrysler,200,2019-04-02,28,excellent,4.0,gas,80903,automatic,sedan,black,1.0,7


In [38]:
#df.to_csv('preprocessed_data_vehicles_us.csv')
