In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Data source: https://www.kaggle.com/datasets/deepcontractor/car-price-prediction-challenge
# Read the CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv'
)

# Inspect Dataset

In [3]:
# (number of rows, number of columns) of the freshly loaded data
df.shape

In [4]:
# preview the first rows to get a feel for the columns and their raw formats
df.head()

In [5]:
# Discard rows that are exact duplicates of an earlier row (the first occurrence
# is kept), then display the remaining (rows, columns) for comparison.
df = df.drop_duplicates(keep='first')
df.shape

In [6]:
# summary statistics of the columns pandas currently treats as numeric
df.describe()

In [7]:
# Per column: is there at least one missing value?
df.isna().any()

In [8]:
# column dtypes and non-null counts; used to spot columns that should be numeric but are not
df.info()

1) The columns Levy, Engine volume and Mileage are expected to be numeric but were not read as numeric -> find and clean the non-numeric values.
2) Doors: some values were interpreted as dates (e.g. 04-May) and need to be replaced with plausible door counts.

In [9]:
# .head() revealed '-' placeholders in Levy; they are interpreted as a levy of 0.
# After the substitution the column is converted to a numeric dtype; any other
# value that cannot be parsed becomes NaN (errors='coerce').
df['Levy'] = pd.to_numeric(df['Levy'].replace({'-': 0}), errors='coerce')
df.info()  # verify that Levy now has a numeric dtype

In [10]:
# look at the raw 'Engine volume' values to see why the column is not numeric
df['Engine volume']

In [11]:
# 'Engine volume' is not numeric because some entries carry a ' Turbo' suffix.
# Instead of just discarding the suffix, keep the information as a 0/1 flag in a
# new 'Turbo' column, then strip the suffix and convert the column to numbers.
#
# The dtype guard makes this cell safe to re-run: once 'Engine volume' has been
# converted, there is no text left to derive the flag from, so the existing
# 'Turbo' column is left untouched instead of failing on non-string values.
if not pd.api.types.is_numeric_dtype(df['Engine volume']):
    engine_text = df['Engine volume'].astype(str)
    # literal (non-regex) substring test, vectorized over the whole column
    df['Turbo'] = engine_text.str.contains('Turbo', regex=False).astype('int64')
    df['Engine volume'] = pd.to_numeric(
        engine_text.str.replace(' Turbo', '', regex=False), errors='coerce'
    )
df['Engine volume'].isnull().any()  # True means some values still could not be parsed as numbers

In [12]:
# The Mileage cells shown by .head() end in ' km'; remove that unit suffix and
# convert the column to a numeric dtype (unparseable values become NaN).
df['Mileage'] = pd.to_numeric(
    df['Mileage'].replace(to_replace=' km', value='', regex=True),
    errors='coerce',
)
df['Mileage'].isna().any()  # True means some values still could not be parsed as numbers

In [13]:
# frequency of each distinct value in 'Doors'
df['Doors'].value_counts()

In [14]:
# 'Doors' has 3 distinct values, two of which were mangled into dates:
# '02-Mar' most likely means 2-3 doors and '04-May' most likely means 4-5 doors.
# '>5' is already sensible and is left unchanged.
df['Doors'] = df['Doors'].replace({'04-May': '4-5', '02-Mar': '2-3'})
df['Doors'].value_counts()

In [15]:
# Express the production year as the car's age: age = reference year - production year.
# The dataset's publication date was not found, so 2021 is assumed as reference year.
df['Age'] = df['Prod. year'].rsub(2021)  # rsub(2021) computes 2021 - value
df['Age']

In [16]:
# 'ID' and 'Prod. year' are removed from the frame ('Age' was derived from 'Prod. year' beforehand).
df = df.drop(columns=['ID', 'Prod. year'])

In [17]:
# re-check dtypes and non-null counts after the cleaning steps
df.info()

In [18]:
# Summary statistics of the cleaned numeric columns, including the default
# quartiles (25%, 50%, 75%). The stray empty tuple that was passed positionally
# as `percentiles` has been removed: it suppressed the quartiles and its result
# depends on the pandas version.
df.describe()

In [19]:
# one histogram per numeric column to get an overview of the distributions
df.hist()

In [20]:
# The histograms suggest outliers, especially in the age, price and cylinders
# columns, although the min and max of every column look plausible.
# To illustrate, plot the price distribution restricted to prices below 50000.
df.loc[df['Price'] < 50000, 'Price'].hist()