In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler as StandardScaler

In [30]:

# Silence every FutureWarning for the rest of the session (added because of the
# normalization option used with the sklearn model further down).
from warnings import simplefilter

simplefilter('ignore', FutureWarning)


In [3]:
# Data source: https://www.kaggle.com/datasets/deepcontractor/car-price-prediction-challenge
# The path below is the location of the CSV inside a Kaggle kernel.
DATA_PATH = '/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv'
df = pd.read_csv(DATA_PATH)

# Inspect and Clean Dataset

In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

In [5]:
# first five rows, to see the columns and the raw format of their values
df.head()

In [6]:
# Keep only the first occurrence of each fully identical row (a no-op when the
# data has no duplicates), then show the resulting (rows, columns).
df = df[~df.duplicated()]
df.shape

In [7]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [8]:
# per column: True if it contains at least one missing value
df.isnull().any()

In [9]:
# dtype and non-null count of every column; used to spot columns that should
# be numeric but are not
df.info()

1) The columns Levy, Engine volume and Mileage are expected to be numeric but are not numeric in the loaded data -> find and clean the non-numeric values
2) Doors: some values were turned into dates (e.g. 04-May) and need to be replaced with plausible door counts

In [10]:
# .head() shows '-' as a placeholder in Levy; it is interpreted as "no levy" (0).
# Whatever still cannot be parsed as a number afterwards is coerced to NaN.
df['Levy'] = pd.to_numeric(df['Levy'].replace('-', 0), errors='coerce')

In [11]:
# look at the raw Engine volume values to see why the column is not numeric
df['Engine volume']

In [12]:
# Some entries carry the suffix ' Turbo', which keeps the column from being
# numeric. The information is preserved as a 0/1 indicator column before the
# suffix is stripped and the remainder is converted to numbers.
engine_raw = df['Engine volume']
df['Turbo'] = engine_raw.map(lambda value: 1 if 'Turbo' in value else 0)
df['Engine volume'] = pd.to_numeric(
    engine_raw.replace(' Turbo', '', regex=True),
    errors='coerce',
)
# True here would mean some values were coerced to NaN, i.e. there are further
# non-numeric strings that still need handling.
df['Engine volume'].isna().any()

In [13]:
# Mileage values shown by .head() end in ' km'; strip the unit and convert to
# numbers (unparseable values become NaN).
df['Mileage'] = pd.to_numeric(
    df['Mileage'].replace(' km', '', regex=True),
    errors='coerce',
)
# True here would mean the conversion produced NaN, i.e. other non-numeric
# strings are still present.
df['Mileage'].isna().any()

In [14]:
# distinct values of Doors and how often each occurs
df['Doors'].value_counts()

In [15]:
# Doors has three distinct values, two of which were mangled into dates:
# '02-Mar' most likely means 2-3 and '04-May' most likely means 4-5.
# '>5' is plausible as it is and stays untouched.
DOOR_FIXES = {'04-May': '4-5', '02-Mar': '2-3'}
df['Doors'] = df['Doors'].replace(DOOR_FIXES)
df['Doors'].value_counts()

In [16]:
# Express the production year as the car's age. No publication date was found
# for the dataset, so 2021 is assumed as the reference year.
REFERENCE_YEAR = 2021
df['Age'] = REFERENCE_YEAR - df['Prod. year']
df['Age']

In [17]:
# ID is dropped, and Prod. year is now represented by Age
df = df.drop(columns=['ID', 'Prod. year'])

In [18]:
# re-check dtypes and non-null counts after the cleaning steps
df.info()

In [19]:
# Compact summary: an empty percentile list suppresses the default 25% / 75% rows.
# NOTE(review): whether the median (50%) row is still shown depends on the
# pandas version - confirm if that row matters.
df.describe(percentiles=())

In [20]:
# one histogram per numeric column
df.hist()

In [21]:
# The histograms suggest outliers, especially for age, price and cylinders,
# although the min and max of each column look plausible.
# For illustration, plot the price distribution for prices below 50000 only.
df.loc[df['Price'] < 50000, 'Price'].hist()

In [22]:
# number of cars (non-null Price entries) for each age
cars_per_year = df.groupby('Age')['Price'].count()
cars_per_year

In [23]:
# Mean price per age, truncated to whole numbers.
# Price is selected *before* aggregating: at this point the frame still holds
# text columns, and averaging the whole frame raises a TypeError on
# pandas >= 2.0. Casting the full result to int could also fail on NaN means
# of unrelated columns.
df.groupby('Age')['Price'].mean().astype(int)

In [24]:
# Left axis: number of cars per age. Right axis (green): price per age.
ax = sns.lineplot(data=cars_per_year)
ax2 = ax.twinx()  # second y-axis sharing the same x-axis
sns.lineplot(x='Age', y='Price', data=df, ax=ax2, color='g')

# Labels are set on the axes objects. Assigning to `plt.xlabel` / `plt.ylabel`
# would overwrite the pyplot functions with strings: no label would be set and
# later calls to plt.xlabel(...) / plt.ylabel(...) would fail.
ax.set_xlabel('Age')
ax.set_ylabel('Number of cars')
ax2.set_ylabel('Price');

In [34]:
# To limit the influence of outliers, only ages with more than 100 cars are
# considered, which would be ages 2 - 26.
# The mean price has an outlier at age 22 and fluctuates a lot afterwards, so
# the upper cutoff is set to 20 years. Older cars are hardly relevant for the
# average buyer, and vintage car prices would deserve a separate analysis
# (which this dataset is not suited for).
age = df['Age']
keep = (age > 1) & (age <= 20)  # drop ages 0-1 (too few cars) and ages above 20
df = df[keep]
sns.lineplot(x='Age', y='Price', data=df)

In [26]:
# One-hot encode the categorical (object-dtype) columns. The first level of
# each column is dropped so the dummies of a column are not redundant.
df_cat = pd.get_dummies(df.select_dtypes(include=['object']), drop_first=True)
df_cat.shape

In [27]:
# With 1514 columns (due to the encoding) and 17638 rows, the dataset is big
# enough to satisfy the rule of thumb of more than 10 rows per column.
# Replace the original object columns with their dummy-encoded counterparts.
object_cols = df.select_dtypes(include=['object']).columns
df = pd.concat([df.drop(columns=object_cols), df_cat], axis=1)
df

In [31]:
# 0) Feature matrix / target vector and a reproducible 70/30 train-test split
X = df.drop(['Price'], axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# 1) Instantiate
# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and the
# parameter was removed in 1.2, where passing it raises a TypeError. Scaling is
# done as an explicit pipeline step instead, which is the replacement the
# deprecation notice recommends. The fitted regression is available as
# `lm_model[-1]`; its coefficients refer to the scaled features.
lm_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

# 2) Fit (the scaler is fitted on the training split only)
lm_model.fit(X_train, y_train)

# 3) Predict
y_test_preds = lm_model.predict(X_test)

# 4) Score: R^2 on the held-out test split
r2_test = r2_score(y_true=y_test, y_pred=y_test_preds)


In [32]:
# R^2 of the linear model on the test split
r2_test