In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw listings from the Kaggle input directory into a DataFrame.
cars = pd.read_csv('../input/cars-germany/autoscout24-germany-dataset.csv')

# A bare last expression makes the notebook render the frame.
cars

<h1>Data Analysis</h1>

First we look into the data itself and see our basic observations

In [None]:
# Show the first rows to see which columns exist and what typical values look like.
cars.head()

Let's check whether there are any empty entries.

In [None]:
# Count the missing values in each column.
cars.isna().sum()

We see that there are empty entries. Before we impute `hp`, note that `model`, `make`, `fuel`, and `gear` don't really matter for this analysis, so we can delete them. We will also need to run `.describe()` to spot null values that are represented in another way.

In [None]:
# Discard the make/model/fuel/gear columns by rebinding `cars` to the reduced frame.
cars = cars.drop(columns=['make', 'model', 'fuel', 'gear'])

cars

Before we go further, we should one-hot encode `offerType` for the model.

In [None]:
# Expand offerType into one indicator column per category
# (named offerType_<category>).
cars = pd.get_dummies(cars, columns=['offerType'])

cars

To avoid the dummy variable trap, we then remove one category. We can delete the `offerType_Demonstration` column.

In [None]:
# Remove the Demonstration indicator so one offerType category acts as the
# reference level for the remaining indicators.
cars = cars.drop(columns='offerType_Demonstration')

cars

Next we can now impute hp with SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing hp values using SimpleImputer with its default settings.
imp = SimpleImputer()

# The imputer works on 2-D input, so hp goes in as an (n, 1) array and the
# (n, 1) result is flattened back into a single column.
hp_imputed = imp.fit_transform(cars[['hp']].to_numpy())
cars['hp'] = hp_imputed.ravel()

cars

In [None]:
# Re-count the missing values per column now that hp has been imputed.
cars.isna().sum()

We now don't have any more empty entries? Let's check with `.describe()`.

In [None]:
# Summary statistics per numeric column, used here to look for implausible
# values that may stand in for missing data.
cars.describe()

We now have no empty entries. A more meaningful way to represent a car is by its age rather than by the year it was built. We compute this with Python's `datetime` library.

In [None]:
import datetime

# Age is measured against the year in which the notebook is executed, so the
# raw values shift by one with every calendar year.
current_year = datetime.datetime.now().year

# Add the age column and remove the year column it was derived from.
cars = cars.assign(age=current_year - cars['year']).drop(columns='year')

cars

Now the only thing left to do is scale the data. We do that with `preprocessing.scale`, applied to the non-categorical features (`mileage`, `hp`, `age`); `price` is left unscaled here.

In [None]:
from sklearn import preprocessing

# Continuous features to scale; the offerType indicator columns and price
# are not in this list and stay untouched.
cols = ['mileage','hp','age']

# NOTE(review): the scaling statistics are computed over every row, before
# the train/test splits made later in the notebook, so rows that end up in
# the test set influence how the training rows are scaled.
cars[cols] = preprocessing.scale(cars[cols])

cars

Let's see the relationships between these features with the price

In [None]:
import seaborn as sns
# Price against mileage: looks like an inverse exponential.
sns.scatterplot(data=cars, x='mileage', y='price')

In [None]:
# Price against age: decreasing.
sns.scatterplot(data=cars, x='age', y='price')

In [None]:
# Price against horsepower: looks exponential.
sns.scatterplot(data=cars, x='hp', y='price')

It seems that the price is related to the features in a non-linear (roughly exponential) way. To facilitate a linear regression, we transform the price to log(price).

In [None]:
# Replace price with its natural logarithm, computed on the whole column at once.
cars['price'] = np.log(cars['price'])

cars

In [None]:
# After the log transform the relation to hp looks linear: positive correlation.
sns.scatterplot(data=cars, x='hp', y='price')

In [None]:
# After the log transform the relation to age looks linear and decreasing.
sns.scatterplot(data=cars, x='age', y='price')

In [None]:
# After the log transform: negative correlation with mileage.
sns.scatterplot(data=cars, x='mileage', y='price')

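We can delete the outliers in mileage to get a more general pattern. Since mileage has already been scaled, the cutoff of 9 used below is expressed in scaled units, not in kilometres.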

In [None]:
# mileage was scaled earlier, so the cutoff of 9 is in scaled units rather
# than in the original mileage unit.
outlier_index = cars.index[cars['mileage'] > 9]

print(outlier_index)

# Remove the flagged rows by label.
cars = cars.drop(index=outlier_index)

cars

In [None]:
# Reset the index: renumber the rows after the outlier rows were dropped.
# drop=True discards the old labels instead of keeping them as a column.

cars = cars.reset_index(drop=True)

cars

This is now our processed dataset. We write it to a .csv file to save our progress.

In [None]:
# Persist the processed frame. index=False leaves out the row labels, which
# were just renumbered and carry no information; without it they would be
# written as an extra unnamed first column.
cars.to_csv('germany_cars_processed.csv', index=False)

<h1>Price Prediction</h1>

We create helper functions to evaluate what we can use to predict

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.metrics import r2_score

def train_and_evaluate_model(model_fn, X, y, test_size=0.2, random_state=0):
    """Fit a model on a train split and return its adjusted R^2 on the test split.

    Parameters
    ----------
    model_fn : callable
        Builder called as ``model_fn(x_train, y_train)``; it must return a
        fitted object exposing ``predict``.
    X, y
        Features and target, passed unchanged to ``train_test_split``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 0
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    float
        Adjusted R^2 of the predictions on the test split.

    Raises
    ------
    ValueError
        If the test split does not have more rows than features + 1, in
        which case adjusted R^2 is undefined.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = model_fn(x_train, y_train)
    y_pred = model.predict(x_test)

    n_samples = len(y_test)
    n_features = x_test.shape[1]
    # Residual degrees of freedom: n - p - 1.
    dof = n_samples - (n_features + 1)
    if dof <= 0:
        raise ValueError(
            "adjusted R^2 needs more test rows ({}) than features + 1 ({})".format(
                n_samples, n_features + 1
            )
        )

    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    r2 = r2_score(y_test, y_pred)
    return 1 - ((1 - r2) * (n_samples - 1)) / dof

Below are the builder functions for each of the models; at the end we evaluate every model and compare the scores.

In [None]:
def lin_reg(x_train, y_train):
    """Fit an ordinary least-squares regression and return the fitted model.

    The ``normalize=True`` argument used here previously was deprecated in
    scikit-learn 1.0 and removed in 1.2, where passing it raises a
    ``TypeError``. It is dropped without a replacement because rescaling the
    feature columns does not change the predictions of an unpenalised
    least-squares fit with an intercept.
    """
    return LinearRegression().fit(x_train, y_train)

def ridge_reg(x_train, y_train, alpha=0.5, normalize=True):
    """Fit a ridge regression and return the fitted estimator.

    Parameters
    ----------
    x_train, y_train
        Training features and target.
    alpha : float, default 0.5
        Ridge penalty, on the scale the removed ``normalize=True`` option used.
    normalize : bool, default True
        When true, reproduce the removed ``Ridge(normalize=True)`` behaviour
        with a ``StandardScaler`` pipeline. When false, fit a plain ``Ridge``.

    Returns
    -------
    A fitted object exposing ``predict``: a ``Pipeline`` when ``normalize``
    is true, otherwise a ``Ridge``.
    """
    if not normalize:
        return Ridge(alpha=alpha).fit(x_train, y_train)

    # `normalize` was removed from Ridge in scikit-learn 1.2. It divided each
    # centred column by its l2 norm, whereas StandardScaler divides by the
    # standard deviation (= l2 norm / sqrt(n)). Multiplying alpha by the number
    # of rows compensates, as the scikit-learn deprecation note advises.
    n_samples = len(x_train)
    model = make_pipeline(
        StandardScaler(with_mean=False),  # Ridge centres the data itself
        Ridge(alpha=alpha * n_samples),
    )
    return model.fit(x_train, y_train)

def lasso_reg(x_train, y_train, alpha=0.5):
    """Fit a Lasso regression with the given penalty and return the fitted model."""
    lasso = Lasso(alpha=alpha)
    fitted = lasso.fit(x_train, y_train)
    return fitted

def en_reg(x_train, y_train, alpha=0.5, l1_ratio=0.5, normalize=True, max_iter=100000, warm_start=True):
    """Fit an elastic-net regression and return the fitted estimator.

    Parameters
    ----------
    x_train, y_train
        Training features and target.
    alpha, l1_ratio, max_iter, warm_start
        Forwarded unchanged to ``ElasticNet``.
    normalize : bool, default True
        When true, the features are rescaled to unit variance by a
        ``StandardScaler`` step in front of the regression.

    Returns
    -------
    A fitted object exposing ``predict``: a ``Pipeline`` when ``normalize``
    is true, otherwise an ``ElasticNet``.

    Notes
    -----
    ``ElasticNet(normalize=...)`` was removed in scikit-learn 1.2, so the
    flag is implemented with a pipeline instead. NOTE(review): as far as the
    scikit-learn deprecation note goes, an exact alpha conversion exists only
    for ``l1_ratio`` of 0 or 1, so ``alpha`` is applied as given and scores
    may differ from runs that used the removed option.
    """
    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, warm_start=warm_start)
    if normalize:
        # ElasticNet centres the data itself, so only the scale is adjusted here.
        model = make_pipeline(StandardScaler(with_mean=False), enet)
    else:
        model = enet
    model.fit(x_train, y_train)
    return model

def knn_reg(x_train, y_train, n_neighbors=5):
    """Fit a k-nearest-neighbours regressor and return the fitted model."""
    return KNeighborsRegressor(n_neighbors=n_neighbors).fit(x_train, y_train)

def svr_reg(x_train, y_train, kernel='linear', epsilon=0.05, C=0.3):
    """Fit a support-vector regressor with the given settings and return it."""
    svr_params = {'kernel': kernel, 'epsilon': epsilon, 'C': C}
    return SVR(**svr_params).fit(x_train, y_train)

def dt_reg(x_train, y_train, random_state=0):
    """Fit a decision-tree regressor and return the fitted model.

    Parameters
    ----------
    x_train, y_train
        Training features and target.
    random_state : int, default 0
        Seed forwarded to ``DecisionTreeRegressor`` so repeated runs of the
        notebook build the same tree and report the same score. The default
        matches the seed the notebook uses for its train/test split.
    """
    model = DecisionTreeRegressor(random_state=random_state)
    model.fit(x_train, y_train)
    return model

def rf_reg(x_train, y_train, random_state=0):
    """Fit a random-forest regressor and return the fitted model.

    Parameters
    ----------
    x_train, y_train
        Training features and target.
    random_state : int, default 0
        Seed forwarded to ``RandomForestRegressor`` so repeated runs of the
        notebook grow the same forest and report the same score. The default
        matches the seed the notebook uses for its train/test split.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(x_train, y_train)
    return model

def xgb_reg(x_train, y_train):
    """Fit an XGBRegressor created without arguments and return the fitted model."""
    return XGBRegressor().fit(x_train, y_train)

In [None]:
# Feature matrix and target shared by every model below.
X = cars.drop(columns='price')
y = cars['price']

# Builder for each candidate model, keyed by the label used in the report.
model_builders = {
    'linear': lin_reg,
    'ridge': ridge_reg,
    'lasso': lasso_reg,
    'elasticnet': en_reg,
    'knn': knn_reg,
    'svr': svr_reg,
    'dt': dt_reg,
    'rf': rf_reg,
    'xgb': xgb_reg,
}

# Score the candidates one at a time, in the order listed above.
results_dict = {}
for label, builder in model_builders.items():
    results_dict[label] = train_and_evaluate_model(builder, X, y)

for model, score in results_dict.items():
    print("{}: {}".format(model, score))

In [None]:
# Re-run only the elastic-net evaluation, overwriting its earlier entry in
# results_dict, then print the full score table again.
results_dict['elasticnet'] = train_and_evaluate_model(en_reg, X, y)

for model, score in results_dict.items():
    print("{}: {}".format(model, score))

We see that XGB has the best prediction power!

In [None]:
# XGB implementation predicting price

X = cars.drop('price', axis=1)
y = cars['price']

# Hold out 20% of the rows as the test set, then split 10% of the remaining
# training rows off as a validation set used only for early stopping.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=0
)

# early_stopping_rounds is set on the estimator: passing it to fit() has been
# deprecated since XGBoost 1.6 and is no longer accepted by newer releases.
model = XGBRegressor(n_estimators=500, learning_rate=0.02,
                    objective='reg:squarederror', max_depth=10,
                    early_stopping_rounds=5,
                    )

# Training stops once the validation score has not improved for 5 rounds.
model.fit(x_train, y_train,
         eval_set=[(x_valid, y_valid)],
         verbose=False)

y_pred = model.predict(x_test)

# Adjusted R^2 on the held-out test split: 1 - (1 - R^2) * (n - 1) / (n - p - 1)
score = 1 - ((1-r2_score(y_test, y_pred))*(len(y_test)-1))/(len(y_test) - (x_test.shape[1] + 1))

# First line: the model's score on the training rows; second line: adjusted
# R^2 on the test rows.
print(model.score(x_train, y_train))
print(score)

We see that we have a model with an $R^2$ score of about 91%.