In [76]:
import pandas as pd
from scipy.stats import randint, norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np

## Lies `cars.csv` in einen DataFrame

In [28]:
df = pd.read_csv('data/cars.csv')
df.head()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,model,fuelType,name
0,1450,1997,75,90000,andere,benzin,Toyota_Toyota_Starlet_1._Hand__TÜV_neu
1,13100,2005,280,5000,golf,benzin,R32_tauschen_oder_kaufen
2,4500,2008,87,90000,yaris,benzin,Toyota_Yaris_1.3_VVT_i
3,6000,2009,177,125000,3er,diesel,320_Alpinweiss_Kohlenstoff
4,3990,1999,118,90000,3er,benzin,BMW_318i_E46_+++_1._Hand_+++_Liebhaberfahrzeug


## Data Cleaning

* entferne die Features `model` und `name`
* entferne Observations mit `NaN`-Einträgen
* entferne Observations, deren `fuelType` nicht `benzin` oder `diesel` ist
* Führe ein One-Hot-Encoding für `fuelType` durch

In [29]:
df = df.drop(columns=['model', 'name'])
df = df[df.notna()]
df = df.query('fuelType == "benzin" or fuelType == "diesel"')
df.head()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,fuelType
0,1450,1997,75,90000,benzin
1,13100,2005,280,5000,benzin
2,4500,2008,87,90000,benzin
3,6000,2009,177,125000,diesel
4,3990,1999,118,90000,benzin


In [39]:
df['isBenzin'] = df['fuelType'].replace({'benzin': 1, 'diesel': 0})
df = df.drop(columns='fuelType')
df.head()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,isBenzin
0,1450,1997,75,90000,1
1,13100,2005,280,5000,1
2,4500,2008,87,90000,1
3,6000,2009,177,125000,0
4,3990,1999,118,90000,1


## Training

* Verwende 20% Testdaten und `random_state=42`
* predicte `price`
* Verwende eine LinearRegression und ermittle, welches Feature einen negativen Einfluss auf den Preis hat
* Verwende einen RandomForestRegressor und ermittle das wichtigste Feature

In [100]:
X = df.drop(columns='price')
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
lin_model.coef_

array([ 1.31662959e+02,  8.42243833e+01, -7.99518456e-02, -4.99755376e+03])

In [53]:
df.corr()["price"][1:]

yearOfRegistration    0.357412
powerPS               0.608716
kilometer            -0.443695
isBenzin             -0.443016
Name: price, dtype: float64

In [54]:
forest = RandomForestRegressor()
forest.fit(X_train, y_train)
forest.feature_importances_

array([0.29530925, 0.45743525, 0.21389106, 0.03336444])

Most important Feature for the RandomForestRegressor is `powerPS`

## Evaluierung

* Ermittle den mean squared error für beide Modelle
* Performt eines der models besser, wenn die Daten skaliert werden?

In [61]:
def calc_error(model, X_test, y_true):
    predictions = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_true, predictions))

In [71]:
print('LinearRegression error for training dataset: ', end='')
print(calc_error(lin_model, X_train, y_train))
print('LinearRegression error for test dataset:     ', end='')
print(calc_error(lin_model, X_test, y_test))

LinearRegression error for training dataset: 4977.009336292843
LinearRegression error for test dataset:     3703.981155490193


In [72]:
print('RandomForestRegressor error for training dataset: ', end='')
print(calc_error(forest, X_train, y_train))
print('RandomForestRegressor error for test dataset:     ', end='')
print(calc_error(forest, X_test, y_test))

RandomForestRegressor error for training dataset: 1887.944264947669
RandomForestRegressor error for test dataset:     3375.256651494581


### Tuning

Finde ein model, welches am Testset einen mse < 3200 aufweist

In [96]:
gridRandomized = RandomizedSearchCV(estimator=RandomForestRegressor(),
                                    param_distributions={'n_estimators': randint(0, 1000),
                                                         'max_features': norm(loc=0.5, scale=0.15)},
                                    scoring='neg_mean_squared_error',  # -mse, damit höher=besser
                                    cv=5,
                                    n_iter=20,
                                    n_jobs=8)
gridRandomized.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20, n_jobs=8,
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001A25730C760>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001A2562B5C10>},
                   scoring='neg_mean_squared_error')

In [97]:
pd.DataFrame(gridRandomized.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.565403,0.083362,0.036598,0.013351,0.361901,259,"{'max_features': 0.36190138409881734, 'n_estim...",-37258240.0,-24146980.0,-34809090.0,-8642599.0,-9119785.0,-22795340.0,12187320.0,4
1,0.214205,0.075899,0.010598,0.000486,0.278094,97,"{'max_features': 0.27809418121612106, 'n_estim...",-40151780.0,-25059890.0,-35929890.0,-9416239.0,-10309220.0,-24173400.0,12683050.0,19
2,1.470994,0.083258,0.085998,0.010369,0.465387,901,"{'max_features': 0.4653865005504697, 'n_estima...",-38454840.0,-23453320.0,-34449580.0,-8707785.0,-8901000.0,-22793300.0,12433710.0,3
3,1.120996,0.045938,0.059595,0.002245,0.544212,715,"{'max_features': 0.5442120733070823, 'n_estima...",-40882070.0,-26554070.0,-30622150.0,-6374483.0,-12860910.0,-23458730.0,12398820.0,17
4,1.0176,0.029319,0.057801,0.004216,0.528105,682,"{'max_features': 0.528104845039258, 'n_estimat...",-40688200.0,-25400480.0,-30821340.0,-6201164.0,-12603150.0,-23142870.0,12409680.0,11
5,1.423603,0.027187,0.084399,0.001628,0.308982,990,"{'max_features': 0.30898217298530417, 'n_estim...",-38097600.0,-24403350.0,-34210030.0,-8298631.0,-9251023.0,-22852130.0,12333910.0,6
6,0.4616,0.01234,0.0294,0.002498,0.500506,316,"{'max_features': 0.5005060893288119, 'n_estima...",-40227610.0,-25867230.0,-30628010.0,-6936857.0,-12185730.0,-23169090.0,12149310.0,12
7,0.574601,0.021021,0.035,0.002607,0.366043,392,"{'max_features': 0.36604345058147, 'n_estimato...",-38488010.0,-25588040.0,-34595680.0,-7844349.0,-9959589.0,-23295130.0,12492770.0,16
8,0.802995,0.030378,0.047205,0.001601,0.46431,559,"{'max_features': 0.46430988007106144, 'n_estim...",-38751410.0,-23849730.0,-34806710.0,-8957301.0,-9307789.0,-23134590.0,12432300.0,9
9,1.361602,0.07329,0.077197,0.001936,0.615706,909,"{'max_features': 0.615705990989708, 'n_estimat...",-39902850.0,-26192910.0,-31244020.0,-6489946.0,-12382280.0,-23242400.0,12238370.0,14


In [98]:
calc_error(gridRandomized, X_test, y_test)

3019.694196201905