# Model erstellen und trainieren

# 1) Libraries importieren

In [1]:
# Libraries importieren
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 3) Daten einlesen

In [2]:
# Daten importieren
df_clean = pd.read_csv("/Users/allegratrepte/Desktop/df_clean_final.csv")

# Daten inspizieren
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69796 entries, 0 to 69795
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   body_type             69796 non-null  object 
 1   engine_type           69796 non-null  object 
 2   fuel_type             69796 non-null  object 
 3   horsepower            69796 non-null  float64
 4   make_name             69796 non-null  object 
 5   mileage               69796 non-null  float64
 6   model_name            69796 non-null  object 
 7   price                 69796 non-null  float64
 8   wheel_system_display  69796 non-null  object 
 9   average_fuel_economy  69796 non-null  float64
 10  manual                69796 non-null  int64  
 11  age                   69796 non-null  int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 6.4+ MB


# 4) Dummy-Variablen erstellen

In [3]:
df_dummies = pd.get_dummies(df_clean, drop_first=True, dtype=float)

In [4]:
df_dummies.head()

Unnamed: 0,horsepower,mileage,price,average_fuel_economy,manual,age,body_type_Coupe,body_type_Hatchback,body_type_Minivan,body_type_Pickup Truck,...,model_name_iA,model_name_iM,model_name_iQ,model_name_tC,model_name_xA,model_name_xD,wheel_system_display_All-Wheel Drive,wheel_system_display_Four-Wheel Drive,wheel_system_display_Front-Wheel Drive,wheel_system_display_Rear-Wheel Drive
0,184.0,25794.0,13000.0,29.5,0,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,295.0,15732.0,27300.0,21.5,0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,138.0,4580.0,15724.0,29.0,0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,180.0,25122.0,21000.0,24.0,0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,241.0,61161.0,17300.0,20.5,0,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69796 entries, 0 to 69795
Columns: 755 entries, horsepower to wheel_system_display_Rear-Wheel Drive
dtypes: float64(753), int64(2)
memory usage: 402.0 MB


# 6) Daten für Training und Testing erstellen

 X und y definieren

In [6]:
X = df_dummies.drop(["price"], axis=1)
y = df_dummies["price"]

Training und Testdaten splitten

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 7) Modell Auswahl

## 7.1  Modell 1 - Lineare Regression

7.1.1 Ohne CrossValidation

In [8]:
# Imports
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Modell erstellen und trainieren
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Prediction
y_pred = lin_reg.predict(X_test)

# Modellevaluierung
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("RMSE", np.sqrt(mse))
print("R²-Wert:", r2)

MSE: 2.7100608161446518e+17
MAE: 15458215.640967455
RMSE 520582444.5891978
R²-Wert: -1380218672.106091


7.1.2 Lineare Regression mit CrossValidation

Das macht irgendwie keinen Sinn: ??

In [10]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



# Modell erstellen
lin_reg = LinearRegression()

# Cross-Validation anwenden
# Hier verwenden wir 5-fache Cross-Validation und den R^2 Score als Bewertungsmetrik
cv_scores = cross_val_score(lin_reg, X_train, y_train, cv=5, scoring='r2')
print("Cross-Validation R^2 Scores: ", cv_scores)
print("Durchschnittlicher R^2 Score: ", np.mean(cv_scores))
print("Standardabweichung der R^2 Scores: ", np.std(cv_scores))

# Modell auf dem Trainingsdatensatz trainieren
lin_reg.fit(X_train, y_train)

# Vorhersagen auf dem Testset machen
y_pred = lin_reg.predict(X_test)

# Modellevaluierung
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("RMSE:", np.sqrt(mse))
print("R²-Wert:", r2)


Cross-Validation R^2 Scores:  [-5.43482832e+07 -4.38878280e+09 -3.05230009e+07 -6.10499410e+09
 -9.39475428e+09]
Durchschnittlicher R^2 Score:  -3994680492.6717405
Standardabweichung der R^2 Scores:  3605836923.5644464
MSE: 2.7100608161446518e+17
MAE: 15458215.640967455
RMSE: 520582444.5891978
R²-Wert: -1380218672.106091


Quelle: https://chatgpt.com/share/45ad46bc-c945-4c74-8aa2-deadc4f621b9

##7.2) Decision Tree Regressor

7.2.1 Ohne CrossValdation

In [11]:
# Imports
from sklearn.tree import DecisionTreeRegressor
# Notwendige Importe
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Decision Tree Regressor erstellen
tree_model = DecisionTreeRegressor(random_state=42)

# Modell trainieren
tree_model.fit(X_train, y_train)

# Vorhersagen auf dem Testset machen
y_pred = tree_model.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE", mae)
print("R²-Wert:", r2)

MSE: 21030324.989876304
RMSE: 4585.883228983955
MAE 3137.488838414664
R²-Wert: 0.8928937421640222


7.2.2 Mit Crossvalidation: Variante (ChatGPT Vorschlag)

In [12]:
# Notwendige Importe
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Decision Tree Regressor erstellen
tree_model = DecisionTreeRegressor(random_state=42)

# Cross-Validation Setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-Fold Cross-Validation

# Cross-Validation Scores
mse_scores = -cross_val_score(tree_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
mae_scores = -cross_val_score(tree_model, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')
r2_scores = cross_val_score(tree_model, X_train, y_train, cv=kf, scoring='r2')

# Mittelwerte der Scores
mean_mse = mse_scores.mean()
mean_mae = mae_scores.mean()
mean_rmse = np.sqrt(mean_mse)
mean_r2 = r2_scores.mean()

print("Cross-Validation MSE:", mean_mse)
print("Cross-Validation MAE:", mean_mae)
print("Cross-Validation RMSE:", mean_rmse)
print("Cross-Validation R²-Wert:", mean_r2)

Cross-Validation MSE: 21367666.8229613
Cross-Validation MAE: 3174.1945899801453
Cross-Validation RMSE: 4622.517368594877
Cross-Validation R²-Wert: 0.890939233212386


Anderer: ChatGPT Vorschlag: https://chatgpt.com/share/a0dac765-c550-4949-8887-a07e0b21a0f0

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Hyperparameter-Raster für die Optimierung
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

# Decision Tree Regressor erstellen
tree_model = DecisionTreeRegressor(random_state=42)

# GridSearchCV mit 5-facher Cross-Validation einrichten
grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# GridSearchCV auf Trainingsdaten anpassen
grid_search.fit(X_train, y_train)

# Beste Parameter und Modell
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Beste Hyperparameter:", best_params)

# Vorhersagen auf dem Testset machen
y_pred = best_model.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE", mae)
print("R²-Wert:", r2)

# Cross-Validation Scores für das beste Modell
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)

print("Cross-Validation RMSE Scores:", cv_rmse_scores)
print("Durchschnittliches Cross-Validation RMSE:", np.mean(cv_rmse_scores))


Fitting 5 folds for each of 1728 candidates, totalling 8640 fits




KeyboardInterrupt: 

7.2.3 Mit CrossValidation: Variante Vorlesung

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Parameter für GridSearchCV
parameters = {
    'max_depth': range(1, 30),
    'min_samples_leaf': [1, 10, 20, 30, 50, 100],
}

# DecisionTreeRegressor-Modell mit gültigem Kriterium initialisieren
tree_model = DecisionTreeRegressor(criterion="squared_error", random_state=1)
tree_model_CV = GridSearchCV(tree_model, parameters, cv=5)

# Trainieren Sie das Modell mit den Trainingsdaten
tree_model_CV.fit(X_train, y_train)

# Vorhersagen auf dem Testset machen
y_pred = tree_model_CV.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²-Wert:", r2)



KeyboardInterrupt: 

## 7.3) Random Forest Regressor

In [None]:
# Imports
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regressor erstellen
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Modell trainieren
forest_model.fit(X_train, y_train)

# Vorhersagen auf dem Testset machen
y_pred = forest_model.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print ("MAE", mae)
print("R²-Wert:", r2)

MSE: 15484332.355136288
RMSE: 3935.013641035605
MAE 2673.3199364479633
R²-Wert: 0.9211391695351563


Modell Exportieren

In [None]:
import pickle

# Verbinde Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Dateipfad in Google Drive
filename = '/content/drive/My Drive/finalized_model_age.sav'

# Modell exportieren
pickle.dump(forest_model, open(filename, 'wb'))

print(f"Das Modell wurde erfolgreich in der Datei '{filename}' gespeichert.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Das Modell wurde erfolgreich in der Datei '/content/drive/My Drive/finalized_model_age.sav' gespeichert.


In [None]:
mae = mean_absolute_error(y_test, y_pred)
mae

2673.3199364479633

Modell mit CrossValidation und Hyperparamentern

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42, shuffle=True)

In [None]:
# Notwendige Importe
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Parameterbereich definieren
parameters = {
    'n_estimators': [100, 200],
    'max_depth': [1, 30],
    'min_samples_split': [ 5, 10],
    'min_samples_leaf': [10,100],
}

# Konfiguriere den KFold-Generator für Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Erstelle das GridSearchCV-Objekt
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), parameters, cv= 5)

# Trainiere das Modell mit Grid-Search
grid_search.fit(X_train, y_train)

# Beste Hyperparameter
print("Beste Hyperparameter:", grid_search.best_params_)

# Trainiere das finale Modell mit den besten Hyperparametern
best_forest_model = grid_search.best_estimator_

# Vorhersagen auf dem Testset machen
y_pred = best_forest_model.predict(X_test)

#parameter printen 

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²-Wert:", r2)


## 10) Modell 4 - Gradient Boosting Regressor

In [None]:
#Import
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor erstellen
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Modell trainieren
gb_model.fit(X_train, y_train)

# Vorhersagen auf dem Testset machen
y_pred = gb_model.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mae

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²-Wert:", r2)

MSE: 23724127.863842584
RMSE: 4870.742023946925
MAE: 3440.4810292021857
R²-Wert: 0.8791743562145792


Mit CrossValidation und Hyperparametern

In [45]:
# Notwendige Importe
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Parameterbereich definieren
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Konfiguriere den KFold-Generator für Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Erstelle das GridSearchCV-Objekt
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=kf, scoring='neg_mean_squared_error', verbose=1)

# Trainiere das Modell mit Grid-Search
grid_search.fit(X_train, y_train)

# Beste Hyperparameter
print("Beste Hyperparameter:", grid_search.best_params_)

# Trainiere das finale Modell mit den besten Hyperparametern
best_gb_model = grid_search.best_estimator_

# Vorhersagen auf dem Testset machen
y_pred = best_gb_model.predict(X_test)

# Modellleistung bewerten
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²-Wert:", r2)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

Quelle CrossValidation: https://chatgpt.com/share/6fcf8984-041d-4b38-8beb-e93c2e32ccd0

## Hyperparameter-Tuning und CrossValidation

In [None]:
# Libraries
from sklearn.model_selection import KFold
# Notwendige Importe
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Parameterbereich definieren
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Konfiguriere den KFold-Generator für Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Erstelle das GridSearchCV-Objekt
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=kf, scoring='neg_mean_squared_error', verbose=1)

# Trainiere das Modell mit Grid-Search
grid_search.fit(X_train, y_train)

In [None]:
# Libraries
from sklearn.model_selection import GridSearchCV, KFold

# Parameterbereich definieren
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Konfiguriere den KFold-Generator für Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Erstelle das GridSearchCV-Objekt
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=kf, scoring='neg_mean_squared_error', verbose=1)

# Trainiere das Modell mit Grid-Search
grid_search.fit(X_train, y_train)

In [None]:
# Konfiguriere den KFold-Generator für Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Erstelle das GridSearchCV-Objekt
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=kf, scoring='neg_mean_squared_error', verbose=1)

# Trainiere das Modell mit Grid-Search
grid_search.fit(X_train, y_train)

##12) Das beste Modell auswählen und auf den ganzen Daten trainieren

## 11) Feature Importance

In [None]:
# Feature Importance oder Koeffizienten extrahieren
dt_importances = tree_model.feature_importances_
lr_coefficients = abs(lin_reg.coef_)  # Absolute Werte der Koeffizienten für lineare Regression
rf_importances = forest_model.feature_importances_
gb_importances = gb_model.feature_importances_

# Features aus den Daten
features = X_train.columns

# DataFrames erstellen für jede Modellart
dt_importance_df = pd.DataFrame({'Feature': features, 'Importance': dt_importances})
lr_importance_df = pd.DataFrame({'Feature': features, 'Coefficient': lr_coefficients})
rf_importance_df = pd.DataFrame({'Feature': features, 'Importance': rf_importances})
gb_importance_df = pd.DataFrame({'Feature': features, 'Importance': gb_importances})

# Top 10 Features sortieren
top10_dt = dt_importance_df.sort_values(by='Importance', ascending=False).head(10)
top10_lr = lr_importance_df.sort_values(by='Coefficient', ascending=False).head(10)
top10_rf = rf_importance_df.sort_values(by='Importance', ascending=False).head(10)
top10_gb = gb_importance_df.sort_values(by='Importance', ascending=False).head(10)

# Ergebnisse ausgeben
print("\nTop 10 Features Lineare Regression:")
print(top10_lr)
print("Top 10 Features Decision Tree:")
print(top10_dt)
print("\nTop 10 Features Random Forest:")
print(top10_rf)
print("\nTop 10 Features Gradient Boosting:")
print(top10_gb)
