In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error

import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import optuna
import tqdm as notebook_tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Datei öffnen und lesen
#with open('data_description.txt', 'r') as file:
#    content = file.read()

# Inhalt als Markdown anzeigen
#display(Markdown(content))

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
missing_values_count = train.isnull().sum()
missing_values_percent = (train.isnull().sum() / len(train)) * 100

# Umwandeln des Ergebnisses in ein DataFrame
missing_data_df = pd.DataFrame({
    'Missing Values': missing_values_count,
    'Percent Missing': missing_values_percent
})

missing_data_df = missing_data_df[missing_data_df['Missing Values'] > 0]
missing_data_df

Unnamed: 0,Missing Values,Percent Missing
LotFrontage,259,17.739726
Alley,1369,93.767123
MasVnrType,872,59.726027
MasVnrArea,8,0.547945
BsmtQual,37,2.534247
BsmtCond,37,2.534247
BsmtExposure,38,2.60274
BsmtFinType1,37,2.534247
BsmtFinType2,38,2.60274
Electrical,1,0.068493


In [6]:
unique_values_count = train['BsmtQual'].value_counts()

# Ausgabe des Ergebnisses
print(unique_values_count)

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64


In [7]:
data = train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

for column in data.select_dtypes(include=['float64', 'int64']).columns:
    if data[column].isnull().mean() > 0:
        data[column] = data[column].fillna(data[column].mean())

for column in data.select_dtypes(include=['object']).columns:
    if data[column].isnull().mean() > 0:
        data[column] = data[column].fillna(data[column].mode()[0])

data = pd.get_dummies(data, drop_first=True)

missing_values_after = data.isnull().sum().sum()
print(f"Missing values after preprocessing: {missing_values_after}")

Missing values after preprocessing: 0


In [8]:
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialisieren und Trainieren des XGBoost Regressors
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                          colsample_bytree=0.3,
                          learning_rate=0.1,
                          max_depth=5,
                          alpha=10,
                          n_estimators=500)
xg_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred = xg_reg.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"XGBoost RMSE: {rmse:.2f}")


XGBoost RMSE: 25913.94


In [9]:
# Initialisieren und Trainieren des RandomForestRegressors
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred_rf = rf_reg.predict(X_test)
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest RMSE: {rmse_rf:.2f}")


Random Forest RMSE: 28711.87


In [10]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

# Initialisieren des Keras-Modells
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Kompilieren des Modells
model.compile(optimizer='adam', loss='mean_squared_error')

# Trainieren des Modells
model.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=1)

# Vorhersagen und Berechnen des RMSE
y_pred_nn = model.predict(X_test)
rmse_nn = root_mean_squared_error(y_test, y_pred_nn)
print(f"Keras NN RMSE: {rmse_nn:.2f}")


Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 35244638208.0000 - val_loss: 34148681728.0000
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 939us/step - loss: 32269766656.0000 - val_loss: 24129609728.0000
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 958us/step - loss: 21063653376.0000 - val_loss: 8924145664.0000
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 915us/step - loss: 8711989248.0000 - val_loss: 4183036672.0000
Epoch 5/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 916us/step - loss: 7712146944.0000 - val_loss: 3671467776.0000
Epoch 6/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step - loss: 7167407616.0000 - val_loss: 3337244672.0000
Epoch 7/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 917us/step - loss: 4546073600.0000 - val_loss: 3112493056.0000
Epoch 8/100
[1m30/30[0

In [11]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

# Initialisieren der Modelle
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost Regressor": xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100, random_state=42)
}

# Trainieren und Evaluieren der Modelle
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"{name} RMSE: {rmse:.2f}")


Linear Regression RMSE: 60989.88
Ridge Regression RMSE: 30065.05
Lasso Regression RMSE: 51803.62
Decision Tree Regressor RMSE: 43192.41
Random Forest Regressor RMSE: 28711.87
Gradient Boosting Regressor RMSE: 28316.66
XGBoost Regressor RMSE: 26023.49


In [12]:
# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

In [13]:
ridge_params = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

ridge = Ridge()
ridge_grid = GridSearchCV(ridge, ridge_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid.fit(X_train, y_train)

best_ridge = ridge_grid.best_estimator_
ridge_rmse = root_mean_squared_error(y_test, best_ridge.predict(X_test))
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Best Ridge Regression RMSE: 30625.29




In [14]:
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
rf_grid = RandomizedSearchCV(rf, rf_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
rf_rmse = root_mean_squared_error(y_test, best_rf.predict(X_test))
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


Best Random Forest RMSE: 28899.30


In [15]:
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb = GradientBoostingRegressor(random_state=42)
gb_grid = RandomizedSearchCV(gb, gb_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
gb_grid.fit(X_train, y_train)

best_gb = gb_grid.best_estimator_
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


Best Gradient Boosting RMSE: 26374.18


In [16]:
xgb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_grid = RandomizedSearchCV(xgb_reg, xgb_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
xgb_grid.fit(X_train, y_train)

best_xgb = xgb_grid.best_estimator_
xgb_rmse = mean_squared_error(y_test, best_xgb.predict(X_test), squared=False)
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best XGBoost RMSE: 25682.01


In [17]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 30625.29
Best Random Forest RMSE: 28899.30
Best Gradient Boosting RMSE: 26374.18
Best XGBoost RMSE: 25682.01


In [18]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

In [19]:
def objective_ridge(trial):
    alpha = trial.suggest_loguniform('alpha', 0.1, 100.0)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    
    model = Ridge(alpha=alpha, solver=solver)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_ridge = optuna.create_study(direction='minimize')
study_ridge.optimize(objective_ridge, n_trials=50, n_jobs=-1)

best_params_ridge = study_ridge.best_params
best_ridge = Ridge(**best_params_ridge)
best_ridge.fit(X_train, y_train)
ridge_rmse = mean_squared_error(y_test, best_ridge.predict(X_test), squared=False)
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")


[I 2024-06-18 20:52:06,239] A new study created in memory with name: no-name-d1b5774f-b3a6-4118-ae56-2f1d80b3969d
[I 2024-06-18 20:52:06,757] Trial 2 finished with value: 40412.6875 and parameters: {'alpha': 0.910052609527051, 'solver': 'lsqr'}. Best is trial 2 with value: 40412.6875.
[I 2024-06-18 20:52:07,507] Trial 8 finished with value: 40412.71484375 and parameters: {'alpha': 0.6179970056402856, 'solver': 'lsqr'}. Best is trial 2 with value: 40412.6875.
[I 2024-06-18 20:52:07,556] Trial 4 finished with value: 40412.69921875 and parameters: {'alpha': 0.2704796366889385, 'solver': 'lsqr'}. Best is trial 2 with value: 40412.6875.
[I 2024-06-18 20:52:09,323] Trial 7 finished with value: 43379.76171875 and parameters: {'alpha': 42.57079429944377, 'solver': 'sag'}. Best is trial 2 with value: 40412.6875.
[I 2024-06-18 20:52:09,324] Trial 0 finished with value: 43380.390625 and parameters: {'alpha': 0.8482496372775298, 'solver': 'sag'}. Best is trial 2 with value: 40412.6875.
[I 2024-06-

Best Ridge Regression RMSE: 30063.90


In [20]:
# Definieren der Parameter für Bayesian Optimization
gb_params = {
    'n_estimators': Integer(50, 300),
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 10),
    'subsample': Real(0.7, 1.0),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 4)
}

# Initialisieren des Gradient Boosting Regressors
gb = GradientBoostingRegressor(random_state=42)

# Bayesian Optimization mit BayesSearchCV
gb_opt = BayesSearchCV(gb, gb_params, n_iter=32, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
gb_opt.fit(X_train, y_train)

# Bestes Modell und RMSE berechnen
best_gb = gb_opt.best_estimator_
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


Best Gradient Boosting RMSE: 26218.34


In [21]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 30063.90
Best Random Forest RMSE: 28899.30
Best Gradient Boosting RMSE: 26218.34
Best XGBoost RMSE: 25682.01


In [22]:
def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=50, n_jobs=-1)
best_params_rf = study_rf.best_params

best_rf = RandomForestRegressor(
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)
best_rf.fit(X_train, y_train)
rf_rmse = mean_squared_error(y_test, best_rf.predict(X_test), squared=False)
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


[I 2024-06-18 20:54:48,262] A new study created in memory with name: no-name-08f2cf99-99f1-4590-a5d4-5d25279ca827
[I 2024-06-18 20:54:49,029] Trial 6 finished with value: 30740.802740359533 and parameters: {'n_estimators': 54, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 6 with value: 30740.802740359533.
[I 2024-06-18 20:54:49,274] Trial 1 finished with value: 30360.06486732964 and parameters: {'n_estimators': 53, 'max_depth': 43, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 1 with value: 30360.06486732964.
[I 2024-06-18 20:54:50,165] Trial 9 finished with value: 30424.430938349724 and parameters: {'n_estimators': 57, 'max_depth': 37, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 1 with value: 30360.06486732964.
[I 2024-06-18 20:54:50,219] Trial 5 finished with value: 29465.231761614563 and parameters: {'n_estimators': 106, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 5 with value: 2946

Best Random Forest RMSE: 28390.75


In [23]:
def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=50, n_jobs=-1)
best_params_rf = study_rf.best_params

best_rf = RandomForestRegressor(
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)
best_rf.fit(X_train, y_train)
rf_rmse = mean_squared_error(y_test, best_rf.predict(X_test), squared=False)
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


[I 2024-06-18 20:55:36,861] A new study created in memory with name: no-name-5f1f44e9-975a-4b27-a8c1-3a4a16b31c71


[I 2024-06-18 20:55:38,523] Trial 5 finished with value: 31492.097725202875 and parameters: {'n_estimators': 71, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 5 with value: 31492.097725202875.
[I 2024-06-18 20:55:38,640] Trial 3 finished with value: 31891.583447554138 and parameters: {'n_estimators': 77, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 5 with value: 31492.097725202875.
[I 2024-06-18 20:55:39,179] Trial 1 finished with value: 29548.61623143314 and parameters: {'n_estimators': 81, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 29548.61623143314.
[I 2024-06-18 20:55:39,339] Trial 4 finished with value: 33693.41586040484 and parameters: {'n_estimators': 151, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 1 with value: 29548.61623143314.
[I 2024-06-18 20:55:39,942] Trial 2 finished with value: 30032.696837011106 and parameters: {'n_est

Best Random Forest RMSE: 28314.02


In [24]:
def objective_gb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_gb = optuna.create_study(direction='minimize')
study_gb.optimize(objective_gb, n_trials=50, n_jobs=-1)
best_params_gb = study_gb.best_params

best_gb = GradientBoostingRegressor(
    n_estimators=best_params_gb['n_estimators'],
    learning_rate=best_params_gb['learning_rate'],
    max_depth=best_params_gb['max_depth'],
    subsample=best_params_gb['subsample'],
    min_samples_split=best_params_gb['min_samples_split'],
    min_samples_leaf=best_params_gb['min_samples_leaf'],
    random_state=42
)
best_gb.fit(X_train, y_train)
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


[I 2024-06-18 20:56:13,755] A new study created in memory with name: no-name-61345457-54fe-4ca4-8b4b-06e30d0ce650


[I 2024-06-18 20:56:15,426] Trial 1 finished with value: 27924.631787909704 and parameters: {'n_estimators': 59, 'learning_rate': 0.06003709314881805, 'max_depth': 9, 'subsample': 0.9030725102232159, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 1 with value: 27924.631787909704.
[I 2024-06-18 20:56:15,673] Trial 6 finished with value: 26710.30184441344 and parameters: {'n_estimators': 153, 'learning_rate': 0.05785785839584269, 'max_depth': 4, 'subsample': 0.8469130834819931, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 6 with value: 26710.30184441344.
[I 2024-06-18 20:56:15,755] Trial 2 finished with value: 48947.080234240464 and parameters: {'n_estimators': 67, 'learning_rate': 0.011799624126393867, 'max_depth': 9, 'subsample': 0.9691784789387252, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 6 with value: 26710.30184441344.
[I 2024-06-18 20:56:17,054] Trial 5 finished with value: 26526.38545384522 and parameters: {'n_estimators': 164,

Best Gradient Boosting RMSE: 23709.24


In [25]:
def objective_gb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 800)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_gb = optuna.create_study(direction='minimize')
study_gb.optimize(objective_gb, n_trials=50, n_jobs=-1)
best_params_gb = study_gb.best_params

best_gb = GradientBoostingRegressor(
    n_estimators=best_params_gb['n_estimators'],
    learning_rate=best_params_gb['learning_rate'],
    max_depth=best_params_gb['max_depth'],
    subsample=best_params_gb['subsample'],
    min_samples_split=best_params_gb['min_samples_split'],
    min_samples_leaf=best_params_gb['min_samples_leaf'],
    random_state=42
)
best_gb.fit(X_train, y_train)
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


[I 2024-06-18 20:56:31,395] A new study created in memory with name: no-name-58216d7f-2e4e-4c55-b253-9df3d03daceb
[I 2024-06-18 20:56:32,881] Trial 4 finished with value: 32733.578440262143 and parameters: {'n_estimators': 137, 'learning_rate': 0.01405064267357732, 'max_depth': 5, 'subsample': 0.8058331082049244, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 4 with value: 32733.578440262143.
[I 2024-06-18 20:56:33,260] Trial 1 finished with value: 29092.411939777758 and parameters: {'n_estimators': 265, 'learning_rate': 0.013660150118139446, 'max_depth': 3, 'subsample': 0.7498589867439706, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 1 with value: 29092.411939777758.
[I 2024-06-18 20:56:33,869] Trial 7 finished with value: 26883.4615991445 and parameters: {'n_estimators': 167, 'learning_rate': 0.09098905971921131, 'max_depth': 6, 'subsample': 0.9164776488947898, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 7 with value: 26883.461599144

Best Gradient Boosting RMSE: 24032.39


In [26]:
def objective_xgb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.7, 1.0)
    
    model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective='reg:squarederror',
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    return rmse

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50, n_jobs=-1)
best_params_xgb = study_xgb.best_params

best_xgb = xgb.XGBRegressor(
    n_estimators=best_params_xgb['n_estimators'],
    learning_rate=best_params_xgb['learning_rate'],
    max_depth=best_params_xgb['max_depth'],
    subsample=best_params_xgb['subsample'],
    colsample_bytree=best_params_xgb['colsample_bytree'],
    objective='reg:squarederror',
    random_state=42
)
best_xgb.fit(X_train, y_train)
xgb_rmse = root_mean_squared_error(y_test, best_xgb.predict(X_test))
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


[I 2024-06-18 20:57:15,259] A new study created in memory with name: no-name-6277ad67-6436-490e-8e4f-bbc3c60a8b2e
[I 2024-06-18 20:57:16,324] Trial 0 finished with value: 27104.0078125 and parameters: {'n_estimators': 93, 'learning_rate': 0.09448812676075298, 'max_depth': 3, 'subsample': 0.994293259288344, 'colsample_bytree': 0.8909285122708718}. Best is trial 0 with value: 27104.0078125.
[I 2024-06-18 20:57:17,469] Trial 3 finished with value: 26736.0078125 and parameters: {'n_estimators': 98, 'learning_rate': 0.04628907011407666, 'max_depth': 8, 'subsample': 0.7738181919087406, 'colsample_bytree': 0.913234529796185}. Best is trial 3 with value: 26736.0078125.
[I 2024-06-18 20:57:17,680] Trial 8 finished with value: 41804.484375 and parameters: {'n_estimators': 120, 'learning_rate': 0.011463339469293794, 'max_depth': 4, 'subsample': 0.9622196308155149, 'colsample_bytree': 0.8949784454743753}. Best is trial 3 with value: 26736.0078125.
[I 2024-06-18 20:57:18,261] Trial 5 finished with 

Best XGBoost RMSE: 24277.32


In [27]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 30063.90
Best Random Forest RMSE: 28314.02
Best Gradient Boosting RMSE: 24032.39
Best XGBoost RMSE: 24277.32


***
### Anwenden Models

In [28]:
# Vorbereitungen zur Behandlung der fehlenden Werte im Testdatensatz
test_data = test.copy()

for column in test_data.select_dtypes(include=['float64', 'int64']).columns:
    if test_data[column].isnull().mean() > 0:
        test_data[column] = test_data[column].fillna(test_data[column].mean())

for column in test_data.select_dtypes(include=['object']).columns:
    if test_data[column].isnull().mean() > 0:
        test_data[column] = test_data[column].fillna('Missing')

test_data = pd.get_dummies(test_data, drop_first=True)

missing_cols = set(X_train.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[X_train.columns]

X_test_final = test_data.astype(np.float32).values

# Sicherstellen, dass 'Id' in den ursprünglichen Testdaten vorhanden ist
test_ids = test['Id'].values


In [29]:
# Vorhersagen mit dem Ridge-Modell
y_pred_ridge = best_ridge.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Ridge-Modell)
submission_ridge = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_ridge
})

# Exportieren als CSV
submission_ridge.to_csv('submission_ridge.csv', index=False)


In [30]:
# Vorhersagen mit dem Random Forest-Modell
y_pred_rf = best_rf.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Random Forest-Modell)
submission_rf = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_rf
})

# Exportieren als CSV
submission_rf.to_csv('submission_rf.csv', index=False)


In [31]:
# Vorhersagen mit dem Gradient Boosting-Modell
y_pred_gb = best_gb.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Gradient Boosting-Modell)
submission_gb = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_gb
})

# Exportieren als CSV
submission_gb.to_csv('submission_gb.csv', index=False)


In [32]:
# Vorhersagen mit dem XGBoost-Modell
y_pred_xgboost = best_xgb.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von XGBoost-Modell)
submission_xgboost = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_xgboost
})

# Exportieren als CSV
submission_xgboost.to_csv('submission_xgboost.csv', index=False)
