In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import optuna
import tqdm as notebook_tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Datei öffnen und lesen
#with open('data_description.txt', 'r') as file:
#    content = file.read()

# Inhalt als Markdown anzeigen
#display(Markdown(content))

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
missing_values_count = train.isnull().sum()
missing_values_percent = (train.isnull().sum() / len(train)) * 100

# Umwandeln des Ergebnisses in ein DataFrame
missing_data_df = pd.DataFrame({
    'Missing Values': missing_values_count,
    'Percent Missing': missing_values_percent
})

missing_data_df = missing_data_df[missing_data_df['Missing Values'] > 0]
missing_data_df

Unnamed: 0,Missing Values,Percent Missing
LotFrontage,259,17.739726
Alley,1369,93.767123
MasVnrType,872,59.726027
MasVnrArea,8,0.547945
BsmtQual,37,2.534247
BsmtCond,37,2.534247
BsmtExposure,38,2.60274
BsmtFinType1,37,2.534247
BsmtFinType2,38,2.60274
Electrical,1,0.068493


## Missing value imputation

In [7]:
unique_values_count = train['BsmtQual'].value_counts()

# Ausgabe des Ergebnisses
print(unique_values_count)

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64


### Drop Columns which have a missing value score over 90 percent

In [8]:
data = train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

In [9]:
data['MasVnrType']

0       BrkFace
1           NaN
2       BrkFace
3           NaN
4       BrkFace
         ...   
1455        NaN
1456      Stone
1457        NaN
1458        NaN
1459        NaN
Name: MasVnrType, Length: 1460, dtype: object

In [10]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Load the data
train_data_path = 'train.csv'  # Replace with your actual data path
test_data_path = 'test.csv'    # Replace with your actual data path
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)

# Drop columns with more than 90% missing values
threshold = 0.90
df_train = df_train.loc[:, df_train.isnull().mean() < threshold]
df_test = df_test.loc[:, df_test.isnull().mean() < threshold]

# Drop the target variable 'SalePrice' from train data
if 'SalePrice' in df_train.columns:
    df_train = df_train.drop(columns=['SalePrice'])

# Identify columns with missing values
missing_cols_train = df_train.columns[df_train.isnull().any()]
missing_cols_test = df_test.columns[df_test.isnull().any()]

# Identify categorical and numerical columns
categorical_cols_train = df_train.select_dtypes(include=['object']).columns
numerical_cols_train = df_train.select_dtypes(exclude=['object']).columns

categorical_cols_test = df_test.select_dtypes(include=['object']).columns
numerical_cols_test = df_test.select_dtypes(exclude=['object']).columns

# One-Hot Encode categorical variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
df_train_encoded = pd.DataFrame(encoder.fit_transform(df_train[categorical_cols_train]))
df_test_encoded = pd.DataFrame(encoder.transform(df_test[categorical_cols_test]))

# Restore the column names for the encoded features
df_train_encoded.columns = encoder.get_feature_names_out(categorical_cols_train)
df_test_encoded.columns = encoder.get_feature_names_out(categorical_cols_test)

# Drop original categorical columns and append the encoded columns
df_train = df_train.drop(columns=categorical_cols_train).reset_index(drop=True)
df_test = df_test.drop(columns=categorical_cols_test).reset_index(drop=True)
df_train = pd.concat([df_train, df_train_encoded], axis=1)
df_test = pd.concat([df_test, df_test_encoded], axis=1)

# List of imputation models to test
imputation_models = {
    'Mean Imputer': SimpleImputer(strategy='mean'),
    'Median Imputer': SimpleImputer(strategy='median'),
    'KNN Imputer': KNNImputer(n_neighbors=5),
    'Iterative Imputer (Bayesian Ridge)': IterativeImputer(estimator=BayesianRidge(), random_state=42),
    'Iterative Imputer (Random Forest)': IterativeImputer(estimator=RandomForestRegressor(), random_state=42),
    #'Iterative Imputer (Decision Tree)': IterativeImputer(estimator=DecisionTreeRegressor(), random_state=42),
}

# Function to evaluate imputation models
def evaluate_imputation_models(models, train_data, test_data, target):
    results = {}
    for name, model in models.items():
        # Split the data into features and target
        X_train = train_data.drop(columns=[target])
        y_train = train_data[target]
        X_test = test_data.drop(columns=[target])
        y_test = test_data[target]

        # Fit the model on training data
        imputer = model.fit(X_train, y_train)
        # Transform both training and test data
        imputed_train_data = imputer.transform(X_train)
        imputed_test_data = imputer.transform(X_test)

        # Combine the imputed data with the target column
        imputed_train_data = pd.DataFrame(imputed_train_data, columns=X_train.columns)
        imputed_test_data = pd.DataFrame(imputed_test_data, columns=X_test.columns)
        imputed_train_data[target] = y_train.values
        imputed_test_data[target] = y_test.values

        # Calculate MSE for the imputation
        mse = mean_squared_error(y_test.dropna(), imputed_test_data[target].dropna())
        results[name] = mse
    return results

# Evaluate models for each missing column
imputation_results = {}
best_models = {}
for col in missing_cols_train.union(missing_cols_test):
    if col in df_train.columns:
        # Drop rows where the target variable is missing in training data
        train_data = df_train.dropna(subset=[col]).copy()
        test_data = df_test.dropna(subset=[col]).copy()
        
        # Check if there are still missing values in the test data for the target column
        if train_data[col].isnull().any() or test_data[col].isnull().any():
            continue
        
        # Evaluate imputation models
        results = evaluate_imputation_models(imputation_models, train_data, test_data, col)
        imputation_results[col] = results
        best_model = min(results, key=results.get)
        best_models[col] = best_model

# Display the results
print("Imputation Results:")
for col, results in imputation_results.items():
    print(f"\nColumn: {col}")
    for model_name, mse in results.items():
        print(f"{model_name}: MSE = {mse}")

print("\nBest Models for Each Column:")
for col, best_model in best_models.items():
    print(f"Column: {col} -> Best Model: {best_model}")


Imputation Results:

Column: BsmtFinSF1
Mean Imputer: MSE = 0.0
Median Imputer: MSE = 0.0
KNN Imputer: MSE = 0.0
Iterative Imputer (Bayesian Ridge): MSE = 0.0
Iterative Imputer (Random Forest): MSE = 0.0

Column: BsmtFinSF2
Mean Imputer: MSE = 0.0
Median Imputer: MSE = 0.0
KNN Imputer: MSE = 0.0
Iterative Imputer (Bayesian Ridge): MSE = 0.0
Iterative Imputer (Random Forest): MSE = 0.0

Column: BsmtFullBath
Mean Imputer: MSE = 0.0
Median Imputer: MSE = 0.0
KNN Imputer: MSE = 0.0
Iterative Imputer (Bayesian Ridge): MSE = 0.0
Iterative Imputer (Random Forest): MSE = 0.0

Column: BsmtHalfBath
Mean Imputer: MSE = 0.0
Median Imputer: MSE = 0.0
KNN Imputer: MSE = 0.0
Iterative Imputer (Bayesian Ridge): MSE = 0.0
Iterative Imputer (Random Forest): MSE = 0.0

Column: BsmtUnfSF
Mean Imputer: MSE = 0.0
Median Imputer: MSE = 0.0
KNN Imputer: MSE = 0.0
Iterative Imputer (Bayesian Ridge): MSE = 0.0
Iterative Imputer (Random Forest): MSE = 0.0

Column: GarageArea
Mean Imputer: MSE = 0.0
Median Impute

In [11]:
for col, best_model in best_models.items():
    print(f"Column: {col} -> Best Model: {best_model}")

Column: BsmtFinSF1 -> Best Model: Mean Imputer
Column: BsmtFinSF2 -> Best Model: Mean Imputer
Column: BsmtFullBath -> Best Model: Mean Imputer
Column: BsmtHalfBath -> Best Model: Mean Imputer
Column: BsmtUnfSF -> Best Model: Mean Imputer
Column: GarageArea -> Best Model: Mean Imputer
Column: GarageCars -> Best Model: Mean Imputer
Column: GarageYrBlt -> Best Model: Mean Imputer
Column: LotFrontage -> Best Model: Mean Imputer
Column: MasVnrArea -> Best Model: Mean Imputer
Column: TotalBsmtSF -> Best Model: Mean Imputer


In [12]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor

# Define a function to get the appropriate imputer model
def get_imputer(model_name):
    if model_name == 'Mean Imputer':
        return SimpleImputer(strategy='mean')
    elif model_name == 'Median Imputer':
        return SimpleImputer(strategy='median')
    elif model_name == 'KNN Imputer':
        return KNNImputer(n_neighbors=5)
    elif model_name == 'Iterative Imputer (Bayesian Ridge)':
        return IterativeImputer(estimator=BayesianRidge(), random_state=42)
    elif model_name == 'Iterative Imputer (Random Forest)':
        return IterativeImputer(estimator=RandomForestRegressor(), random_state=42)
    elif model_name == 'Iterative Imputer (Decision Tree)':
        return IterativeImputer(estimator=DecisionTreeRegressor(), random_state=42)
    else:
        raise ValueError(f"Unknown model name: {model_name}")

# Load the original train data to get the SalePrice column
df_train_original = pd.read_csv(train_data_path)

# Function to fill missing values using the best model for each column
def fill_missing_values(df, best_models):
    for col in best_models:
        best_model_name = best_models[col]
        imputer = get_imputer(best_model_name)
        
        # Fit the imputer on the data
        df[col] = imputer.fit_transform(df[[col]])
    
    return df

# Impute missing values separately for train and test data
df_train_imputed = fill_missing_values(df_train.copy(), best_models)
df_test_imputed = fill_missing_values(df_test.copy(), best_models)

# Add the SalePrice column back to the imputed train data using the 'Id' column
df_train_imputed = df_train_imputed.set_index('Id')
df_train_original = df_train_original.set_index('Id')
df_train_imputed['SalePrice'] = df_train_original['SalePrice']

# Reset the index to default
df_train_imputed.reset_index(inplace=True)
df_test_imputed.reset_index(inplace=True)

# Save the updated train and test datasets to new CSV files
df_train_imputed.to_csv('train_filled_with_SalePrice.csv', index=False)
df_test_imputed.to_csv('test_filled.csv', index=False)

print("SalePrice column added to the imputed train dataset and saved to 'train_filled_with_SalePrice.csv'")
print("Imputed test dataset saved to 'test_filled.csv'")


SalePrice column added to the imputed train dataset and saved to 'train_filled_with_SalePrice.csv'
Imputed test dataset saved to 'test_filled.csv'


In [13]:
#data = pd.get_dummies(data, drop_first=True)

missing_values_after = df_train_imputed.isnull().sum().sum()
print(f"Missing values after preprocessing: {missing_values_after}")

Missing values after preprocessing: 0


## Model Engineering

In [14]:
data = df_train_imputed

In [15]:
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialisieren und Trainieren des XGBoost Regressors
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                          colsample_bytree=0.3,
                          learning_rate=0.1,
                          max_depth=5,
                          alpha=10,
                          n_estimators=500)
xg_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred = xg_reg.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"XGBoost RMSE: {rmse:.2f}")


XGBoost RMSE: 25153.38


In [16]:
# Initialisieren und Trainieren des RandomForestRegressors
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred_rf = rf_reg.predict(X_test)
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest RMSE: {rmse_rf:.2f}")


Random Forest RMSE: 29220.98


In [17]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

# Initialisieren des Keras-Modells
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Kompilieren des Modells
model.compile(optimizer='adam', loss='mean_squared_error')

# Trainieren des Modells
model.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=1)

# Vorhersagen und Berechnen des RMSE
y_pred_nn = model.predict(X_test)
rmse_nn = root_mean_squared_error(y_test, y_pred_nn)
print(f"Keras NN RMSE: {rmse_nn:.2f}")


Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 38131437568.0000 - val_loss: 34571628544.0000
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 34148986880.0000 - val_loss: 23237545984.0000
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 981us/step - loss: 18425634816.0000 - val_loss: 5722854912.0000
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 950us/step - loss: 7442180608.0000 - val_loss: 3511546368.0000
Epoch 5/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 942us/step - loss: 7989471744.0000 - val_loss: 3358743552.0000
Epoch 6/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 925us/step - loss: 5330455040.0000 - val_loss: 3085826816.0000
Epoch 7/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 939us/step - loss: 5631615488.0000 - val_loss: 2910709248.0000
Epoch 8/100
[1m30/30[0

In [18]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

# Initialisieren der Modelle
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost Regressor": xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100, random_state=42)
}

# Trainieren und Evaluieren der Modelle
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"{name} RMSE: {rmse:.2f}")


Linear Regression RMSE: 34873.20
Ridge Regression RMSE: 29349.26
Lasso Regression RMSE: 28349.81
Decision Tree Regressor RMSE: 44601.12
Random Forest Regressor RMSE: 29220.98
Gradient Boosting Regressor RMSE: 27140.03
XGBoost Regressor RMSE: 26168.06


In [19]:
# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

In [20]:
ridge_params = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

ridge = Ridge()
ridge_grid = GridSearchCV(ridge, ridge_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid.fit(X_train, y_train)

best_ridge = ridge_grid.best_estimator_
ridge_rmse = root_mean_squared_error(y_test, best_ridge.predict(X_test))
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Best Ridge Regression RMSE: 30479.78


In [21]:
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
rf_grid = RandomizedSearchCV(rf, rf_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
rf_rmse = root_mean_squared_error(y_test, best_rf.predict(X_test))
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


Best Random Forest RMSE: 28854.85


In [22]:
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb = GradientBoostingRegressor(random_state=42)
gb_grid = RandomizedSearchCV(gb, gb_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
gb_grid.fit(X_train, y_train)

best_gb = gb_grid.best_estimator_
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


Best Gradient Boosting RMSE: 26389.63


In [23]:
xgb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_grid = RandomizedSearchCV(xgb_reg, xgb_params, cv=5, scoring='neg_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
xgb_grid.fit(X_train, y_train)

best_xgb = xgb_grid.best_estimator_
xgb_rmse = mean_squared_error(y_test, best_xgb.predict(X_test), squared=False)
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best XGBoost RMSE: 26977.62


In [24]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 30479.78
Best Random Forest RMSE: 28854.85
Best Gradient Boosting RMSE: 26389.63
Best XGBoost RMSE: 26977.62


In [25]:
# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

In [26]:
def objective_ridge(trial):
    alpha = trial.suggest_loguniform('alpha', 0.1, 100.0)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    
    model = Ridge(alpha=alpha, solver=solver)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_ridge = optuna.create_study(direction='minimize')
study_ridge.optimize(objective_ridge, n_trials=50, n_jobs=-1)

best_params_ridge = study_ridge.best_params
best_ridge = Ridge(**best_params_ridge)
best_ridge.fit(X_train, y_train)
ridge_rmse = mean_squared_error(y_test, best_ridge.predict(X_test), squared=False)
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")


[I 2024-06-21 18:22:27,290] A new study created in memory with name: no-name-03088e28-74ef-41b9-90d3-993645b67388
[I 2024-06-21 18:22:28,069] Trial 7 finished with value: 34514.71875 and parameters: {'alpha': 0.2107060758643458, 'solver': 'sparse_cg'}. Best is trial 7 with value: 34514.71875.
[I 2024-06-21 18:22:28,201] Trial 3 finished with value: 34506.86328125 and parameters: {'alpha': 0.34829860250427097, 'solver': 'sparse_cg'}. Best is trial 3 with value: 34506.86328125.
[I 2024-06-21 18:22:29,069] Trial 6 finished with value: 34515.9140625 and parameters: {'alpha': 24.115126381871317, 'solver': 'sparse_cg'}. Best is trial 3 with value: 34506.86328125.
[I 2024-06-21 18:22:31,538] Trial 4 finished with value: 43380.43359375 and parameters: {'alpha': 9.675794280284423, 'solver': 'sag'}. Best is trial 3 with value: 34506.86328125.
[I 2024-06-21 18:22:31,546] Trial 1 finished with value: 43380.29296875 and parameters: {'alpha': 5.019367958719341, 'solver': 'sag'}. Best is trial 3 with

Best Ridge Regression RMSE: 28827.65


In [27]:
# Definieren der Parameter für Bayesian Optimization
gb_params = {
    'n_estimators': Integer(50, 300),
    'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 10),
    'subsample': Real(0.7, 1.0),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 4)
}

# Initialisieren des Gradient Boosting Regressors
gb = GradientBoostingRegressor(random_state=42)

# Bayesian Optimization mit BayesSearchCV
gb_opt = BayesSearchCV(gb, gb_params, n_iter=32, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
gb_opt.fit(X_train, y_train)

# Bestes Modell und RMSE berechnen
best_gb = gb_opt.best_estimator_
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


Best Gradient Boosting RMSE: 27324.99


In [28]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 28827.65
Best Random Forest RMSE: 28854.85
Best Gradient Boosting RMSE: 27324.99
Best XGBoost RMSE: 26977.62


In [29]:
def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=50, n_jobs=-1)
best_params_rf = study_rf.best_params

best_rf = RandomForestRegressor(
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)
best_rf.fit(X_train, y_train)
rf_rmse = mean_squared_error(y_test, best_rf.predict(X_test), squared=False)
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


[I 2024-06-21 18:25:55,143] A new study created in memory with name: no-name-4c749431-ed65-4296-aea3-0f8f948910a7
[I 2024-06-21 18:25:57,022] Trial 3 finished with value: 29633.545684343488 and parameters: {'n_estimators': 88, 'max_depth': 25, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 3 with value: 29633.545684343488.
[I 2024-06-21 18:25:57,665] Trial 6 finished with value: 30008.55948570428 and parameters: {'n_estimators': 127, 'max_depth': 41, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 3 with value: 29633.545684343488.
[I 2024-06-21 18:25:57,972] Trial 0 finished with value: 30880.747874100678 and parameters: {'n_estimators': 161, 'max_depth': 46, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 3 with value: 29633.545684343488.
[I 2024-06-21 18:25:58,069] Trial 2 finished with value: 29717.171400022697 and parameters: {'n_estimators': 145, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 3 with value: 

Best Random Forest RMSE: 28804.95


In [30]:
def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=50, n_jobs=-1)
best_params_rf = study_rf.best_params

best_rf = RandomForestRegressor(
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)
best_rf.fit(X_train, y_train)
rf_rmse = mean_squared_error(y_test, best_rf.predict(X_test), squared=False)
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")


[I 2024-06-21 18:26:26,862] A new study created in memory with name: no-name-3be7c142-3507-4bba-b977-eb4e9c7f9a66
[I 2024-06-21 18:26:28,038] Trial 7 finished with value: 30158.440419390045 and parameters: {'n_estimators': 61, 'max_depth': 42, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 7 with value: 30158.440419390045.
[I 2024-06-21 18:26:28,072] Trial 0 finished with value: 30128.956557459784 and parameters: {'n_estimators': 54, 'max_depth': 36, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 0 with value: 30128.956557459784.
[I 2024-06-21 18:26:28,407] Trial 6 finished with value: 30138.141218524284 and parameters: {'n_estimators': 79, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 30128.956557459784.
[I 2024-06-21 18:26:28,742] Trial 1 finished with value: 29843.197971878828 and parameters: {'n_estimators': 90, 'max_depth': 44, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 1 with value: 29

Best Random Forest RMSE: 28834.45


In [31]:
def objective_gb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_gb = optuna.create_study(direction='minimize')
study_gb.optimize(objective_gb, n_trials=50, n_jobs=-1)
best_params_gb = study_gb.best_params

best_gb = GradientBoostingRegressor(
    n_estimators=best_params_gb['n_estimators'],
    learning_rate=best_params_gb['learning_rate'],
    max_depth=best_params_gb['max_depth'],
    subsample=best_params_gb['subsample'],
    min_samples_split=best_params_gb['min_samples_split'],
    min_samples_leaf=best_params_gb['min_samples_leaf'],
    random_state=42
)
best_gb.fit(X_train, y_train)
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


[I 2024-06-21 18:26:52,751] A new study created in memory with name: no-name-920c92fa-471f-4ef2-8b14-683425e34220
[I 2024-06-21 18:26:53,392] Trial 0 finished with value: 53627.189204728056 and parameters: {'n_estimators': 86, 'learning_rate': 0.010283404763510704, 'max_depth': 3, 'subsample': 0.7103465807840714, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 53627.189204728056.
[I 2024-06-21 18:26:53,701] Trial 4 finished with value: 26379.38182324945 and parameters: {'n_estimators': 85, 'learning_rate': 0.062482629932233066, 'max_depth': 4, 'subsample': 0.9420362547938566, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 4 with value: 26379.38182324945.
[I 2024-06-21 18:26:53,992] Trial 1 finished with value: 39549.51694198498 and parameters: {'n_estimators': 103, 'learning_rate': 0.012996323811688896, 'max_depth': 5, 'subsample': 0.8054917375840991, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 4 with value: 26379.38182324945

Best Gradient Boosting RMSE: 24752.87


In [32]:
def objective_gb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 800)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

study_gb = optuna.create_study(direction='minimize')
study_gb.optimize(objective_gb, n_trials=50, n_jobs=-1)
best_params_gb = study_gb.best_params

best_gb = GradientBoostingRegressor(
    n_estimators=best_params_gb['n_estimators'],
    learning_rate=best_params_gb['learning_rate'],
    max_depth=best_params_gb['max_depth'],
    subsample=best_params_gb['subsample'],
    min_samples_split=best_params_gb['min_samples_split'],
    min_samples_leaf=best_params_gb['min_samples_leaf'],
    random_state=42
)
best_gb.fit(X_train, y_train)
gb_rmse = mean_squared_error(y_test, best_gb.predict(X_test), squared=False)
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")


[I 2024-06-21 18:27:16,956] A new study created in memory with name: no-name-97752b99-8102-40d8-9364-6811c3d8bbe6
[I 2024-06-21 18:27:19,255] Trial 6 finished with value: 36614.29148786407 and parameters: {'n_estimators': 173, 'learning_rate': 0.012033775551671087, 'max_depth': 3, 'subsample': 0.9465329227040771, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 6 with value: 36614.29148786407.
[I 2024-06-21 18:27:19,744] Trial 5 finished with value: 38874.869611721115 and parameters: {'n_estimators': 118, 'learning_rate': 0.010971519662208567, 'max_depth': 6, 'subsample': 0.9501847354626695, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 6 with value: 36614.29148786407.
[I 2024-06-21 18:27:19,850] Trial 1 finished with value: 37898.10687525293 and parameters: {'n_estimators': 80, 'learning_rate': 0.016969850997249004, 'max_depth': 10, 'subsample': 0.9963886760945527, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 6 with value: 36614.2914878640

Best Gradient Boosting RMSE: 24351.30


In [33]:
def objective_xgb(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    subsample = trial.suggest_uniform('subsample', 0.7, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.7, 1.0)
    
    model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective='reg:squarederror',
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    return rmse

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50, n_jobs=-1)
best_params_xgb = study_xgb.best_params

best_xgb = xgb.XGBRegressor(
    n_estimators=best_params_xgb['n_estimators'],
    learning_rate=best_params_xgb['learning_rate'],
    max_depth=best_params_xgb['max_depth'],
    subsample=best_params_xgb['subsample'],
    colsample_bytree=best_params_xgb['colsample_bytree'],
    objective='reg:squarederror',
    random_state=42
)
best_xgb.fit(X_train, y_train)
xgb_rmse = root_mean_squared_error(y_test, best_xgb.predict(X_test))
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


[I 2024-06-21 18:28:08,062] A new study created in memory with name: no-name-46b8c0a2-e549-472a-894c-0ecca74845a0
[I 2024-06-21 18:28:09,298] Trial 4 finished with value: 55820.5390625 and parameters: {'n_estimators': 61, 'learning_rate': 0.010668140802295433, 'max_depth': 9, 'subsample': 0.9658454569157986, 'colsample_bytree': 0.8682015181566904}. Best is trial 4 with value: 55820.5390625.
[I 2024-06-21 18:28:09,496] Trial 2 finished with value: 29567.28515625 and parameters: {'n_estimators': 72, 'learning_rate': 0.03531730952878587, 'max_depth': 9, 'subsample': 0.8978142119618417, 'colsample_bytree': 0.9697431010781125}. Best is trial 2 with value: 29567.28515625.
[I 2024-06-21 18:28:09,890] Trial 6 finished with value: 25203.140625 and parameters: {'n_estimators': 168, 'learning_rate': 0.03418601977043926, 'max_depth': 5, 'subsample': 0.8265267343501088, 'colsample_bytree': 0.8559118471989939}. Best is trial 6 with value: 25203.140625.
[I 2024-06-21 18:28:10,101] Trial 5 finished wi

Best XGBoost RMSE: 24118.86


In [34]:
print(f"Best Ridge Regression RMSE: {ridge_rmse:.2f}")
print(f"Best Random Forest RMSE: {rf_rmse:.2f}")
print(f"Best Gradient Boosting RMSE: {gb_rmse:.2f}")
print(f"Best XGBoost RMSE: {xgb_rmse:.2f}")


Best Ridge Regression RMSE: 28827.65
Best Random Forest RMSE: 28834.45
Best Gradient Boosting RMSE: 24351.30
Best XGBoost RMSE: 24118.86


In [35]:
uzf

NameError: name 'uzf' is not defined

***
### Anwenden Models

In [37]:
df_test_original = pd.read_csv(test_data_path)
# Prepare the test data for prediction
df_test_imputed = pd.get_dummies(df_test_imputed, drop_first=True)

# Ensure the test data has the same columns as the training data
missing_cols = set(X_train.columns) - set(df_test_imputed.columns)
for col in missing_cols:
    df_test_imputed[col] = 0
df_test_imputed = df_test_imputed[X_train.columns]

# Sicherstellen, dass 'Id' in den ursprünglichen Testdaten vorhanden ist
test_ids = df_test_original['Id'].values

In [38]:
# Vorhersagen mit dem XGBoost-Modell
X_test_final = df_test_imputed.astype(np.float32)
y_pred_xgb = xg_reg.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von XGBoost-Modell)
submission_xgb = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_xgb
})

# Exportieren als CSV
submission_xgb.to_csv('submission_xgb.csv', index=False)

print("Predictions saved to 'submission_xgb.csv'")

Predictions saved to 'submission_xgb.csv'


In [39]:
# Vorhersagen mit dem Ridge-Modell
X_test_final = df_test_imputed.astype(np.float32)
y_pred_ridge = best_ridge.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Ridge-Modell)
submission_ridge = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_ridge
})

# Exportieren als CSV
submission_ridge.to_csv('submission_ridge.csv', index=False)


In [40]:
# Vorhersagen mit dem Random Forest-Modell
y_pred_rf = best_rf.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Random Forest-Modell)
submission_rf = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_rf
})

# Exportieren als CSV
submission_rf.to_csv('submission_rf.csv', index=False)

In [41]:
# Vorhersagen mit dem Gradient Boosting-Modell
y_pred_gb = best_gb.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Gradient Boosting-Modell)
submission_gb = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_gb
})

# Exportieren als CSV
submission_gb.to_csv('submission_gb.csv', index=False)

In [42]:
# Vorhersagen mit dem XGBoost-Modell
y_pred_xgboost = best_xgb.predict(X_test_final)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von XGBoost-Modell)
submission_xgboost = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_xgboost
})

# Exportieren als CSV
submission_xgboost.to_csv('submission_xgboost.csv', index=False)

print("Predictions saved to 'submission_ridge.csv', 'submission_rf.csv', 'submission_gb.csv', and 'submission_xgboost.csv'")

Predictions saved to 'submission_ridge.csv', 'submission_rf.csv', 'submission_gb.csv', and 'submission_xgboost.csv'
