In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [8]:
# Datei öffnen und lesen
#with open('data_description.txt', 'r') as file:
#    content = file.read()

# Inhalt als Markdown anzeigen
#display(Markdown(content))

In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [10]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
missing_values_count = train.isnull().sum()
missing_values_percent = (train.isnull().sum() / len(train)) * 100

# Umwandeln des Ergebnisses in ein DataFrame
missing_data_df = pd.DataFrame({
    'Missing Values': missing_values_count,
    'Percent Missing': missing_values_percent
})

missing_data_df = missing_data_df[missing_data_df['Missing Values'] > 0]
missing_data_df

Unnamed: 0,Missing Values,Percent Missing
LotFrontage,259,17.739726
Alley,1369,93.767123
MasVnrType,872,59.726027
MasVnrArea,8,0.547945
BsmtQual,37,2.534247
BsmtCond,37,2.534247
BsmtExposure,38,2.60274
BsmtFinType1,37,2.534247
BsmtFinType2,38,2.60274
Electrical,1,0.068493


In [12]:
unique_values_count = train['BsmtQual'].value_counts()

# Ausgabe des Ergebnisses
print(unique_values_count)

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64


In [13]:
data = train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

for column in data.select_dtypes(include=['float64', 'int64']).columns:
    if data[column].isnull().mean() > 0:
        data[column] = data[column].fillna(data[column].mean())

for column in data.select_dtypes(include=['object']).columns:
    if data[column].isnull().mean() > 0:
        data[column] = data[column].fillna(data[column].mode()[0])

data = pd.get_dummies(data, drop_first=True)

missing_values_after = data.isnull().sum().sum()
print(f"Missing values after preprocessing: {missing_values_after}")

Missing values after preprocessing: 0


In [14]:
# MasVnrArea -> mean

In [23]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialisieren und Trainieren des XGBoost Regressors
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                          colsample_bytree=0.3,
                          learning_rate=0.1,
                          max_depth=5,
                          alpha=10,
                          n_estimators=100)
xg_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred = xg_reg.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"XGBoost RMSE: {rmse:.2f}")


XGBoost RMSE: 26120.58


In [22]:
from sklearn.ensemble import RandomForestRegressor

# Initialisieren und Trainieren des RandomForestRegressors
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# Vorhersagen und Berechnen des RMSE
y_pred_rf = rf_reg.predict(X_test)
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest RMSE: {rmse_rf:.2f}")


Random Forest RMSE: 28711.87


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Angenommen, 'data' ist Ihr DataFrame mit den Features und der Zielvariable 'SalePrice'
# Extrahieren der Merkmale und Zielvariable
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sicherstellen, dass alle Daten numerisch sind und in numpy-Arrays konvertieren
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

# Initialisieren des Keras-Modells
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Kompilieren des Modells
model.compile(optimizer='adam', loss='mean_squared_error')

# Trainieren des Modells
model.fit(X_train, y_train, epochs=2000, validation_split=0.2, verbose=1)

# Vorhersagen und Berechnen des RMSE
y_pred_nn = model.predict(X_test)
rmse_nn = root_mean_squared_error(y_test, y_pred_nn)
print(f"Keras NN RMSE: {rmse_nn:.2f}")


Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
E

### Anwenden Models

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Behandeln der fehlenden Werte im Testdatensatz
test_data = test.copy()

for column in test_data.select_dtypes(include=['float64', 'int64']).columns:
    if test_data[column].isnull().mean() > 0:
        test_data[column] = test_data[column].fillna(test_data[column].mean())

for column in test_data.select_dtypes(include=['object']).columns:
    if test_data[column].isnull().mean() > 0:
        test_data[column] = test_data[column].fillna('Missing')

test_data = pd.get_dummies(test_data, drop_first=True)

missing_cols = set(X_train.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[X_train.columns]

X_test_final = test_data.astype(np.float32).values

# Vorhersagen mit dem XGBoost-Modell
y_pred_xgboost = xg_reg.predict(X_test_final)

# Vorhersagen mit dem finetuned Keras-Modell
y_pred_nn_final = model_finetune.predict(X_test_final)

# Sicherstellen, dass 'Id' in den ursprünglichen Testdaten vorhanden ist
test_ids = test['Id'].values

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von XGBoost-Modell)
submission_xgboost = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_xgboost
})

# Exportieren als CSV
submission_xgboost.to_csv('submission_xgboost.csv', index=False)

# Erstellen des DataFrames mit 'Id' und 'SalePrice' (von Keras NN-Modell)
submission_nn = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': y_pred_nn_final.flatten()  # Ensure the output is a flat array
})

# Exportieren als CSV
submission_nn.to_csv('submission_nn.csv', index=False)
