# Task for Today  

***

## Apartment Type Prediction  
  
Given *data about home rentals in Germany*, let's try to predict whether a given home is **an apartment**.  
  
We will use a TensorFlow/Keras neural network to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_columns' pattern can match more than one
# option key in recent pandas releases, which raises an OptionError.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the rental-offers CSV into a DataFrame; the relative path assumes the
# dataset is available under ../input/.
data = pd.read_csv('../input/apartment-rental-offers-in-germany/immo_data.csv')

In [None]:
data

In [None]:
data.info()

# Helper Functions

In [None]:
def onehot_encode(df, column):
    """Return a copy of ``df`` with ``column`` replaced by indicator columns.

    One indicator column is created per distinct value of ``column``; each is
    named ``<column>_<value>`` and appended after the remaining original
    columns. The input frame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    # drop() returns a new frame, so the caller's DataFrame is never mutated.
    other_columns = df.drop(column, axis=1)
    return pd.concat([other_columns, indicator_columns], axis=1)

In [None]:
def preprocess_inputs(df):
    """Turn the raw listings frame into standardized train/test sets.

    The input frame is not modified. Processing order:

    1. Drop the listed high-cardinality columns and every column with more
       than 25% missing values.
    2. Drop rows without a ``typeOfFlat`` value and derive the binary label
       ``isApartment`` (1 when ``typeOfFlat == 'apartment'``, otherwise 0).
    3. Fill remaining gaps: column mean for non-object columns, the literal
       string ``"missing"`` for object columns.
    4. Cast boolean columns to integers, split ``date`` (parsed with the
       ``'%b%y'`` format) into ``year`` and ``month``, and one-hot encode
       every remaining object column.
    5. Split 70/30 (shuffled, ``random_state=1``) and standardize the
       features with a scaler fitted on the training portion only.

    Args:
        df: Raw listings DataFrame containing the columns referenced below
            (``typeOfFlat``, ``date`` and the dropped text columns).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` values are
        DataFrames of standardized features with a fresh default index; the
        ``y_*`` values are Series that keep the row labels produced by the
        split.
    """
    df = df.copy()

    # Drop high-cardinality categorical columns
    df = df.drop(['houseNumber', 'street', 'streetPlain', 'regio3', 'description', 'facilities'], axis=1)

    # Drop columns with more than 25% missing values
    missing_value_columns = df.loc[:, df.isna().mean() > 0.25].columns
    df = df.drop(missing_value_columns, axis=1)

    # Drop examples with missing label values
    df = df.loc[df['typeOfFlat'].notna(), :].reset_index(drop=True)

    # Construct label column (vectorized comparison instead of a row-wise lambda)
    df['isApartment'] = (df['typeOfFlat'] == 'apartment').astype(int)
    df = df.drop('typeOfFlat', axis=1)

    # Get columns with remaining missing values, split by dtype
    remaining_na_columns = df.loc[:, df.isna().sum() > 0]
    categorical_na_columns = remaining_na_columns.select_dtypes('object').columns
    numeric_na_columns = remaining_na_columns.drop(categorical_na_columns, axis=1).columns

    # Fill numeric missing values with column mean
    for column in numeric_na_columns:
        df[column] = df[column].fillna(df[column].mean())

    # Fill categorical missing values with "missing"
    for column in categorical_na_columns:
        df[column] = df[column].fillna("missing")

    # Convert boolean columns to int columns. The builtin ``int`` is used
    # because the ``np.int`` alias was removed in NumPy 1.24.
    for column in df.select_dtypes('bool').columns:
        df[column] = df[column].astype(int)

    # Extract date features with the vectorized .dt accessor
    dates = pd.to_datetime(df['date'], format='%b%y')
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df = df.drop('date', axis=1)

    # One-hot encode every remaining object column
    for column in df.select_dtypes('object').columns:
        df = onehot_encode(df, column)

    # Split df into X and y
    y = df['isApartment']
    X = df.drop('isApartment', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X; the scaler is fitted on the training rows only so that no
    # test-set statistics leak into the transformation.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print test metrics and show a confusion matrix for a binary classifier.

    Args:
        model: Object exposing ``evaluate`` and ``predict``. ``evaluate`` is
            expected to return ``[loss, accuracy, auc]`` in that order, and
            ``predict`` one score per row, thresholded here at 0.5.
        X_test: Feature matrix passed to ``evaluate`` and ``predict``.
        y_test: True binary labels (0 = not apartment, 1 = apartment).

    Returns:
        None. Output is printed text and a matplotlib figure.
    """
    class_names = ["NOT APARTMENT", "APARTMENT"]

    results = model.evaluate(X_test, y_test, verbose=0)
    print("    Test Loss: {:.4f}".format(results[0]))
    print("Test Accuracy: {:.2f}%".format(results[1] * 100))
    print("     Test AUC: {:.4f}".format(results[2]))

    # Threshold the scores into hard 0/1 predictions. The builtin ``int``
    # replaces the ``np.int`` alias (removed in NumPy 1.24), and ravel() keeps
    # the result one-dimensional even when there is a single test row.
    y_pred = (np.asarray(model.predict(X_test)) >= 0.5).astype(int).ravel()

    # Pin the label set so the matrix is always 2x2 and the class names line
    # up, even if one class is absent from y_test or y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    clr = classification_report(y_test, y_pred, labels=[0, 1], target_names=class_names)

    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
    # Heatmap cells are centered at half-integer positions.
    plt.xticks(np.arange(2) + 0.5, class_names)
    plt.yticks(np.arange(2) + 0.5, class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", clr)

# Preprocessing

In [None]:
# Run the preprocessing pipeline: yields the scaled train/test feature frames
# and their matching label series.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
X_train

In [None]:
y_train

# Training

In [None]:
# Fully connected binary classifier: two 128-unit ReLU hidden layers followed
# by a single sigmoid output unit. The input width follows the feature count.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

In [None]:
# Optimize binary cross-entropy with Adam, tracking accuracy and AUC.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

# Train for at most 100 epochs with 20% of the training data held out for
# validation. Training stops once val_loss has not improved for 5 epochs, and
# the weights from the best epoch are restored.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            restore_best_weights=True,
            patience=5,
            monitor='val_loss',
        ),
    ],
)

In [None]:
# One x-axis position per completed training epoch.
epochs_range = range(len(history.history['loss']))

# Plot training and validation loss on the same axes to compare them.
plt.figure(figsize=(16, 10))
plt.plot(epochs_range, history.history['loss'], label="Training Loss")
plt.plot(epochs_range, history.history['val_loss'], label="Validation Loss")
plt.gca().set(
    xlabel="Epoch",
    ylabel="Loss",
    title="Training and Validation Loss Over Time",
)
plt.legend()
plt.show()

# Results

In [None]:
# Print test loss, accuracy and AUC, then show the confusion matrix and the
# classification report for the held-out test split.
evaluate_model(model, X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/bX6A5S7V_e4