# HUK Coding Challenge

Die Aufgabe besteht in der Modellierung einer Kundenaffinität zum Abschluss einer KFZ-Versicherung.

## Loading Data

In [None]:
import os
from datetime import datetime
import logging
import csv
import pickle

import pandas as pd
from pandas_profiling import ProfileReport
from functools import reduce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Configure logging once for the whole notebook: INFO level and a
# "timestamp message" line format. All cells log through the root logger.
logging.basicConfig(
    format='%(asctime)-15s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

In [None]:
def detect_delimiter(filename):
    """
    Guess the field delimiter of a delimited text file.

    Only the first 1024 characters of the file are inspected; the guess
    itself is delegated to ``csv.Sniffer``.

    Inputs:
        - filename (str): path to specific file
    Returns:
        - delimiter (str): the delimiter character reported by the sniffer
    Raises:
        - csv.Error: if the sniffer cannot determine a delimiter
    """
    sniffer = csv.Sniffer()
    # newline='' hands the raw line endings to the csv module untouched
    with open(filename, 'r', newline='') as handle:
        sample = handle.read(1024)
    return sniffer.sniff(sample).delimiter

In [None]:
# Load every file from the input folder(s), merge them on the shared 'id'
# column and store the merged result as the raw data set.
input_folder_path = 'data/input_data/'
# Input folders are resolved relative to the parent of the working directory.
directories = [os.path.join(os.getcwd(), '..', input_folder_path)]
file_list = []
dataframes = []

# search in all specified directories
for directory in directories:
    # Each entry in `directories` is already a full path, so it is used as is.
    file_names = os.listdir(directory)
    logger.info(f'Found files: {file_names}')
    for each_file_name in file_names:
        file_path = os.path.join(directory, each_file_name)
        # Sub-directories cannot be parsed as CSV, so skip them.
        if not os.path.isfile(file_path):
            continue
        file_list.append(each_file_name)
        # Not all input files use the same delimiter: read each file with the
        # delimiter that was actually detected instead of assuming ';'.
        delimiter = detect_delimiter(file_path)
        current_df = pd.read_csv(file_path, delimiter=delimiter)
        dataframes.append(current_df)

# Fail with a clear message instead of an opaque reduce() TypeError
if not dataframes:
    raise FileNotFoundError(f'No input files found in: {directories}')

# Merge all dataframes into one (inner join on the shared 'id' column)
df_merged = reduce(lambda left, right: pd.merge(left, right, on='id'), dataframes)

# Deduplicate
df_merged = df_merged.drop_duplicates()

# Make sure the output folder exists; exist_ok avoids the check-then-create race
os.makedirs('../data/raw_data/', exist_ok=True)

# save merged dataframe as csv
df_merged.to_csv('../data/raw_data/raw_data.csv')

## Explorative Datenanalyse (EDA)

Machen Sie sich mit dem Datensatz vertraut. Identifizieren Sie dabei mögliche Probleme sowie grundlegende statistische Zusammenhänge, welche für die anschließende Modellierung wichtig sein könnten.

In [None]:
# Explore an independent copy so that df_merged itself is not modified
df_raw = df_merged.copy()

In [None]:
# Build the profiling report for the raw, merged data
profile = ProfileReport(df_raw, title='Pandas Profiling Report on raw data')
# Write the report to ../eda_output.html; open that file to view the results
profile.to_file("../eda_output.html")

### Cleaning Data

- Number of variables:      12
- Number of observations:   381109
- Missing cells:            0
- Missing cells %:          0%
- Duplicate rows:           0
- Categorical:              2
- Numeric:                  8
- Boolean:                  1
- Variables:
    - Fahrerlaubnis:                Highly imbalanced (97.8%)
    - Vertriebskanal > Alter:       High correlation
    - Vorversicherung > Vorschaden: High correlation
    - Alter_Fzg > Vertriebskanal:   High correlation
    - id:                           uniformly distributed & unique values

In [None]:
# Print a summary of the raw data frame (columns, dtypes, non-null counts)
df_raw.info()

In [None]:
# Make sure the output folder exists; exist_ok avoids the check-then-create race
os.makedirs('../data/cleaned_data/', exist_ok=True)

# No cleaning step is applied in this cell: the raw frame is written out
# unchanged as the "cleaned" data set and copied for the following steps.
df_raw.to_csv('../data/cleaned_data/cleaned_data.csv')
df_cleaned = df_raw.copy()

## Feature Engineering

Bereiten Sie, soweit für Ihre Modellierung nötig, die Variablen geeignet auf.

In [None]:
# One-hot encode the categorical columns, pass every other column through
# unchanged and bring the result back into a labelled DataFrame with
# 'id' as first and the target 'Interesse' as last column.

# All column names of the un-encoded data, in their original order
column_names = df_cleaned.columns

# Get categorical columns
categorical_columns = df_cleaned.select_dtypes(include="object").columns

# Get numerical columns
numerical_columns = df_cleaned.select_dtypes(include="number").columns

# Create pipeline for categorical columns
categorical_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for numerical columns with StandardScaler.
# It is defined here but not wired into the transformer below, so numerical
# columns are passed through unscaled.
numerical_pipeline = Pipeline([
    ('standard_scaler', StandardScaler())
])

# Create transformer for all columns.
# sparse_threshold=0 forces a dense result, which pd.DataFrame needs below.
preprocessor = ColumnTransformer([
    ('categorical_pipeline', categorical_pipeline, categorical_columns),
], remainder='passthrough', sparse_threshold=0)

# Fit and transform data
encoded_values = preprocessor.fit_transform(df_cleaned)

# The output holds the one-hot columns first, followed by the passed-through
# columns in their original order. Deriving the labels from the data (instead
# of a fixed list) keeps them correct if the input column order or the number
# of categories changes.
categorical_set = set(categorical_columns)
remainder_columns = [name for name in column_names if name not in categorical_set]
n_one_hot_columns = encoded_values.shape[1] - len(remainder_columns)
lag_columns = [f'lag_{position}' for position in range(1, n_one_hot_columns + 1)]

# Convert to dataframe
df_cleaned = pd.DataFrame(encoded_values, columns=lag_columns + remainder_columns)

# Put id column as first column
# and Interesse as last column
feature_columns = [name for name in df_cleaned.columns if name not in ('id', 'Interesse')]
df_cleaned = df_cleaned[['id'] + feature_columns + ['Interesse']]

# change unnecessary floats to int
float_columns = ['Fahrerlaubnis', 'Regional_Code', 'Vorversicherung', 'Vertriebskanal', 'Kundentreue', 'Alter', 'Interesse']
df_cleaned[float_columns] = df_cleaned[float_columns].astype('int64')

# Make sure the output folder exists; exist_ok avoids the check-then-create race
os.makedirs('../data/encoded_data/', exist_ok=True)

# save encoded dataframe as csv
df_cleaned.to_csv('../data/encoded_data/encoded_data.csv')
df_encoded = df_cleaned.copy()

In [None]:
# Preview the first rows of the encoded data
df_encoded.head()

## Modellvergleich

Entscheiden Sie sich für ein geeignetes Modell zur Prognose der Kundenaffinität. Erläutern Sie wie Sie dabei vorgehen und begründen Sie Ihre Entscheidung.

Um die Kundenaffinität zum Abschluss einer Kfz-Versicherung vorherzusagen, empfehle ich die Verwendung eines binären Klassifikationsmodells, wie beispielsweise das logistische Regressionsmodell oder ein Random Forest Classifier. Beide Modelle eignen sich gut für diese Art von Prognoseaufgaben.

Logistische Regression:
Die logistische Regression ist eine weit verbreitete Methode zur Vorhersage von binären Ergebnissen. Sie modelliert die Wahrscheinlichkeit, dass eine Beobachtung einer bestimmten Klasse angehört, basierend auf einer Kombination von Eingangsvariablen. In diesem Fall könnten Merkmale wie Alter, Geschlecht, Fahrzeugtyp, Vorversicherungshistorie, Schadensfreiheitsklasse, geografischer Standort usw. als Eingangsvariablen dienen. Das Modell kann dann die Wahrscheinlichkeit schätzen, dass ein Kunde affin oder nicht affin ist und eine Entscheidungsgrenze festlegen, um die Vorhersage zu treffen.

Random Forest Classifier:
Ein Random Forest Classifier ist ein Ensemble-Modell, das aus mehreren Entscheidungsbäumen besteht. Jeder Baum wird auf einem zufälligen Teil des Datensatzes trainiert, und die Vorhersage erfolgt durch Abstimmung der Vorhersagen der einzelnen Bäume. Random Forests sind in der Regel robust gegenüber Overfitting und können gut mit einer Mischung aus kategorischen und numerischen Variablen umgehen. Sie können auch die wichtigsten Merkmale identifizieren, die zur Vorhersage beitragen.

Bei der Auswahl des Modells sind folgende Faktoren zu berücksichtigen:

Datenverfügbarkeit: Verfügbarkeit der Daten überprüfen. Sicherstellen, dass ausreichend Daten vorhanden sind, um ein zuverlässiges Modell zu trainieren, und dass die relevanten Merkmale erfasst werden.

Interpretierbarkeit: Wenn es wichtig ist, die Vorhersage des Modells zu verstehen und zu erklären, könnte die logistische Regression die bessere Wahl sein. Die Koeffizienten des Modells können direkt interpretiert werden, um den Einfluss der einzelnen Merkmale zu verstehen.

Leistung: Gründliche Evaluation der Modelle durchführen, indem geeignete Leistungsmetriken wie Genauigkeit, Präzision, Recall oder die Fläche unter der ROC-Kurve (AUC-ROC) verwendet werden. Das Modell mit den besten Vorhersageergebnissen für die spezifische Anwendung auswählen.

Dateninterpretation: Wenn es interessant ist zu verstehen, welche Merkmale am stärksten zur Vorhersage beitragen, könnte der Random Forest Classifier von Vorteil sein. Er kann die wichtigsten Merkmale identifizieren und Einblicke in die Beziehung zwischen den Merkmalen und der Zielvariable liefern.

Es ist grundsätzlich sinnvoll, im Prozess verschiedene Modelle miteinander zu vergleichen; diese Coding Challenge wird allerdings mit einem Random Forest Classifier umgesetzt.

## Modellbuilding

1. Trainieren Sie das von Ihnen gewählte Modell. Wählen Sie geeignete Metriken um die Güte des finalen Modells zu beurteilen.
2. Zeigen Sie, welche Variablen und Zusammenhänge für Ihr finales Modell relevant sind.
3. Überlegen Sie sich (ohne Umsetzung) wie Sie Ihr Modell weiter optimieren können.

In [None]:
# Train a random forest classifier with default hyperparameters
# and log its performance on the training data.

# 'id' is a unique row identifier and carries no information about the
# customer, so it is excluded from the features together with the target.
features = df_encoded.drop(columns=['id', 'Interesse'])
target = df_encoded['Interesse']

# Split data into train and test set (30 % held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on train set
y_train_pred = model.predict(X_train)

# Evaluate model on train set. These scores only show how well the model fits
# the data it was trained on; the held-out scores are computed separately.
logger.info(f'Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
logger.info(f'Train Precision: {precision_score(y_train, y_train_pred)}')
logger.info(f'Train Recall: {recall_score(y_train, y_train_pred)}')
logger.info(f'Train F1: {f1_score(y_train, y_train_pred)}')

### Test the models performance on test data

In [None]:
# Predict on the held-out test set
y_pred = model.predict(X_test)

# Evaluate model on data it has not seen during training
logger.info(f'Test Accuracy: {accuracy_score(y_test, y_pred)}')
logger.info(f'Test Precision: {precision_score(y_test, y_pred)}')
logger.info(f'Test Recall: {recall_score(y_test, y_pred)}')
logger.info(f'Test F1: {f1_score(y_test, y_pred)}')

Accuracy Score:
Accuracy_score calculates the percentage of correct predictions made by the model out of all the predictions. It is calculated by dividing the number of correct predictions by the total number of predictions. A high accuracy score indicates a high overall predictive performance of the model. However, accuracy alone might not be sufficient if the dataset is imbalanced.

Precision Score:
Precision_score measures the proportion of correctly predicted positive instances out of all instances predicted as positive. It is calculated by dividing the number of true positives by the sum of true positives and false positives. Precision focuses on the quality of positive predictions. A high precision score indicates a low false positive rate, meaning that when the model predicts a positive class, it is likely to be correct.

Recall Score:
Recall_score, also known as sensitivity or true positive rate, measures the proportion of correctly predicted positive instances out of all actual positive instances. It is calculated by dividing the number of true positives by the sum of true positives and false negatives. Recall focuses on the model's ability to find all positive instances without missing any. A high recall score indicates a low false negative rate, meaning that the model can correctly identify a large proportion of positive instances.

F1 Score:
The F1_score is the harmonic mean of precision and recall. It provides a balance between precision and recall and is useful when you want to consider both false positives and false negatives. It is calculated as 2 * ((precision * recall) / (precision + recall)). The F1 score is a single metric that combines precision and recall. A high F1 score indicates good overall performance when considering both precision and recall.

In summary, while accuracy_score provides an overall performance measure, precision_score, recall_score, and f1_score offer insights into different aspects of the model's performance. Consider your specific requirements and the nature of your problem to determine which metric(s) are most important for your evaluation.

In [None]:
# Rebuild a readable view of the test set: decode the one-hot columns back
# into the original categorical values and attach actuals and predictions.

# Work on an explicit copy: X_test comes out of train_test_split, and adding
# columns to such a frame in place can trigger pandas' SettingWithCopyWarning.
X_test = X_test.copy()

# One-hot columns carry the 'lag_' prefix
lag_columns = [name for name in X_test.columns if name.startswith('lag_')]

# Inverse Transform to get original values
X_test_inv = preprocessor.named_transformers_['categorical_pipeline'].inverse_transform(X_test[lag_columns])

# Add the decoded values to X_test; the decoded columns come back in the
# order of the categorical columns the encoder was fitted on
for position, column_name in enumerate(categorical_columns):
    X_test[column_name] = X_test_inv[:, position]

# Drop lag columns
X_test = X_test.drop(lag_columns, axis=1)

# Add actual values from y_test to dataframe
X_test['Interesse'] = y_test

# Add test predictions to dataframe
X_test['Preds'] = y_pred

X_test.head()

In [None]:
# Make sure the model folder exists; exist_ok avoids the check-then-create race
os.makedirs('../models/', exist_ok=True)

# Save model with timestamp so that earlier models are not overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

with open(f'../models/model_{timestamp}.pkl', 'wb') as file:
    pickle.dump(model, file)

## Possible strategies to optimize the Random Forest Model

#### __Feature Selection:__

Evaluate the importance of each feature in your model and consider removing less informative or highly correlated features. This can be done by examining the feature importances provided by the random forest model.

In [None]:
# Rank the features by the importance the fitted forest assigns to them,
# most important first
feature_importances = (
    pd.DataFrame({'importance': model.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

#### __Hyperparameter Tuning:__ 

Random forests have various hyperparameters that can be tuned to improve performance and reduce overfitting. Some key hyperparameters include the number of trees (n_estimators), the maximum depth of the trees (max_depth), and the minimum number of samples required to split an internal node (min_samples_split). Using techniques like grid search or randomized search, you can find optimal hyperparameter values that balance model complexity and performance.

#### __Regularization:__ 

Random forests have regularization techniques like subsampling (using a subset of samples for training each tree) and feature subspace sampling (using a subset of features for each split). These techniques can help reduce overfitting by introducing additional randomness into the model.

__Subsampling (Bootstrap Aggregating or Bagging):__
In a random forest, each decision tree is trained on a different bootstrap sample obtained by randomly selecting a subset of the original training data with replacement. This means that each tree is trained on a different subset of the data, allowing them to capture different patterns. By aggregating the predictions of all trees, the random forest reduces the variance and prevents overfitting. Subsampling introduces additional randomness into the model, improving its generalization.

__Feature Subspace Sampling:__
For each split in a decision tree within a random forest, only a random subset of features is considered. Instead of evaluating all features at each split, a limited set of features is randomly selected. This technique is also called feature bagging or random subspace method. By using only a subset of features for each tree, the random forest encourages diversity among the trees and reduces the correlation between them. This helps prevent overfitting and improves the robustness of the model.

**Regularization techniques like subsampling and feature subspace sampling are inherent in the random forest algorithm and do not require explicit user intervention. However, you can adjust the hyperparameters max_features and max_samples to fine-tune the regularization strength and control the trade-off between model complexity and performance.**


#### __Cross-Validation:__ 
Instead of evaluating the model on a single train-test split, perform cross-validation to assess its performance on multiple train-test splits. This helps provide a more robust estimate of the model's performance and can indicate if it is overfitting.

The advantage of using cross-validation is that it provides a more robust estimate of the model's performance compared to a single train-test split. It helps assess the model's ability to generalize to unseen data and reduces the influence of the specific data split on the evaluation.
To perform cross-validation in scikit-learn, you can use the cross_val_score function or the cross_validate function, specifying the number of folds (cv parameter) and the desired evaluation metric. These functions handle the data splitting and model evaluation automatically, making it easier to perform cross-validation with random forests.

#### __Increase Training Data:__ 

If possible, obtain more training data to provide a broader representation of the underlying patterns. A larger dataset can help the model generalize better and reduce overfitting.

#### __Early Stopping:__ 

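Monitor the model's performance on a validation set during training and stop training early if the performance starts to degrade. This prevents the model from overfitting to the training data excessively. Note that a random forest is not trained iteratively like gradient boosting or a neural network, so early stopping only applies in an adapted form: for example, grow the forest incrementally (`warm_start=True`) and stop adding trees once the validation or out-of-bag score no longer improves.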

In [None]:
# Tune a random forest classifier with an exhaustive grid search,
# evaluate the best estimator on a held-out test set
# and persist model, preprocessor, metrics and predictions.

from sklearn.model_selection import GridSearchCV

# Start from the un-encoded data. `df_cleaned` has already been replaced by
# its one-hot encoded version in the feature-engineering cell; encoding it a
# second time would find no categorical columns, and a fixed list of column
# labels would then be attached to the wrong columns.
df_unencoded = df_raw.copy()

# Get categorical columns
categorical_columns = df_unencoded.select_dtypes(include="object").columns

# Get numerical columns
numerical_columns = df_unencoded.select_dtypes(include="number").columns

# Create pipeline for categorical columns
categorical_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for numerical columns with StandardScaler.
# It is defined here but not wired into the transformer below, so numerical
# columns are passed through unscaled.
numerical_pipeline = Pipeline([
    ('standard_scaler', StandardScaler())
])

# Create transformer for all columns.
# sparse_threshold=0 forces a dense result, which pd.DataFrame needs below.
preprocessor = ColumnTransformer([
    ('categorical_pipeline', categorical_pipeline, categorical_columns),
], remainder='passthrough', sparse_threshold=0)

# Fit and transform data
encoded_values = preprocessor.fit_transform(df_unencoded)

# The output holds the one-hot columns first, followed by the passed-through
# columns in their original order; derive the labels from the data.
categorical_set = set(categorical_columns)
remainder_columns = [name for name in df_unencoded.columns if name not in categorical_set]
n_one_hot_columns = encoded_values.shape[1] - len(remainder_columns)
lag_columns = [f'lag_{position}' for position in range(1, n_one_hot_columns + 1)]

# Convert to dataframe
df_cleaned = pd.DataFrame(encoded_values, columns=lag_columns + remainder_columns)

# Put id column as first column
# and Interesse as last column
feature_columns = [name for name in df_cleaned.columns if name not in ('id', 'Interesse')]
df_cleaned = df_cleaned[['id'] + feature_columns + ['Interesse']]

# change unnecessary floats to int (including the target, so that labels and
# predictions are integers as in the feature-engineering cell)
float_columns = ['Fahrerlaubnis', 'Regional_Code', 'Vorversicherung', 'Vertriebskanal', 'Kundentreue', 'Alter', 'Interesse']
df_cleaned[float_columns] = df_cleaned[float_columns].astype('int64')

df_encoded = df_cleaned.copy()

# Split data into train and test.
# 'id' is a unique row identifier and carries no information about the
# customer, so it is excluded from the features together with the target.
X = df_encoded.drop(columns=['id', 'Interesse'])
y = df_encoded['Interesse']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the parameter grid based on the results of random search
# NOTE(review): this grid spans 5 * 9 * 5 * 5 * 5 = 5625 combinations, i.e.
# 16875 fits with cv=3 - consider a smaller grid or a randomized search.
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40, 50],
    'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [3, 4, 5, 6, 7],
    'min_samples_split': [8, 10, 12, 14, 16],
    'n_estimators': [100, 200, 300, 400, 500]
}

# Create a base model; the fixed seed keeps the search reproducible
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score - confirm this is the intended selection
# criterion for the imbalanced target.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Report best parameters (a bare expression in the middle of a cell is not displayed)
print(f'Best parameters: {grid_search.best_params_}')

# Get best estimator
best_grid = grid_search.best_estimator_

# Get predictions
y_pred = best_grid.predict(X_test)

# Get metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

# Make sure the model folder exists; exist_ok avoids the check-then-create race
os.makedirs('../models/', exist_ok=True)

metrics = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}
predictions = {'y_pred': y_pred}

# Save model, preprocessor, metrics and predictions. The context manager
# closes each file handle, which a bare open() inside pickle.dump() does not.
artifacts = {
    '../models/model.pkl': best_grid,
    '../models/preprocessor.pkl': preprocessor,
    '../models/metrics.pkl': metrics,
    '../models/predictions.pkl': predictions,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)