In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline

from sklearn.compose import make_column_transformer

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

# Introduction

**Dataset** : House Prices (https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

**Objectif** : Entraîner et améliorer un modèle de machine learning afin de prédire le prix de vente des maisons de la ville d'Ames (Iowa, États-Unis).

**Modèle à tester** : DecisionTreeRegressor - régression à l'aide d'un arbre de décision

# Dataclean / Dataviz

## Vue d'ensemble

In [None]:
# Read the house-prices CSV into a frame and display its (rows, columns) shape
data_path = '../input/decision-tree-regressor-house-prices/house_prices.csv'
df = pd.read_csv(data_path)
df.shape

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Summary of the frame: column dtypes and non-null counts
df.info()

In [None]:
# Keep a hand-picked subset of predictors together with the target column
selected_columns = [
    'LotArea', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', 'PoolArea',
    'Neighborhood', 'HouseStyle', 'OverallQual', 'SalePrice',
]
df = df[selected_columns]
# Number of missing values per retained column
df.isna().sum()

In [None]:
# Descriptive statistics of the retained columns
df.describe()

## Dataviz

In [None]:
# Correlation matrix - heatmap
# Restrict to numeric columns first: the frame still holds non-numeric columns
# (e.g. Neighborhood), which DataFrame.corr() refuses on pandas >= 2.0.
corr = df.select_dtypes(include=np.number).corr()
# Hide the upper triangle (diagonal included): the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(10, 8))
# Draw on the explicit axes rather than relying on the implicit "current axes"
sns.heatmap(corr, mask=mask, vmin=-1, cmap='RdYlBu_r', vmax=1, square=True,
            cbar_kws={"shrink": .75}, annot=True, ax=ax).set_title('Matrice de corrélation', fontsize=20);

In [None]:
# Visualisation of numeric variables - histograms and boxplots
def plot_variables(df, variables_list, figsize=(14, 14)):
    """Draw a histogram (with KDE) and a boxplot side by side for each variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    variables_list : sequence of str
        Column names of ``df``; one figure row is drawn per name.
    figsize : tuple of float, default (14, 14)
        Overall figure size forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``variables_list`` is empty.
    """
    if len(variables_list) == 0:
        raise ValueError("variables_list must contain at least one column name")
    sns.set()
    sns.set_palette("Paired")
    # squeeze=False keeps `axes` two-dimensional even with a single variable,
    # so the axes[i, 0] / axes[i, 1] indexing below is always valid.
    fig, axes = plt.subplots(nrows=len(variables_list), ncols=2, figsize=figsize,
                             constrained_layout=True, squeeze=False)
    for i, variable in enumerate(variables_list):
        sns.histplot(data=df, x=variable, kde=True, ax=axes[i, 0])
        sns.boxplot(data=df, x=variable, ax=axes[i, 1])
    fig.suptitle('Histogrammes et boxplots des variables numériques', fontsize=20)
    plt.show()

# Columns to visualise: two predictors and the target
variables_list = ['GrLivArea', 'GarageArea', 'SalePrice']
plot_variables(df, variables_list)

## Outliers 

In [None]:
# Keep only the numeric columns
df_numerical_features = df.select_dtypes(include=np.number)

In [None]:
# Outlier detection
def outlier(x):
    """Return the values of ``x`` lying outside the 1.5 * IQR fences.

    A value is an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles.

    Parameters
    ----------
    x : pandas.Series, numpy.ndarray or sequence of numbers
        Values to inspect. Plain sequences are converted to an ndarray.

    Returns
    -------
    Same container type as ``x`` (ndarray for plain sequences), holding only
    the outlying values. Missing values are never reported as outliers.
    """
    # Boolean-mask indexing below needs an array-like, not a plain list/tuple
    if not isinstance(x, (pd.Series, np.ndarray)):
        x = np.asarray(x)

    # nanpercentile ignores missing values; np.percentile would return NaN
    # fences as soon as one NaN is present and silently report no outliers.
    q1, q3 = np.nanpercentile(x, [25, 75])

    iqr = q3 - q1

    min_range = q1 - iqr * 1.5
    max_range = q3 + iqr * 1.5

    return x[(x < min_range) | (x > max_range)]

In [None]:
# Print the share (%) of outliers found in each numeric column
# Denominator taken from the data itself instead of a hard-coded row count,
# so the percentages stay correct if the dataset changes.
n_rows = len(df_numerical_features)
for col in df_numerical_features.columns:
    outliers = outlier(df_numerical_features[col])
    if len(outliers):
        print(f"* La colonne {col} à {(outliers.count()/n_rows*100).round(3)}% d'outliers")
    else:
        print(f"* {col} n'a pas d'outliers.")

# Mise en place du modèle

## Entrainement avec variables numériques

In [None]:
# Predictors limited to two numeric columns; target kept as a one-column frame
numeric_predictors = ['GrLivArea', 'GarageArea']
X = df[numeric_predictors]
y = df[['SalePrice']]

# Hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
for label, subset in (('Train set:', X_train), ('Test set:', X_test)):
    print(label, subset.shape)

### Pipeline simple, entrainement et ajustements avec RandomizedSearchCV()

In [None]:
# Baseline pipeline: robust scaling followed by a depth-3 regression tree
scaler = RobustScaler()
shallow_tree = DecisionTreeRegressor(max_depth=3, random_state=42)
model = make_pipeline(scaler, shallow_tree)

model.fit(X_train, y_train)
# Mean of the 5-fold cross-validation scores on the training set
baseline_cv_scores = cross_val_score(model, X_train, y_train, cv=5)
baseline_cv_scores.mean()

In [None]:
# Search space; keys follow make_pipeline's "<step name>__<parameter>" convention
params = {
    'robustscaler__with_centering': [True, False],  # default = True
    'robustscaler__with_scaling': [True, False],  # default = True
    'robustscaler__unit_variance': [True, False],  # default = False
    # 'mse' / 'mae' were renamed 'squared_error' / 'absolute_error' in
    # scikit-learn 1.0 and the old spellings were removed in 1.2.
    'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],  # default = 'squared_error'
    'decisiontreeregressor__max_depth': np.arange(1, 5, 1),  # default = None
    'decisiontreeregressor__min_samples_split': np.arange(2, 31, 1),  # default = 2
    'decisiontreeregressor__min_samples_leaf': np.arange(2, 31, 1),  # default = 1
    # None (all features) replaces 'auto', which meant the same for a
    # regression tree and was removed in scikit-learn 1.3.
    'decisiontreeregressor__max_features': [None, 'sqrt', 'log2'],  # default = None
    'decisiontreeregressor__max_leaf_nodes': np.arange(2, 11, 1),  # default = None
    'decisiontreeregressor__min_impurity_decrease': np.arange(1, 31, 1),  # default = 0.0
    'decisiontreeregressor__ccp_alpha': np.arange(1, 11, 1)  # default = 0.0
    }

# random_state makes the sampled candidates, and therefore best_params_,
# reproducible from one run to the next.
grid = RandomizedSearchCV(model, params, cv=3, n_iter=1000, random_state=42)
grid.fit(X_train, y_train)
grid.best_params_

### Premiers résultats et visualisation de l'arbre de décision

In [None]:
# Keep the best pipeline found by the search and display its cross-validation score
model_1 = grid.best_estimator_
grid.best_score_

In [None]:
# Predict on the held-out test set and display the pipeline's score on it
prediction = model_1.predict(X_test)
model_1.score(X_test, y_test)

In [None]:
# Draw the regression tree held by the tuned pipeline.
# Note: the tree was fitted after the scaler step, so the split thresholds
# shown are expressed in the scaler's output units.
fig, ax = plt.subplots(figsize=(50, 25))
tree.plot_tree(model_1['decisiontreeregressor'],
               # Plain list: some scikit-learn releases reject a pandas Index here.
               feature_names=list(X.columns),
               # class_names is dropped: it only applies to classifiers.
               fontsize=20,
               rounded=True,
               filled=True,
               ax=ax);

### Métriques de performances

In [None]:
# Error metrics on the test set; the MSE is computed once and reused for its square root
medae = median_absolute_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
print(f'Erreur absolue médiane (Median Aboslute Error) : {medae.round()}')
print(f'Erreur absolue moyenne (Mean Absolute Error - MAE) : {mae.round()}')
print(f'Erreur quadratique moyenne(Mean Squared Error - MSE) : {mse.round()}')
print(f'Racine carré de MSE : {np.sqrt(mse).round()}')

## Ajout de variables catégorielles et utilisation de make_column_transformer()

In [None]:
# Predictors now include a nominal and an ordinal column; target kept as a one-column frame
mixed_predictors = ['GrLivArea', 'GarageArea', 'Neighborhood', 'OverallQual']
X = df[mixed_predictors]
y = df[['SalePrice']]

# Hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
for label, subset in (('Train set:', X_train), ('Test set:', X_test)):
    print(label, subset.shape)

### Préparation pipeline multiple

In [None]:
# Group the predictors by type so that each group can get its own preprocessing
numerical_features = ['GrLivArea', 'GarageArea'] # continuous numeric
categorical_features = ['Neighborhood'] # nominal categorical
ordinal_features = ['OverallQual'] # ordinal categorical

In [None]:
# One preprocessing pipeline per feature type
numerical_pipeline = make_pipeline(RobustScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
# By default OrdinalEncoder raises on a category it did not see during fit,
# which can happen when a rare level is missing from a CV training fold or
# from the training split. Encode such levels as -1 instead, mirroring the
# 'ignore' policy of the one-hot encoder above.
ordinal_pipeline = make_pipeline(
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
)

In [None]:
# Column transformer routing each feature group to its dedicated pipeline
column_pipelines = [
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
    (ordinal_pipeline, ordinal_features),
]
preprocessor = make_column_transformer(*column_pipelines)

### Entrainement et ajustements avec RandomizedSearchCV()

In [None]:
# Final pipeline: column preprocessing followed by the regression tree
tree_regressor = DecisionTreeRegressor(random_state=42)
model = make_pipeline(preprocessor, tree_regressor)
model.fit(X_train, y_train)
# Mean of the 5-fold cross-validation scores on the training set
full_cv_scores = cross_val_score(model, X_train, y_train, cv=5)
full_cv_scores.mean()

In [None]:
# Search space; keys follow the "<step name>__<parameter>" convention, with the
# scaler nested inside the column transformer's first pipeline
params = {
    'columntransformer__pipeline-1__robustscaler__with_centering': [True, False],
    'columntransformer__pipeline-1__robustscaler__with_scaling': [True, False],
    'columntransformer__pipeline-1__robustscaler__unit_variance': [True, False],
    # 'mse' / 'mae' were renamed 'squared_error' / 'absolute_error' in
    # scikit-learn 1.0 and the old spellings were removed in 1.2.
    'decisiontreeregressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],  # default = 'squared_error'
    'decisiontreeregressor__max_depth': np.arange(1, 5, 1),  # default = None
    'decisiontreeregressor__min_samples_split': np.arange(2, 31, 1),  # default = 2
    'decisiontreeregressor__min_samples_leaf': np.arange(2, 31, 1),  # default = 1
    # None (all features) replaces 'auto', which meant the same for a
    # regression tree and was removed in scikit-learn 1.3.
    'decisiontreeregressor__max_features': [None, 'sqrt', 'log2'],  # default = None
    'decisiontreeregressor__max_leaf_nodes': np.arange(2, 11, 1),  # default = None
    'decisiontreeregressor__min_impurity_decrease': np.arange(1, 31, 1),  # default = 0.0
    'decisiontreeregressor__ccp_alpha': np.arange(1, 11, 1)  # default = 0.0
    }

# random_state makes the sampled candidates, and therefore best_params_,
# reproducible from one run to the next.
grid = RandomizedSearchCV(model, params, cv=3, n_iter=1000, random_state=42)
grid.fit(X_train, y_train)
grid.best_params_

### Résultats et métriques de performances

In [None]:
# Keep the best pipeline found by the search and display its cross-validation score
model_2 = grid.best_estimator_
grid.best_score_

In [None]:
# Predict on the held-out test set and display the pipeline's score on it
prediction = model_2.predict(X_test)
model_2.score(X_test, y_test)

In [None]:
# Error metrics on the test set; the MSE is computed once and reused for its square root
medae = median_absolute_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
print(f'Erreur absolue médiane (Median Aboslute Error) : {medae.round()}')
print(f'Erreur absolue moyenne (Mean Absolute Error - MAE) : {mae.round()}')
print(f'Erreur quadratique moyenne(Mean Squared Error - MSE) : {mse.round()}')
print(f'Racine carré de MSE : {np.sqrt(mse).round()}')