In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import p_reporting.m_submission as submit
import p_analysis.m_custom_error_metrics as custom_error_metrics

# Load Data

In [2]:
df = pd.read_csv("data/diamonds_train.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


# Features y Labels
Separo los datos en:
- features: los datos que usaremos para predecir el precio.
- label: el precio, el objetivo a predecir.

In [3]:
features_cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y','z']
label_cols = 'price'

X = df[features_cols]
y = df[label_cols]

print(X.shape)
print(y.shape)

(40455, 9)
(40455,)


## Numerical Features vs Categorical Features
Las *features* se dividen en:
- Numéricas: aquellas que contienen números.
- Categóricas: aquellas que contienen texto y se pueden separar en categorías.

In [4]:
numerical_features = ['carat', 'depth', 'table', 'x', 'y','z']
categorical_features = ['cut', 'color', 'clarity']

# Transformaciones

## One-hot-encoding
Los datos *categóricos* hay que convertirlos en datos numéricos para que los modelos de *machine learning* puedan  trabajar con ellos. 

Utilizo *one-hot-encoding* en los datos categóricos.

## Feature Scaling
Los modelos de *machine learing* trabajan mejor si los datos numéricos están en una escala similar.

El escalado se aplica a los datos numéricos.

## Column Transformer Preprocessor
Junto las transformaciones anteriores en un solo *ColumnTransformer*.

In [5]:
preprocessor = ColumnTransformer(transformers=[('numerical', StandardScaler(), numerical_features),
                                               ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [6]:
preprocessor

ColumnTransformer(transformers=[('numerical', StandardScaler(),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('categorical',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['cut', 'color', 'clarity'])])

### Comprobamos que el *transformer* funciona

In [7]:
X_transformed = pd.DataFrame(data=preprocessor.fit_transform(X))
X_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Machine Learning

## Model Selection
Utilizo una *pipeline* para automatizar el preprocesado de datos y el entrenamiento del modelo de *machine learning*.

In [8]:
ml_model = LGBMRegressor(n_estimators=256)

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', ml_model)])

## Validación Cruzada
Utilizo validación cruzada para evaluar las predicciones de forma más precisa.

In [9]:
scores = cross_val_score(model, 
                         X, 
                         y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=10, 
                         n_jobs=-1)

In [10]:
print(f"El RMSE medio es: {custom_error_metrics.get_average_rmse(scores)}")

El RMSE medio es: 537.5094525133507


## Train
Una vez tengo un RMSE decente entreno el modelo con **todos** los datos del dataset de entrenamiento.

In [11]:
model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical', StandardScaler(),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['cut', 'color',
                                                   'clarity'])])),
                ('regressor', LGBMRegressor(n_estimators=256))])

### Nota
En la siguiente sección hago optimización de metaparámetros con *RandomizedSearchCV*. 

La parte de Validación Cruzada queda cubierta con ese método, así que ya no es necesaria.

Dejo este apartado como ilustración de cómo hacer validación cruzada individualmente.

## Optimización de Metaparámetros
Utilizo *RandomizedSearchCV* para probar el modelo con distintas combinaciones de metaparámetros.

In [12]:
param_grid = {
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
    'regressor__num_leaves': [4, 8, 16, 31, 64, 70, 80],
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  2

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numerical',
                                                                               StandardScaler(),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
                                                                                'y',
                                                                                'z']),
                                                                              ('categorical',
                                                                               OneHotEncoder(handle_unknow

### ¿Cuales son los mejores parámetros?

In [13]:
grid_search.best_params_

{'regressor__num_leaves': 64,
 'regressor__n_estimators': 512,
 'regressor__boosting_type': 'dart'}

In [17]:
print(f"El RMSE con los mejores parámetros ha sido: {-grid_search.best_score_}")

El RMSE con los mejores parámetros ha sido: 532.3287515602095


A fecha 23032021@00:35, los mejores parámetros que me han salido son:
- 'regressor__num_leaves: 64,
- 'regressor__n_estimators: 512,
- 'regressor__boosting_type: 'dart'

Con un RMSE de 532.32 aprox.

# Competi
Cargo el set de datos de test y hago una predicción.

Como he hecho Grid Search, no hace falta un fit del modelo (utilizo el mejor fit del Grid Search).

In [15]:
X_test = pd.read_csv('data/diamonds_test.csv')
y_predict = grid_search.predict(X_test)



## Submission
Guardo los datos para el envio a la competición de Kaggle.

In [16]:
submit.to_csv(submit.to_df(X_test, y_predict))
print("Guardado!!!")

Guardado!!!
