In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import p_reporting.m_submission as submit
import p_analysis.m_custom_error_metrics as custom_error_metrics

# Load Data

In [2]:
# Read the training split of the diamonds dataset and preview its first rows.
train_csv_path = "data/diamonds_train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


## Features y Labels
Separo los datos en:
- features: los datos que usaremos para predecir el precio.
- label: el precio, el objetivo a predecir.

In [3]:
# Column names used as model inputs, and the column the model must predict.
features_cols = [
    'carat', 'cut', 'color', 'clarity',
    'depth', 'table',
    'x', 'y', 'z',
]
label_cols = 'price'

# Split the loaded frame into the feature table and the target series.
X = df.loc[:, features_cols]
y = df.loc[:, label_cols]

# Show both shapes, one per line, as a quick sanity check.
print(X.shape, y.shape, sep="\n")

(40455, 9)
(40455,)


## Numerical Features vs Categorical Features
Las *features* se dividen en:
- Numéricas: aquellas que contienen números.
- Categóricas: aquellas que contienen texto.

Los datos *categóricos* hay que convertirlos en datos numéricos para que los modelos de *machine learning* puedan trabajar con ellos. Utilizo *one-hot-encoding* para hacer esto.

In [4]:
# Feature columns grouped by kind: numeric ones are used as they are,
# categorical (text) ones are one-hot encoded.
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_features = ['cut', 'color', 'clarity']

# Encode straight from the loaded frame `df` rather than from `X`: `X` is
# reassigned to the encoded matrix in the following cell and then no longer
# holds the text columns, so reading from `X` would raise a KeyError if this
# cell were executed again. `X` was selected from `df`, so the values are the same.
X_categorical = pd.get_dummies(df[categorical_features])
X_categorical.shape

(40455, 20)

In [5]:
# Build the final feature matrix: numeric columns followed by the one-hot
# encoded categorical columns.
# The numeric columns are read from `df` instead of from `X` itself, so this
# cell is idempotent: the original `X = concat([X[...], ...])` form appended
# the dummy columns again every time the cell was re-run. `X` was selected
# from `df`, so on a first run the result is unchanged.
X = pd.concat([df[numerical_features], X_categorical], axis=1)
X.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.21,62.4,58.0,6.83,6.79,4.25,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
1,0.32,63.0,57.0,4.35,4.38,2.75,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.71,65.5,55.0,5.62,5.53,3.65,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.41,63.8,56.0,4.68,4.72,3.0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.02,60.5,59.0,6.55,6.51,3.95,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


## Datos de Test vs Entrenamiento
Para validar el modelo se dividen los datos en dos conjuntos:
- Datos de entrenamiento: 80%
- Datos de test: 20%

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Feature Scaling
Los modelos de *machine learning* trabajan mejor si los datos numéricos están en una escala similar.

Este proceso se hace **después** de dividir los datos de test y entrenamiento.

Consiste en crear un *Scaler*. Este *Scaler* se entrena **solo** con los datos de entrenamiento. Luego se aplica *.transform()* tanto a los datos de entrenamiento como a los de test.

In [7]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Machine Learning

## Model Selection

In [8]:
# Seed the forest so that training is repeatable: without a fixed
# random_state each run produces a different model, and therefore a different
# error score and submission. 42 matches the seed already used for the
# train/test split.
model = RandomForestRegressor(random_state=42)

## Train

In [9]:
# Fit the selected model on the scaled training features and their prices.
model.fit(X_train, y_train)

RandomForestRegressor()

## Predict

In [10]:
# Predict prices for the held-out test split.
y_predict = model.predict(X_test)

## Error
Comprobar cómo de acertada ha sido la predicción.

In [11]:
# Score the held-out predictions with the project's get_rmse helper and print the result.
rmse = custom_error_metrics.get_rmse(y_test, y_predict)
print(rmse)

558.0286665804889


# Competición
## Cargo el dataset de test

In [12]:
# Read the competition (unlabelled) split and preview its first rows.
competition_csv_path = "data/diamonds_test.csv"
test_df = pd.read_csv(competition_csv_path)
test_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19


In [13]:
# Keep only the columns that were selected as features for training.
X_competi = test_df.loc[:, features_cols]

X_competi.shape

(13485, 9)

## Aplico las mismas transformaciones que a los datos de train y test

### One-hot encoding con get_dummies

In [14]:
# `numerical_features` and `categorical_features` are reused from the cell
# that encoded the training data instead of being re-declared here, so the
# two encodings cannot drift apart if one list is edited.
# The text columns are read from `test_df` rather than `X_competi`, because
# `X_competi` is reassigned to the encoded matrix in the following cell and
# re-running this cell would then raise a KeyError.
X_competi_categorical = pd.get_dummies(test_df[categorical_features])

In [15]:
# Assemble the competition matrix the same way as the training matrix:
# numeric columns followed by the one-hot encoded categorical columns.
# The numeric columns come from `test_df` (not from `X_competi` itself) so
# re-running this cell does not append the dummy columns a second time.
X_competi = pd.concat([test_df[numerical_features], X_competi_categorical], axis=1)

# Align to the exact columns, and column order, of the training matrix `X`.
# get_dummies only creates columns for categories present in the frame it is
# given, so a category absent from this file would otherwise leave a column
# missing; such columns are added filled with 0. Columns for categories that
# never appeared in training are dropped, since the scaler and model were not
# fitted on them.
X_competi = X_competi.reindex(columns=X.columns, fill_value=0)
X_competi.shape

(13485, 26)

### Scaler

In [16]:
# Reuse the scaler fitted on the training split; it is not re-fitted here.
# NOTE(review): this overwrites X_competi, so running the cell twice scales the data twice.
X_competi = scaler.transform(X_competi)

## Predicción

In [17]:
# Predict with the trained model and show the first ten predictions.
y_predict_competi = model.predict(X_competi)
y_predict_competi[:10]

array([ 2904.22,  5404.94,  9231.47,  4189.41,  1708.99,  6342.91,
        4797.07,  4313.94,  5128.99, 15952.2 ])

## Submission

In [18]:
# Build the submission table from the competition rows and their predictions,
# then pass it to the project's CSV writer.
submission_df = submit.to_df(test_df, y_predict_competi)
submit.to_csv(submission_df)