In [1]:
from src.diamonds_data import DiamondsData
from src.transform_toolbox import label_encode_data, normalize_data_pipeline, export_to_kaggle_csv
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate


# Cargo los ficheros del csv

In [2]:
# Load the training CSV through the project's DiamondsData wrapper.
diamonds_train_df = DiamondsData("input/train.csv")

# X is whatever get_features() returns and y whatever get_groundtruth() returns
# (presumably the feature columns and the price target — confirm in src.diamonds_data).
X, y = diamonds_train_df.get_features(), diamonds_train_df.get_groundtruth()

# Transformación y preprocesado de datos

In [3]:
# Convert the "cut", "color" and "clarity" columns to numeric codes.
# label_encode_data is a project helper; presumably it wraps scikit-learn's
# LabelEncoder — confirm in src.transform_toolbox.
object_columns = ["cut", "color", "clarity"]
X_label_encoded = label_encode_data(X, object_columns)

# Preview the first rows to check that "cut", "color" and "clarity" are now numeric.
X_label_encoded.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.12,3,4,2,61.6,59.0,6.67,6.63,4.1
1,1.14,4,1,3,60.0,54.0,6.74,6.97,4.11
2,0.9,4,0,3,60.3,63.0,6.12,6.22,3.72
3,0.71,2,1,4,61.9,54.0,5.74,5.76,3.56
4,0.34,4,2,3,60.0,62.0,4.51,4.55,2.72


In [4]:
# Scaling steps applied to the label-encoded features.
# Only StandardScaler is active: it rescales each column to mean 0 and
# standard deviation 1. Adding Normalizer() as a second step was tried and
# made the metrics worse, so it is intentionally left out.
pipeline_transforms = [StandardScaler()]

# normalize_data_pipeline is a project helper that runs the listed transforms
# over the data (details live in src.transform_toolbox).
X_normalized = normalize_data_pipeline(X_label_encoded, pipeline_transforms)

# First five rows after the transformation.
# Sanity check if needed: X_normalized["carat"].mean() and .std() should be
# approximately 0 and 1 respectively.
# NOTE(review): in the stored output, rows sharing the same encoded "cut"
# value show different scaled values, which StandardScaler alone would not
# produce — the output may be stale (e.g. from a run with Normalizer enabled).
# Re-run the notebook from a clean kernel to confirm.
display(X_normalized.head())


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.306806,0.197493,0.374375,-0.480789,-0.045802,0.314812,0.378267,0.351302,0.357559
1,0.226428,0.444007,-0.294942,-0.151882,-0.382685,-0.487746,0.282435,0.336856,0.252874
2,0.061445,0.40433,-0.437076,-0.138309,-0.28858,0.713883,0.099123,0.120429,0.073081
3,-0.097372,-0.282883,-0.49208,0.050567,0.05666,-0.813753,0.004102,0.011266,0.01574
4,-0.272854,0.399286,-0.098846,-0.136584,-0.34414,0.577911,-0.307473,-0.290782,-0.325866


# Separar datos de entrenamiento de datos de test

In [5]:
# Hold out 20% of the rows as a test set. random_state is fixed so the split
# is identical on every Restart & Run All (without it the rows are shuffled
# differently on each execution).
# NOTE(review): X_train / X_test / y_train / y_test are not used by the later
# cells visible in this notebook — cross-validation and the final fit both use
# the full X_normalized, y. Either evaluate on this hold-out or drop the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=42
)


# Evaluación de varios modelos usando Cross-Validation

In [6]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Candidate regressors to compare, keyed by a short display name.
# Every estimator uses its library defaults except the SVR, whose kernel is
# set explicitly to RBF.
models = dict(
    svm_rbf_kernel=SVR(kernel='rbf'),
    linear_regression=LinearRegression(),
    k_nearest=KNeighborsRegressor(),
    xgb_regressor=XGBRegressor(),
)


In [7]:
# Number of cross-validation folds.
n = 5

# Metrics computed on every fold; cross_validate returns them under the keys
# "test_neg_mean_squared_error" and "test_r2".
cv_scoring = ['neg_mean_squared_error', 'r2']

# Run n-fold cross-validation for each candidate and keep the raw score dicts,
# keyed by model name.
# NOTE(review): X_normalized was scaled on the full dataset before the folds
# are formed, so the validation folds influenced the scaling statistics.
# Consider wrapping scaler + model in a scikit-learn Pipeline to avoid this.
result = {}
for name, model in models.items():
    result[name] = cross_validate(model, X_normalized, y, cv=n, scoring=cv_scoring)
    

In [8]:
# Print the cross-validated metrics of each model so the candidates can be compared.
for model_name, cv_scores in result.items():
    print("Modelo: " + model_name)

    # Mean R² over the folds, rounded to two decimals for display.
    mean_r2 = round(np.mean(cv_scores['test_r2']), 2)

    # scikit-learn reports MSE negated (higher is better), so flip the sign
    # back. The RMSE is the square root of the mean MSE over the folds, not
    # the mean of the per-fold RMSEs.
    mean_mse = -1 * np.mean(cv_scores['test_neg_mean_squared_error'])
    rmse = sqrt(mean_mse)

    print("- Root Mean Squared Error: " + str(rmse))
    print("- r2: " + str(mean_r2))
    print("")


Modelo: svm_rbf_kernel
- Root Mean Squared Error: 2673.1724417971177
- r2: 0.55

Modelo: linear_regression
- Root Mean Squared Error: 1605.8567919488596
- r2: 0.84

Modelo: k_nearest
- Root Mean Squared Error: 1106.981680500646
- r2: 0.92

Modelo: xgb_regressor
- Root Mean Squared Error: 723.7984412242815
- r2: 0.97



# Entrenamiento del modelo definitivo
De acuerdo con los resultados anteriores, el modelo que ofrece la mejor predicción es XGBoost (eXtreme Gradient Boosting): obtiene el menor RMSE (≈ 724) y el mayor r2 (0.97).

El siguiente paso es entrenar el modelo con todos los datos de entrenamiento y, después, formatear y enviar las predicciones.

In [9]:
# --- Prepare the prediction set with the same steps used for the training data ---

# Load the rows to predict from the prediction CSV.
diamonds_predict_df = DiamondsData("input/predict.csv")
X_predict = diamonds_predict_df.get_features()

# Label-encode the same categorical columns as in training, then apply the
# same list of scaling transforms.
# NOTE(review): if label_encode_data / normalize_data_pipeline re-fit their
# encoders and scalers on each call, predict.csv is encoded and scaled with
# statistics different from the training data. Confirm in
# src.transform_toolbox that the fitted training transforms are reused.
X_predict_label_encoded = label_encode_data(X_predict, object_columns)
X_predict_normalized = normalize_data_pipeline(X_predict_label_encoded, pipeline_transforms)

# --- Fit the chosen model on all of the training data and predict ---

model = XGBRegressor()
model.fit(X_normalized, y)

# Predicted values for every row of the prediction CSV.
diamond_price_prediction = model.predict(X_predict_normalized)

# Preparación de los resultados para enviar a Kaggle

In [10]:
# Quick look at the predictions; long arrays are printed in abbreviated form.
print(diamond_price_prediction)

[ 791.1374 6061.3022 6015.245  ... 2541.3706 1140.6143 1033.5537]


In [13]:
# Export the predictions together with the full prediction dataset via the
# project helper; per the stored output it writes a timestamped CSV under
# output/ and prints the destination path.
export_to_kaggle_csv(diamonds_predict_df.get_all(), diamond_price_prediction)

Exportados los datos a: output/'prediction_20201005_151853.csv'
