In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from math import sqrt
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
#data = pd.read_csv("output/clean_data.csv")

In [3]:
#data.head()

In [4]:
data = pd.read_csv("input/train.csv")
data = pd.get_dummies(data)
data.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,0,0,...,1,0,0,0,0,0,0,1,0,0
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,0,0,...,0,1,0,0,0,0,0,1,0,0


In [5]:
columnas = [i for i in data.columns if i not in ["price"]]
X = data[columnas]
y = data["price"]

In [6]:
pipeline = [
    StandardScaler(),
    Normalizer()
]

transformer = make_pipeline(*pipeline)

X_data = transformer.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y, test_size=0.2)


In [9]:
models = {
    "LinearRegression" : LinearRegression(),
    "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators = 900),
    "RandomForestRegressor": RandomForestRegressor(n_estimators = 200),
    "Ridge" : Ridge(),
    "Lasso": Lasso(),
    "BayesianRidge" : BayesianRidge(),
    "SGDR" : SGDRegressor(),
    "KNeighbors" : KNeighborsRegressor(),
    "DecisionTree": DecisionTreeRegressor(random_state=0),
    "ExtraTreesRegressor":ExtraTreesRegressor(n_estimators = 900,n_jobs =1, min_samples_split= 10, random_state=10, max_features = 0.9, warm_start=True)
}

In [10]:
for name, model in models.items():
    print(f"trained ML model: {name}")
    model.fit(X_train, y_train)
    print(f"end of model training:  {name}")


trained ML model: LinearRegression
end of model training:  LinearRegression
trained ML model: GradientBoostingRegressor
end of model training:  GradientBoostingRegressor
trained ML model: RandomForestRegressor
end of model training:  RandomForestRegressor
trained ML model: Ridge
end of model training:  Ridge
trained ML model: Lasso
end of model training:  Lasso
trained ML model: BayesianRidge
end of model training:  BayesianRidge
trained ML model: SGDR
end of model training:  SGDR
trained ML model: KNeighbors
end of model training:  KNeighbors
trained ML model: DecisionTree
end of model training:  DecisionTree
trained ML model: ExtraTreesRegressor
end of model training:  ExtraTreesRegressor


In [11]:
for name, model2 in models.items():
    y_pred2 = model2.predict(X_test)
    print(f"-------{name}-------")
    print ("RMSE: ",sqrt(mean_squared_error(y_pred2, y_test)))

-------LinearRegression-------
RMSE:  0.18586520812993107
-------GradientBoostingRegressor-------
RMSE:  0.09918313907304044
-------RandomForestRegressor-------
RMSE:  0.10202644798413074
-------Ridge-------
RMSE:  0.18555092018976416
-------Lasso-------
RMSE:  1.0125142718025117
-------BayesianRidge-------
RMSE:  0.1858338926949975
-------SGDR-------
RMSE:  0.18948578236688593
-------KNeighbors-------
RMSE:  0.20321030186808203
-------DecisionTree-------
RMSE:  0.14522110410250966
-------ExtraTreesRegressor-------
RMSE:  0.09805288355945604
