In [1]:
# imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# data visualization

import matplotlib.pyplot as plt
import seaborn as sns

# machine learning models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor

# error metrics

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# for training our machine model and testing the results
train = pd.read_csv('../input/dapt202011mad/diamonds_train.csv')

In [3]:
# to make our final price prediction
test = pd.read_csv('../input/dapt202011mad/diamonds_test.csv')

In [4]:
target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
features_pipeline = cat_features + num_features

# 2. PIPELINE

In [5]:
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                       ('scaler', StandardScaler())])

In [6]:
numerical_transformer.fit_transform(train[num_features])

array([[ 0.8670056 ,  0.45201864,  0.24798091,  0.97880679,  0.92198533,
         1.02265738],
       [-1.00455749,  0.8710986 , -0.19974534, -1.22673789, -1.17981558,
        -1.1292594 ],
       [-0.18443434,  2.61726508, -1.09519783, -0.09728557, -0.17688154,
         0.16189067],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158,  0.56971383,  0.5993022 ,
         0.6783507 ],
       [-0.98352869,  0.10278535, -1.4086062 , -1.13780463, -1.10132509,
        -1.11491329],
       [ 0.93009199,  0.172632  ,  0.24798091,  0.97880679,  1.00047582,
         1.02265738]])

In [7]:
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                          ('encoder', OneHotEncoder(handle_unknown='ignore'))])

In [8]:
categorical_transformer.fit_transform(train[cat_features])

<40455x20 sparse matrix of type '<class 'numpy.float64'>'
	with 121365 stored elements in Compressed Sparse Row format>

In [9]:
preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', numerical_transformer, num_features),
                                                ('categorical_preprocessor', categorical_transformer, cat_features)])

In [10]:
preprocessor.fit_transform(train[features_pipeline])

array([[ 0.8670056 ,  0.45201864,  0.24798091, ...,  1.        ,
         0.        ,  0.        ],
       [-1.00455749,  0.8710986 , -0.19974534, ...,  1.        ,
         0.        ,  0.        ],
       [-0.18443434,  2.61726508, -1.09519783, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98352869,  0.10278535, -1.4086062 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.93009199,  0.172632  ,  0.24798091, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
X = train[features_pipeline]
y = train[target]

In [12]:
# train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((30341, 9), (10114, 9), (30341,), (10114,))

In [13]:
# creating pipeline for different models

pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                       ('rf_regressor', RandomForestRegressor())]) 

pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ("lr_regressor",LinearRegression())])

pipeline_dt = Pipeline(steps=[('preprocessor', preprocessor),
                     ("dt_regressor",DecisionTreeRegressor())])

pipeline_kn = Pipeline(steps=[('preprocessor', preprocessor),
                     ("kn_regressor",KNeighborsRegressor())])

pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                     ("xgb_regressor",XGBRegressor())])

pipeline_lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                     ("lgbm_regressor",LGBMRegressor())])

pipeline_extratr = Pipeline(steps=[('preprocessor', preprocessor),
                     ("extratr_regressor",ExtraTreesRegressor())])

In [14]:
# list of all pipelines
pipelines = [pipeline_rf, pipeline_lr, pipeline_dt, pipeline_kn, pipeline_xgb, pipeline_lgbm, pipeline_extratr]

# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "RandomForest", 
             1: "LinearRegression", 
             2: "DecissionsTree",
             3: "KNeighbors", 
             4: "XGBRegressor",
             5: "LGBMRegressor",
             6: "ExtraTreeRegressor"}

In [15]:
# training all my models

for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [16]:
# making cross validation for all my models

cv_results = []

for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=10)
    cv_results.append(cv_score)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

RandomForest: -554.710583 
LinearRegression: -1160.367740 
DecissionsTree: -753.080653 
KNeighbors: -794.101844 
XGBRegressor: -552.435314 
LGBMRegressor: -545.288553 
ExtraTreeRegressor: -547.866894 


We are going to work with LGMBRegressor which appears to be the best model.

In [17]:
lgmb = Pipeline(steps=[('preprocessor', preprocessor),
                     ("lgbm_regressor",LGBMRegressor(learning_rate=0.1, max_depth=8, n_estimators=500, num_leaves=21))])
model_lgmb = lgmb.fit(X_train, y_train)
y_pred_lgmb = lgmb.predict(X_test)
mean_squared_error(y_test, y_pred_lgmb)**0.5

550.9320305483186

In [18]:
xgb = Pipeline(steps=[('preprocessor', preprocessor),
                     ("xgb_regressor",XGBRegressor(base_score=0.6,
                                                   learning_rate=0.06,
                                                   max_depth= 8,
                                                   n_estimators=300,
                                                   booster = 'gbtree',
                                                   n_jobs = -1,
                                                   gamma=0.0,
                                                   min_child_weight=3,
                                                   reg_alpha=0.1,
                                                   subsample=0.9))])
model_xgb = xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
mean_squared_error(y_test, y_pred_xgb)**0.5

# best result by now

537.5983930391792

In [19]:
extratr = pipeline_extratr = Pipeline(steps=[('preprocessor', preprocessor),
                     ("extratr_regressor",ExtraTreesRegressor(bootstrap=False, 
                                                              max_features=0.8, 
                                                              min_samples_leaf=1, 
                                                              min_samples_split=16, 
                                                              n_estimators=100))])
model_extratr = extratr.fit(X_train, y_train)
y_pred_extratr = extratr.predict(X_test)
mean_squared_error(y_test, y_pred_extratr)**0.5

541.2199737197308

In [20]:
# using grid search to look for the best hyperparams.

ptmodel = Pipeline(steps=[('preprocessor', preprocessor),
                     ("lgbm_regressor",LGBMRegressor())])
params = {
    'regressor__max_depth': [2, 4, 8, 16],
    'regressor__n_estimators': [64, 128, 256, 512],
    'regressor__learning_rate': [0.1, 0.01, 0.001],
    'preprocessor__numerical_preprocessor__imputer__strategy': ['mean', 'median']
}

gridsearch = RandomizedSearchCV(ptmodel,
                                params,
                               cv=5,
                               verbose=10,
                               scoring='neg_root_mean_squared_error',
                               n_jobs=-1,
                               n_iter=20)

In [21]:
gridsearch.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: Invalid parameter regressor for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numerical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('categorical_preprocessor',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['cut', 'color',
                                                   'clarity'])])),
                ('lgbm_regressor', LGBMRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [22]:
gridsearch.predict(test[features_pipeline])

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [23]:
# cross validation

errors = cross_val_score(model_extratr, X, y, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
np.mean(-errors)

533.3184730520621

In [24]:
y_submission = model_extratr.predict(test[features_pipeline])

In [25]:
pd.DataFrame({'price': y_submission, 'id': test.id}).to_csv('submission.csv', index=False)