In [None]:
# Ralph Mouawad, Lea Bou Sleiman, Michel Lamah
# INDE 535 - Project - Inst. Mario Karam
# Car Price Prediction

In [1]:
# Libraries to import
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from sklearn.metrics import make_scorer

1- Data Cleaning and Preparation

In [2]:
# Load dataset and see info
df = pd.read_csv('train.csv',index_col='id') # the first column will be our index
print(df.tail())
print(df.shape)
unlabeled_df = pd.read_csv('test.csv', index_col = 'id')
print(df.info())
df.describe()

       created_at_first            city     region    year body_type  \
id                                                                     
730089         8/1/2018   Other in Tyre       Tyre  2019.0     Other   
730090        9/23/2019  Other in Akkar      Akkar  1984.0     Other   
730091        11/9/2017           Dbaye       Metn  2014.0       NaN   
730092        7/24/2019       Al Bahsas    Tripoli  2011.0     Sedan   
730093       11/17/2018          Kaslik  Keserouan  2014.0     Other   

       transmission_type      kilometers      brand model    price  
id                                                                  
730089            Manual       0 to 9999  blackhawk     O   1900.0  
730090            Manual  20000 to 29999  avtokuban     G   2000.0  
730091        Steptronic  70000 to 79999   columbia     A    100.0  
730092        Steptronic  70000 to 79999   hamilton    AD  26200.0  
730093           Automat             NaN        NaN   NaN      NaN  
(730093, 10)

Unnamed: 0,year,price
count,724944.0,730092.0
mean,2001.634475,91317590.0
std,9.789225,26990090000.0
min,1920.0,0.0
25%,1996.0,3400.0
50%,2003.0,6200.0
75%,2009.0,10900.0
max,2019.0,11500000000000.0


In [3]:
# Locate duplicated rows
dups = df.duplicated()
df = df.drop_duplicates()

In [4]:
# Check the number of Missing Values
df.isnull().sum() # We can see that "brand" and "model" have the same nb of missing values. this is not random. same for "transmission_type" and "kilometers"

created_at_first          0
city                   5721
region                    0
year                   5141
body_type            347337
transmission_type      5024
kilometers             5025
brand                 15148
model                 15148
price                     1
dtype: int64

In [5]:
# Calculating upper and lower bounds for outliers
lower_bound = df['price'].quantile(0.05)
upper_bound = df['price'].quantile(0.95)

print("Upper bound:", upper_bound)
print("Lower bound:", lower_bound)

Upper bound: 23000.0
Lower bound: 1100.0


In [6]:
# Remove the outliers
df = df[(df['price'] < upper_bound) & (df['price'] > lower_bound)]

In [7]:
print(df.isnull().sum())
print('City Unique Values:',len(df['city'].unique()))
print('Region Unique Values:',len(df['region'].unique()))
print('Body Type Unique Values:',len(df['body_type'].unique()))
print('Transmission Unique Values:',len(df['transmission_type'].unique()))
print('Kilometers Unique Values:',len(df['kilometers'].unique()))
print('Brand Unique Values:',len(df['brand'].unique()))
print('Model Unique Values:',len(df['model'].unique()))

created_at_first          0
city                   5158
region                    0
year                   4413
body_type            309447
transmission_type      4349
kilometers             4349
brand                 14066
model                 14066
price                     0
dtype: int64
City Unique Values: 331
Region Unique Values: 27
Body Type Unique Values: 7
Transmission Unique Values: 4
Kilometers Unique Values: 22
Brand Unique Values: 52
Model Unique Values: 52


In [8]:
# Clean year column, and make it as an age for the car
df['year'] = df['year'].fillna(value=df['year'].mode()[0])
df['year'] = df['year'].max().astype(int) - df['year']

unlabeled_df['year'] = unlabeled_df['year'].fillna(value=unlabeled_df['year'].mode()[0])
unlabeled_df['year'] = unlabeled_df['year'].max().astype(int) - unlabeled_df['year']

In [9]:
df['transmission_type'].fillna(df['transmission_type'].mode()[0])

id
1             Manual
2         Steptronic
3         Steptronic
4          Automatic
5         Steptronic
             ...    
730084     Automatic
730086     Automatic
730088     Automatic
730089        Manual
730090        Manual
Name: transmission_type, Length: 652835, dtype: object

In [10]:
# Drop irrelevant columns as well as missing values of others
df = df.drop(['city','region','created_at_first', 'body_type'], axis=1)
unlabeled_df = unlabeled_df.drop(['city','region','created_at_first', 'body_type'], axis = 1)
df = df.dropna(subset=['brand', 'model', 'kilometers'])

2- Data Pre-Processing

In [11]:
# Print the number of unique values for each column of the dataset
for col in df.columns:
  print(f"Unique values for column '{col}': {df[col].nunique()}")

Unique values for column 'year': 100
Unique values for column 'transmission_type': 3
Unique values for column 'kilometers': 21
Unique values for column 'brand': 50
Unique values for column 'model': 51
Unique values for column 'price': 218


In [12]:
# Encoding kilometers with 5 labels to make it easier for random forest regressor
labels = ["Very new", "new", "Somewhat", "Old", "Very old"]
buckets = [0, 19999, 39999, 79999, 119999, float('inf')]
try:
    df['kilometers'] = df['kilometers'].str.split('to').str[1].astype(int)
    unlabeled_df['kilometers'] = unlabeled_df['kilometers'].str.split('to').str[1].astype(int)
except:
    df['kilometers'] = df['kilometers'].str.split('to').str[0].astype(int)
    unlabeled_df['kilometers'] = unlabeled_df['kilometers'].str.split('to').str[0].astype(int)
df['kilometers'] = pd.cut(df['kilometers'], bins=buckets, labels=labels)
unlabeled_df['kilometers'] = pd.cut(unlabeled_df['kilometers'], bins=buckets, labels=labels)
df['kilometers'] = df['kilometers'].fillna('Very new')
unlabeled_df['kilometers'] = unlabeled_df['kilometers'].fillna('Very new')

In [13]:
# Encode Categorical Variables with One Hot Encoder and Ordinal Encoder

one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
ordinal_encoder = OrdinalEncoder()

# Define the column transformer
col_trans = make_column_transformer(
    (one_hot_encoder, ['brand', 'model', 'transmission_type']),  # One-hot encode these columns
    (ordinal_encoder, ['kilometers']),  # Ordinal encode 'kilometers' column
    remainder='passthrough'  # Pass through other columns without transformation
)


In [14]:
X = df[['year', 'transmission_type', 'kilometers', 'brand', 'model']]
Y = df['price']

In [15]:
# Encode 'kilometers' column using ordinal encoding
ordinal_encoder = OrdinalEncoder(categories=[["Very new", "new", "Somewhat", "Old", "Very old"]])
X['kilometers'] = ordinal_encoder.fit_transform(X[['kilometers']])
unlabeled_df['kilometers'] = ordinal_encoder.transform(unlabeled_df[['kilometers']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['kilometers'] = ordinal_encoder.fit_transform(X[['kilometers']])


In [16]:
X.head()

Unnamed: 0_level_0,year,transmission_type,kilometers,brand,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,18.0,Manual,0.0,divolvo,N
2,16.0,Steptronic,0.0,scuderia,AG
3,11.0,Steptronic,0.0,kauffman,B
4,17.0,Automatic,0.0,blackhawk,C
5,14.0,Steptronic,3.0,hispakart,K


In [17]:
unlabeled_df.head()

Unnamed: 0_level_0,year,transmission_type,kilometers,brand,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
766960,21,Automatic,0.0,mania spyder,AA
766961,11,Automatic,4.0,gaeth,H
766962,9,Steptronic,0.0,hamilton,K
766963,31,Automatic,0.0,scuderia,H
766964,30,Manual,3.0,dewitt,J


In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [19]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [20]:
unlabeled_df = pd.get_dummies(unlabeled_df)

In [21]:
x_train.head()

Unnamed: 0_level_0,year,kilometers,transmission_type_Automatic,transmission_type_Manual,transmission_type_Steptronic,brand_abbott,brand_autobianchi,brand_avtokuban,brand_blackhawk,brand_brennan,...,model_Q,model_R,model_S,model_T,model_U,model_V,model_W,model_X,model_Y,model_Z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
301703,31.0,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
481688,30.0,3.0,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
115324,6.0,2.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
179822,27.0,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
475530,7.0,4.0,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False


In [22]:
unlabeled_df.head()

Unnamed: 0_level_0,year,kilometers,transmission_type_Automatic,transmission_type_Manual,transmission_type_Steptronic,brand_abbott,brand_autobianchi,brand_avtokuban,brand_blackhawk,brand_brennan,...,model_Q,model_R,model_S,model_T,model_U,model_V,model_W,model_X,model_Y,model_Z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
766960,21,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766961,11,4.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766962,9,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766963,31,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766964,30,3.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# prompt: use min max scaler on x_train['year'] and then normalize the column in the train and test set

minmax = MinMaxScaler()
x_train['year'] = minmax.fit_transform(x_train['year'].values.reshape(-1, 1))
x_test['year'] = minmax.transform(x_test['year'].values.reshape(-1, 1))


In [24]:
unlabeled_df['year'] = minmax.fit_transform(unlabeled_df['year'].values.reshape(-1,1))

In [25]:
x_train.head()

Unnamed: 0_level_0,year,kilometers,transmission_type_Automatic,transmission_type_Manual,transmission_type_Steptronic,brand_abbott,brand_autobianchi,brand_avtokuban,brand_blackhawk,brand_brennan,...,model_Q,model_R,model_S,model_T,model_U,model_V,model_W,model_X,model_Y,model_Z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
301703,0.313131,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
481688,0.30303,3.0,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
115324,0.060606,2.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
179822,0.272727,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
475530,0.070707,4.0,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False


In [26]:
unlabeled_df.head()

Unnamed: 0_level_0,year,kilometers,transmission_type_Automatic,transmission_type_Manual,transmission_type_Steptronic,brand_abbott,brand_autobianchi,brand_avtokuban,brand_blackhawk,brand_brennan,...,model_Q,model_R,model_S,model_T,model_U,model_V,model_W,model_X,model_Y,model_Z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
766960,0.212121,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766961,0.111111,4.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766962,0.090909,0.0,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766963,0.313131,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
766964,0.30303,3.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


3- Fitting Supervised Learning Models

In [27]:
# Define Evaluation Metrics
def print_evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    rmsle = np.sqrt(metrics.mean_squared_log_error(true, predicted)) # Compute RMSLE
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')

def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    rmsle = np.sqrt(metrics.mean_squared_log_error(true, predicted))
    return mae, mse, rmse, r2_square

In [None]:
# 1- Random Forest Regressor

# Define RMSLE custom scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
import numpy as np
from sklearn import metrics

# Define parameters for RandomForestRegressor
rf_params = {
    'n_estimators': 150,
    'random_state': 42,
    'verbose': 5,
    'max_depth': 18,
    'min_samples_split': 20,
    'min_samples_leaf': 120
}

# Instantiate RandomForestRegressor with parameters
rf_reg = RandomForestRegressor(**rf_params)

# Fit the model to the training data
rf_reg.fit(x_train, y_train)

# Predict on the test data
y_pred = rf_reg.predict(x_test)
y_unlabeled = rf_reg.predict(unlabeled_df)

# Calculate RMSLE
rmsle = np.sqrt(metrics.mean_squared_log_error(y_test, np.clip(y_pred, 0, None)))
print(rmsle)


building tree 1 of 150
building tree 2 of 150
building tree 3 of 150
building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150


[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:  1.4min


building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150
building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150
building tree 53 of 150
building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59

[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:  5.9min


building tree 72 of 150
building tree 73 of 150
building tree 74 of 150
building tree 75 of 150
building tree 76 of 150
building tree 77 of 150
building tree 78 of 150
building tree 79 of 150
building tree 80 of 150
building tree 81 of 150
building tree 82 of 150
building tree 83 of 150
building tree 84 of 150
building tree 85 of 150
building tree 86 of 150
building tree 87 of 150
building tree 88 of 150
building tree 89 of 150
building tree 90 of 150
building tree 91 of 150
building tree 92 of 150
building tree 93 of 150
building tree 94 of 150
building tree 95 of 150
building tree 96 of 150
building tree 97 of 150
building tree 98 of 150
building tree 99 of 150
building tree 100 of 150
building tree 101 of 150
building tree 102 of 150
building tree 103 of 150
building tree 104 of 150
building tree 105 of 150
building tree 106 of 150
building tree 107 of 150
building tree 108 of 150
building tree 109 of 150
building tree 110 of 150
building tree 111 of 150
building tree 112 of 150
bui

[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  71 tasks      | elapsed:    1.2s


0.35377851453503706


In [51]:
# 2- XGBoost Regressor

xgb_reg = xgb.XGBRegressor(n_estimators=200, learning_rate=0.08, gamma=0.005, subsample=0.85, colsample_bytree=1, max_depth=18, verbose=5)
xgb_reg.fit(x_train, y_train)

# Predict on the test data
y_pred = xgb_reg.predict(x_test)

# calculate the RMSLE
rmsle = np.sqrt(metrics.mean_squared_log_error(y_test, np.clip(y_pred,0,None)))
y_unlabeled = xgb_reg.predict(unlabeled_df)
# calculate the R squared error
r2 = metrics.r2_score(y_test, y_pred)
print(r2)
print(rmsle)

Parameters: { "verbose" } are not used.



0.8281099176605594
0.31116640346896113


In [52]:
# prompt: I want to create a dataframe where the first column is the ID column of unlabeled_df, the second column is the values of y_unlabeled.

import pandas as pd
submission_df = pd.DataFrame({
    'id': unlabeled_df.index,
    'price': y_unlabeled
})


In [53]:
submission_df.to_csv('Submission.csv')

In [None]:
# 3- Grid Search Cross Validation

# Define RMSLE custom scorer
def rmsle(y_true, y_pred):
    rmsle = np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
    return rmsle

# Create scorer using make_scorer
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# Define parameter grid
param_grid = {
    'n_estimators': [150, 185, 220],
    'max_depth': [8, 10, 12],
    'learning_rate': [0.09],
    'subsample': [0.75],
    'colsample_bytree': [0.75],
}

# Perform GridSearchCV with custom scorer
grid = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    cv=5,
    scoring=rmsle_scorer,  # Use RMSLE as scoring metric
    verbose=5
)
grid.fit(x_train, y_train)

# Print best parameters
print("Best parameters set found on development set:")
print(grid.best_params_)

y_unlabeled = grid.predict(unlabeled_df)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=150, subsample=0.75;, score=-0.329 total time=  25.4s
[CV 2/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=150, subsample=0.75;, score=-0.328 total time=  21.4s
[CV 3/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=150, subsample=0.75;, score=-0.330 total time=  19.6s
[CV 4/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=150, subsample=0.75;, score=-0.329 total time=  21.1s
[CV 5/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=150, subsample=0.75;, score=-0.330 total time=  19.6s
[CV 1/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=185, subsample=0.75;, score=-0.325 total time=  23.3s
[CV 2/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=8, n_estimators=185, subsample=0.75;, score=-0.325 total t

  rmsle = np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


[CV 1/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=185, subsample=0.75;, score=-0.313 total time=  27.4s
[CV 2/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=185, subsample=0.75;, score=-0.312 total time=  27.7s
[CV 3/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=185, subsample=0.75;, score=-0.313 total time=  28.0s
[CV 4/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=185, subsample=0.75;, score=-0.313 total time=  27.5s
[CV 5/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=185, subsample=0.75;, score=-0.313 total time=  27.9s


  rmsle = np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


[CV 1/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=220, subsample=0.75;, score=-0.312 total time=  32.5s
[CV 2/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=220, subsample=0.75;, score=-0.311 total time=  30.8s
[CV 3/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=220, subsample=0.75;, score=-0.312 total time=  30.7s
[CV 4/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=220, subsample=0.75;, score=-0.311 total time=  30.7s
[CV 5/5] END colsample_bytree=0.75, learning_rate=0.09, max_depth=12, n_estimators=220, subsample=0.75;, score=-0.312 total time=  31.2s
Best parameters set found on development set:
{'colsample_bytree': 0.75, 'learning_rate': 0.09, 'max_depth': 12, 'n_estimators': 220, 'subsample': 0.75}


In [None]:
y_unlabeled = grid.predict(unlabeled_df)

In [None]:
submission_df = pd.DataFrame({
    'id': unlabeled_df.index,
    'price': y_unlabeled
})

In [None]:
submission_df.to_csv('Submission.csv')