## In this competition, you are expected to create an analytical and modelling framework to predict the flat resale prices based on the quantitative and qualitative features provided in the dataset while answering other questions too cited below. You may derive new features from the existing features and from the domain knowledge, which may help in improving the model efficiency.


## Importing the Packages required

In [71]:

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import KFold




from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Setting the work directory

In [2]:
import os
os.chdir("C:\\Users\\punee\\Downloads\\Data_For_all")

In [3]:
os.getcwd()

'C:\\Users\\punee\\Downloads\\Data_For_all'

#### Reading & Understanding the Train & Test Data sets

In [4]:
Train_1 = pd.read_csv("train1.csv", sep=',', header=0, na_values=["?",",","#","unknown"])
Train_2 = pd.read_csv("train2.csv", sep=',', header=0, na_values=["?",",","#","unknown"])
Train_3 = pd.read_csv("train3.csv", sep=',', header=0, na_values=["?",",","#","unknown"])
Test_data = pd.read_csv("test.csv", sep=',', header=0, na_values=["?",",","#","unknown"])
Bldng_to_mtr_dst = pd.read_csv("building_to_mrt_distance.csv", sep=',', header=0, na_values=["?",",","#","unknown"])
df=pd.read_csv("test.csv", sep=',', header=0, na_values=["?",",","#","unknown"])

#### Checking the shapes of train & test data 

In [5]:
print(Train_1.shape)
print(Train_2.shape)
print(Train_3.shape)
print(Test_data.shape)
print(Bldng_to_mtr_dst.shape)

(52203, 11)
(37153, 12)
(32578, 12)
(21846, 11)
(8811, 120)


### Checking the data types

In [6]:
Train_1.dtypes

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price           float64
uniqueID                 int64
dtype: object

#### From the train csv files, it is observed that remaining lease column is missing, so we have to create this column and impute the values by using some formula. So, new columns i.e year only and lease ending date are used for creation of missing column in Train 1 

In [7]:
Train_1['year only'] = Train_1['month'].str.split('-').str[0]
Train_1['year only']

0        2012
1        2012
2        2012
3        2012
4        2012
5        2012
6        2012
7        2012
8        2012
9        2012
10       2012
11       2012
12       2012
13       2012
14       2012
15       2012
16       2012
17       2012
18       2012
19       2012
20       2012
21       2012
22       2012
23       2012
24       2012
25       2012
26       2012
27       2012
28       2012
29       2012
         ... 
52173    2014
52174    2014
52175    2014
52176    2014
52177    2014
52178    2014
52179    2014
52180    2014
52181    2014
52182    2014
52183    2014
52184    2014
52185    2014
52186    2014
52187    2014
52188    2014
52189    2014
52190    2014
52191    2014
52192    2014
52193    2014
52194    2014
52195    2014
52196    2014
52197    2014
52198    2014
52199    2014
52200    2014
52201    2014
52202    2014
Name: year only, Length: 52203, dtype: object

In [8]:
Train_1['year only'] = Train_1['year only'].astype('int64')

In [9]:
Train_1['lease_ending_year'] = Train_1['lease_commence_date']+ 99
Train_1['lease_ending_year']

0        2085
1        2079
2        2079
3        2083
4        2079
5        2080
6        2077
7        2078
8        2078
9        2084
10       2080
11       2079
12       2079
13       2078
14       2080
15       2077
16       2079
17       2076
18       2080
19       2077
20       2080
21       2080
22       2077
23       2079
24       2079
25       2075
26       2078
27       2080
28       2076
29       2079
         ... 
52173    2087
52174    2087
52175    2084
52176    2085
52177    2084
52178    2085
52179    2086
52180    2084
52181    2086
52182    2087
52183    2086
52184    2086
52185    2086
52186    2086
52187    2083
52188    2085
52189    2092
52190    2100
52191    2086
52192    2099
52193    2084
52194    2086
52195    2086
52196    2085
52197    2086
52198    2087
52199    2087
52200    2091
52201    2084
52202    2084
Name: lease_ending_year, Length: 52203, dtype: int64

In [10]:
Train_1.dtypes

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price           float64
uniqueID                 int64
year only                int64
lease_ending_year        int64
dtype: object

In [11]:
Train_1['remaining_lease'] = Train_1['lease_ending_year'] - Train_1['year only'] 

In [12]:
Train_1['remaining_lease']

0        73
1        67
2        67
3        71
4        67
5        68
6        65
7        66
8        66
9        72
10       68
11       67
12       67
13       66
14       68
15       65
16       67
17       64
18       68
19       65
20       68
21       68
22       65
23       67
24       67
25       63
26       66
27       68
28       64
29       67
         ..
52173    73
52174    73
52175    70
52176    71
52177    70
52178    71
52179    72
52180    70
52181    72
52182    73
52183    72
52184    72
52185    72
52186    72
52187    69
52188    71
52189    78
52190    86
52191    72
52192    85
52193    70
52194    72
52195    72
52196    71
52197    72
52198    73
52199    73
52200    77
52201    70
52202    70
Name: remaining_lease, Length: 52203, dtype: int64

### Now, we are dropping the unnecessary columns in the Train 1 file

In [13]:
Train_1.drop(['year only','lease_ending_year'], axis = 1, inplace= True)

In [14]:
Train_1.dtypes

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price           float64
uniqueID                 int64
remaining_lease          int64
dtype: object

In [15]:
Train_3['remaining_lease'] = Train_3['remaining_lease'].str.split(' ').str[0]

In [16]:
Train_2.dtypes


month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
remaining_lease          int64
resale_price           float64
uniqueID                 int64
dtype: object

In [17]:
Train_3.dtypes

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
remaining_lease         object
resale_price           float64
uniqueID                 int64
dtype: object

In [18]:
Train_3['remaining_lease'] = Train_3['remaining_lease'].astype('int64')

### Merging the Train1, Train2 & Train3 to Train data

In [19]:
Train_data = pd.concat([Train_1,Train_2,Train_3],axis=0)

In [20]:
Train_data.head()

Unnamed: 0,block,flat_model,flat_type,floor_area_sqm,lease_commence_date,month,remaining_lease,resale_price,storey_range,street_name,town,uniqueID
0,172,Improved,2 ROOM,45.0,1986,2012-03,73,250000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO,100000
1,510,Improved,2 ROOM,44.0,1980,2012-03,67,265000.0,01 TO 05,ANG MO KIO AVE 8,ANG MO KIO,100001
2,610,New Generation,3 ROOM,68.0,1980,2012-03,67,315000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO,100002
3,474,New Generation,3 ROOM,67.0,1984,2012-03,71,320000.0,01 TO 05,ANG MO KIO AVE 10,ANG MO KIO,100003
4,604,New Generation,3 ROOM,67.0,1980,2012-03,67,321000.0,06 TO 10,ANG MO KIO AVE 5,ANG MO KIO,100004


In [21]:
Test_data['remaining_lease'] = Test_data['remaining_lease'].str.split(' ').str[0]

In [22]:
Test_data.dtypes

month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
remaining_lease         object
uniqueID                 int64
dtype: object

In [23]:
Train_data.isnull().sum()

block                  0
flat_model             0
flat_type              0
floor_area_sqm         0
lease_commence_date    0
month                  0
remaining_lease        0
resale_price           0
storey_range           0
street_name            0
town                   0
uniqueID               0
dtype: int64

### Type casting

In [24]:
for col in ('month', 'town', 'flat_type', 'block','street_name','storey_range','flat_model','remaining_lease'):
    Train_data[col] = Train_data[col].astype('category')

#### Checking the unique values in uniqueID column

In [25]:
Train_data.uniqueID.value_counts()

100303    1
167291    1
142703    1
185712    1
187761    1
181618    1
183667    1
193908    1
195957    1
189814    1
191863    1
169336    1
171385    1
165242    1
177532    1
146797    1
179581    1
173438    1
175487    1
120208    1
122257    1
116114    1
118163    1
128404    1
130453    1
124310    1
126359    1
103832    1
140654    1
144748    1
         ..
109280    1
217791    1
219838    1
213693    1
139921    1
146066    1
144019    1
133780    1
131733    1
137878    1
135831    1
158360    1
156313    1
162458    1
160411    1
150172    1
148125    1
154270    1
152223    1
207536    1
205489    1
211634    1
209587    1
199348    1
197301    1
203446    1
201399    1
221881    1
215740    1
100401    1
Name: uniqueID, Length: 121934, dtype: int64

#### Checking the unique values in block column

In [26]:
Train_data.block.value_counts()

2       501
1       436
8       393
108     392
107     388
101     383
113     377
110     366
4       364
109     362
22      351
114     348
112     338
3       336
115     330
5       328
105     324
111     324
7       318
6       312
130     310
116     308
9       307
13      304
102     303
12      298
121     298
122     298
106     294
17      294
       ... 
230F      3
150A      3
863A      3
380       3
10F       3
92A       3
857B      3
43A       3
223B      3
645A      2
382       2
796A      2
886D      2
178A      2
191B      2
42A       2
226E      2
858B      2
860A      2
230E      2
253A      2
905       1
863B      1
886B      1
10C       1
856E      1
640A      1
430A      1
430B      1
9B        1
Name: block, Length: 2216, dtype: int64

In [27]:
Train_data.columns

Index(['block', 'flat_model', 'flat_type', 'floor_area_sqm',
       'lease_commence_date', 'month', 'remaining_lease', 'resale_price',
       'storey_range', 'street_name', 'town', 'uniqueID'],
      dtype='object')

## Dropping the uniqueID column from both the train & test since there are all unique values

In [28]:
Train_data.drop("uniqueID", axis = 1, inplace= True)

In [29]:
Test_data.drop('uniqueID', axis = 1, inplace= True)

In [30]:
Train_data.head()

Unnamed: 0,block,flat_model,flat_type,floor_area_sqm,lease_commence_date,month,remaining_lease,resale_price,storey_range,street_name,town
0,172,Improved,2 ROOM,45.0,1986,2012-03,73,250000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO
1,510,Improved,2 ROOM,44.0,1980,2012-03,67,265000.0,01 TO 05,ANG MO KIO AVE 8,ANG MO KIO
2,610,New Generation,3 ROOM,68.0,1980,2012-03,67,315000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO
3,474,New Generation,3 ROOM,67.0,1984,2012-03,71,320000.0,01 TO 05,ANG MO KIO AVE 10,ANG MO KIO
4,604,New Generation,3 ROOM,67.0,1980,2012-03,67,321000.0,06 TO 10,ANG MO KIO AVE 5,ANG MO KIO


In [31]:
Test_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease
0,2018-08,ANG MO KIO,2 ROOM,323,ANG MO KIO AVE 3,01 TO 03,44.0,Improved,1977,57
1,2018-08,ANG MO KIO,3 ROOM,220,ANG MO KIO AVE 1,07 TO 09,82.0,New Generation,1977,57
2,2018-08,ANG MO KIO,3 ROOM,220,ANG MO KIO AVE 1,07 TO 09,67.0,New Generation,1977,57
3,2018-08,ANG MO KIO,3 ROOM,445,ANG MO KIO AVE 10,01 TO 03,67.0,New Generation,1979,60
4,2018-08,ANG MO KIO,3 ROOM,471,ANG MO KIO AVE 10,01 TO 03,67.0,New Generation,1979,59


In [32]:
Train_data.describe(include='all')

Unnamed: 0,block,flat_model,flat_type,floor_area_sqm,lease_commence_date,month,remaining_lease,resale_price,storey_range,street_name,town
count,121934.0,121934,121934,121934.0,121934.0,121934,121934.0,121934.0,121934,121934,121934
unique,2216.0,21,7,,,77,52.0,,25,527,26
top,2.0,Model A,4 ROOM,,,2018-07,70.0,,04 TO 06,YISHUN RING RD,JURONG WEST
freq,501.0,35539,48744,,,2539,5023.0,,27861,2085,9807
mean,,,,96.998518,1990.887538,,,449488.6,,,
std,,,,24.559239,10.96379,,,135723.0,,,
min,,,,31.0,1966.0,,,170000.0,,,
25%,,,,74.0,1984.0,,,350000.0,,,
50%,,,,96.0,1989.0,,,423000.0,,,
75%,,,,112.0,2000.0,,,515000.0,,,


In [33]:
Test_data.describe(include='all')

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease
count,21846,21846,21846,21846.0,21846,21846,21846.0,21846,21846.0,21846.0
unique,12,26,7,2193.0,532,17,,19,,51.0
top,2019-07,WOODLANDS,4 ROOM,2.0,YISHUN RING RD,04 TO 06,,Model A,,65.0
freq,2123,1802,9106,83.0,315,5160,,7370,,1423.0
mean,,,,,,,97.425744,,1993.758354,
std,,,,,,,24.250775,,12.643644,
min,,,,,,,31.0,,1966.0,
25%,,,,,,,81.0,,1984.0,
50%,,,,,,,95.0,,1993.0,
75%,,,,,,,112.0,,2003.0,


In [34]:
Test_data.block.value_counts()

2       83
8       80
13      70
1       69
9       66
109     66
12      66
101     65
23      62
121     61
102     61
115     61
20      60
113     60
114     60
3       59
110     57
112     56
107     56
105     56
103     55
7       54
21      51
6       51
130     50
4       49
34      48
111     48
117     48
420     48
        ..
688E     1
123D     1
641B     1
287D     1
325B     1
972      1
261A     1
267C     1
689E     1
654C     1
338B     1
119A     1
311D     1
3B       1
515D     1
642A     1
285A     1
352C     1
226B     1
893      1
317D     1
654A     1
664D     1
225C     1
150A     1
353C     1
23A      1
487B     1
788E     1
493E     1
Name: block, Length: 2193, dtype: int64

### Separating the numerical & categorical columns for pipeline

In [35]:
cat_attr_train = list(Train_data.select_dtypes("category").columns)
num_attr_train = list(Train_data.columns.difference(cat_attr_train))

num_attr_train.remove('resale_price')

print(cat_attr_train,num_attr_train)

['block', 'flat_model', 'flat_type', 'month', 'remaining_lease', 'storey_range', 'street_name', 'town'] ['floor_area_sqm', 'lease_commence_date']


## DATA Visualisation

# Data Visualisation is explained on different file.

### Creating a Pipeline

In [36]:
train_num_attr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

train_cat_attr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', train_num_attr, num_attr_train),
        ('cat', train_cat_attr, cat_attr_train)])

In [37]:
y = Train_data["resale_price"]
x = Train_data.drop(["resale_price"], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 123)

In [38]:
X_train.shape

(85353, 10)

In [39]:
X_test.shape

(36581, 10)

## Buliding a Linear regression model

In [40]:
lin_reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression())])

In [41]:
lin_reg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [42]:
train_pred = lin_reg.predict(X_train)
test_pred = lin_reg.predict(X_test)

print(lin_reg.score(X_train, y_train))
print(lin_reg.score(X_test, y_test))

0.940785002885975
0.9360276702663738


In [43]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
test_set_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
train_set_rmse = np.sqrt(mean_squared_error(y_train, train_pred))

test_set_r2 = r2_score(y_test, test_pred)
train_set_r2 = r2_score(y_train, train_pred)

### Printing the rmse values

In [44]:
print(test_set_rmse)
print(test_set_r2)
print(train_set_rmse)
print(train_set_r2)

34265.64079382842
0.9360276702663737
33052.49535051759
0.940785002885975


In [45]:
Test_pred = lin_reg.predict(Test_data)

### creating a csv file for submission

In [46]:
DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred)),columns=['uniqueID','resale_price'])
DF.to_csv("LGTESTPREDICTIONS.csv",index=False)
DF.shape

(21846, 2)

## Building a Random Forest Model with gridcv

In [47]:
RF_Reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('rfr', RandomForestRegressor())])

In [48]:
RF_param_grid = {"rfr__n_estimators" : [10, 15],
              "rfr__max_depth" : [5,8],
              "rfr__max_features" : [3, 5],
              "rfr__min_samples_leaf" : [4, 6]}

rf_grid = GridSearchCV(RF_Reg, param_grid=RF_param_grid, cv=5)

In [49]:
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [50]:
train_pred_rf = rf_grid.predict(X_train)
test_pred_rf = rf_grid.predict(X_test)

In [51]:
test_set_rmse1 = np.sqrt(mean_squared_error(y_test, test_pred_rf))
train_set_rmse1 = np.sqrt(mean_squared_error(y_train, train_pred_rf))

In [52]:
print(test_set_rmse1)
print(train_set_rmse1)

131685.22434888783
131958.91980701868


In [53]:
Test_pred_rf = rf_grid.predict(Test_data)

In [54]:
DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred_rf)),columns=['uniqueID','resale_price'])
DF.to_csv("RFTESTPREDICTIONS.csv",index=False)
DF.shape

(21846, 2)

## Building a Decision Tree model

In [55]:
DT_Reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('dt', DecisionTreeRegressor())])

In [56]:
DT_Reg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [57]:
train_pred_DT = DT_Reg.predict(X_train)
test_pred_DT = DT_Reg.predict(X_test)

In [58]:
test_set_rmse2 = np.sqrt(mean_squared_error(y_test, test_pred_DT))
train_set_rmse2 = np.sqrt(mean_squared_error(y_train, train_pred_DT))

In [59]:
print(test_set_rmse2)
print(train_set_rmse2)

34110.12832088329
1552.2565974785825


In [60]:
Test_pred_DT = DT_Reg.predict(Test_data)

In [61]:
DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred_DT)),columns=['uniqueID','resale_price'])
DF.to_csv("DTTESTPREDICTIONS.csv",index=False)
DF.shape

(21846, 2)

## Building a Ridge Regression model

In [62]:
from sklearn import linear_model


In [63]:
Ridge_Reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('ridge', linear_model.Ridge(alpha=.5))])

In [64]:
Ridge_Reg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [65]:
train_pred_rid = Ridge_Reg.predict(X_train)
test_pred_rid = Ridge_Reg.predict(X_test)

In [66]:
test_set_rmse3 = np.sqrt(mean_squared_error(y_test, test_pred_rid))
train_set_rmse3 = np.sqrt(mean_squared_error(y_train, train_pred_rid))

In [67]:
print(test_set_rmse3)
print(train_set_rmse3)

34272.94857080892
33096.57275150851


In [68]:
Test_pred_ridge = Ridge_Reg.predict(Test_data)

In [69]:
DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred_ridge)),columns=['uniqueID','resale_price'])
DF.to_csv("RidgeTESTPREDICTIONS.csv",index=False)
DF.shape

(21846, 2)

## GBoost

In [73]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)


In [75]:
GBoost_reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('GBoost', GradientBoostingRegressor())])

In [76]:
GBoost_reg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [77]:
train_pred_GB = GBoost_reg.predict(X_train)
test_pred_GB = GBoost_reg.predict(X_test)

In [78]:
test_set_rmse4 = np.sqrt(mean_squared_error(y_test, test_pred_GB))
train_set_rmse4 = np.sqrt(mean_squared_error(y_train, train_pred_GB))

In [79]:
print(test_set_rmse4)
print(train_set_rmse4)

55267.123142992285
54874.19726395386


In [80]:
Test_pred_GB = GBoost_reg.predict(Test_data)

In [81]:
DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred_GB)),columns=['uniqueID','resale_price'])
DF.to_csv("GradientBoostingRegressor.csv",index=False)
DF.shape

(21846, 2)

## XGBoost


In [82]:
xgboost = xgb.XGBRegressor(colsample_bytree=0.7, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.7, silent=1,
                             random_state = 7, eta=0.1)

In [1]:


#xgboost_reg = Pipeline(steps=[('preprocessor', preprocessor),
#                      ('xgb', xgb() )])

In [None]:
#xgboost_reg.fit(X_train, y_train)

In [None]:
#train_pred_XG = xgboost_reg.predict(X_train)
#test_pred_XG = xgboost_reg.predict(X_test)

In [None]:
#test_set_rmse5 = np.sqrt(mean_squared_error(y_test, test_pred_XG))
#train_set_rmse5 = np.sqrt(mean_squared_error(y_train, train_pred_XG))

In [None]:
#print(test_set_rmse5)
#print(train_set_rmse5)

In [None]:
#Test_pred_XG = xgboost_reg.predict(Test_data)

In [None]:
#DF = pd.DataFrame(list(zip(df['uniqueID'],Test_pred_XG)),columns=['uniqueID','resale_price'])
#DF.to_csv("xgb.csv",index=False)
#DF.shape