In [1]:
# pandas and numpy for data manipulation
import numpy as np
import pandas as pd

# matplotlib and seaborn for visuilization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

# No warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read  data into dataframe
file_train=r'D:\Deloitte\train.csv'
file_train_amenities=r'D:\Deloitte\amenities_train.csv'
file_test=r'D:\Deloitte\test.xlsx'
file_test_amenities=r'D:\Deloitte\amenities_test.xlsx'

ab_train=pd.read_csv(file_train)
amenities_train=pd.read_csv(file_train_amenities)
ab_test=pd.read_excel(file_test)
amenities_test=pd.read_excel(file_test_amenities)

# Returns the number of Rows and Columns in train data
print('shape of train data after preprocessing: {}'.format(ab_train.shape))
print('shape of amenities train data after preprocessing: {}'.format(amenities_train.shape))
print('shape of test data after preprocessing: {}'.format(ab_test.shape))
print('shape of amenities test data after preprocessing: {}'.format(amenities_test.shape))

shape of train data after preprocessing: (49999, 45)
shape of amenities train data after preprocessing: (49999, 130)
shape of test data after preprocessing: (24111, 44)
shape of amenities test data after preprocessing: (24111, 130)


In [5]:
# combining the amenities with other features of ab_train
train_ab=pd.concat([ab_train,amenities_train],axis=1)
print('shape of whole train data after concating: {}'.format(train_ab.shape))

test_ab=pd.concat([ab_test,amenities_test],axis=1)
print('shape of whole test data after concating: {}'.format(test_ab.shape))

shape of whole train data after concating: (49999, 175)
shape of whole test data after concating: (24111, 174)


In [6]:
#### Separating independent and dependent features

x_train=train_ab.drop(['id','log_price'],axis=1)
y_train=train_ab['log_price']

print('shape of y_train: {}'.format(x_train.shape))
print('shape of x_train: {}'.format(y_train.shape))

x_test=test_ab.drop(['id'],axis=1)

print('shape of x_test: {}'.format(x_test.shape))

shape of y_train: (49999, 173)
shape of x_train: (49999,)
shape of x_test: (24111, 173)


* I choose to build XGBoost as my final Machine Learing Model

### 1: Machine Learning: XGBoost

In [7]:
from xgboost.sklearn import XGBRegressor
xgb_model=XGBRegressor()

In [8]:
xgb_model=XGBRegressor(**{'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.01, 
                          'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 1000, 
                          'reg_alpha': 5, 'scale_pos_weight': 3, 'subsample': 0.8})
xgb_model.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.4, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=5, reg_lambda=1, scale_pos_weight=3, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
test_predict=xgb_model.predict(x_test)

In [10]:
submissions=pd.DataFrame({'id':ab_test['id'],'log_price':test_predict})
submissions.to_excel('test__ML_price.xlsx',index=False)

In [11]:
submissions.head()

Unnamed: 0,id,log_price
0,5979389,4.084634
1,13488121,4.9035
2,8121643,4.921487
3,16490010,4.403557
4,16274069,5.014755


### 2: Deep learning: Neural Network Model

In [12]:
# Deep neural network

from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten,Dropout
import tensorflow as tf

In [13]:
NN_model=Sequential()

# Input layer:
NN_model.add(Dense(128,kernel_initializer='normal',input_dim=x_train.shape[1],activation='relu'))

# Hidden layers:
NN_model.add(Dense(256,kernel_initializer='normal',activation='relu'))
NN_model.add(Dropout(rate=0.2))
NN_model.add(Dense(256,kernel_initializer='normal',activation='relu'))
NN_model.add(Dropout(rate=0.2))
NN_model.add(Dense(256,kernel_initializer='normal',activation='relu'))

# output layer:
NN_model.add(Dense(1,kernel_initializer='normal',activation='relu'))

# Compile the network
NN_model.compile(loss='mean_squared_error',optimizer='adam',metrics=['mean_squared_error'])
NN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               22272     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 2

In [14]:
# checkpoint

checkpoint_name='weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint=ModelCheckpoint(checkpoint_name,monitor='val_loss',verbose=1,save_best_only=True,
                          mode='auto')
callbacks_list=[checkpoint]

In [15]:
tf.config.run_functions_eagerly(True)

In [16]:
# Train the model
NN_model.fit(x_train,y_train.values,epochs=10,validation_split=0.2,callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.20316, saving model to weights-001--0.20316.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.20316
Epoch 3/10

Epoch 00003: val_loss improved from 0.20316 to 0.19728, saving model to weights-003--0.19728.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 0.19728 to 0.19350, saving model to weights-004--0.19350.hdf5
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.19350
Epoch 6/10

Epoch 00006: val_loss improved from 0.19350 to 0.19305, saving model to weights-006--0.19305.hdf5
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.19305
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.19305
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.19305
Epoch 10/10

Epoch 00010: val_loss improved from 0.19305 to 0.18933, saving model to weights-010--0.18933.hdf5


<tensorflow.python.keras.callbacks.History at 0x1a5c1418070>

In [17]:
test_predict_NN=NN_model.predict(x_test)

In [18]:
submission=pd.DataFrame({'id':ab_test['id'],'log_price':test_predict_NN[:,0]})
submission.to_excel('test_NN_price.xlsx',index=False)

In [19]:
submission.head()

Unnamed: 0,id,log_price
0,5979389,4.376175
1,13488121,4.9967
2,8121643,4.88193
3,16490010,4.52769
4,16274069,5.119294
