In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import IPython
import sys
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,10)

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
train = pd.read_csv("Hackathon_2022/resources/train_preprocessed_HK_IZ.csv")
test = pd.read_csv("Hackathon_2022/resources/test_preprocessed_HK_IZ.csv")

In [4]:
print(train.shape)
print(test.shape)

(22779, 17)
(6109, 17)


In [5]:
train_X = train.drop(columns=["count", "property_id"])
train_y = train['count']
test_X = test.drop(columns=["count","property_id"])

In [6]:
print(train_X.shape)
print(train_y.shape)
print(test_X.shape)

(22779, 15)
(22779,)
(6109, 15)


# validating the trainable model

In [7]:
X = train_X
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
y = train_y

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
X_valid = sc_X.fit_transform(X_valid)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(15945, 15)
(15945,)
(6834, 15)
(6834,)


In [11]:

reg = {
"LinearRegression": LinearRegression(),
"Lasso": Lasso(), 
"Ridge": Ridge(alpha=1.0),
"ElasticNet":ElasticNet(random_state=0),
"KNeighborsRegressor":KNeighborsRegressor(n_neighbors=2),
"AdaBoostRegressor":AdaBoostRegressor(random_state=0, n_estimators=100),
"DecisionTreeRegressor": DecisionTreeRegressor(),
"ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
"RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
"GradientBoostingRegressor":GradientBoostingRegressor(),
"XGBRegressor":xgboost.XGBRegressor( n_jobs=-1)
}

In [12]:
#creating a result file

result = pd.DataFrame(columns=['propert_id','count'])

In [13]:
result.head()

Unnamed: 0,propert_id,count


In [14]:
result['propert_id'] = test['property_id']
result['count']=test['count']

In [15]:
result.head()

Unnamed: 0,propert_id,count
0,ff8081815ab805ff015ac2099fa44da7,
1,ff8081815ab805ff015ac20d0c444f41,
2,ff8081815ab805ff015ac21333385249,
3,ff8081815ab805ff015ac213a3c25287,
4,ff8081815ab805ff015ac213f35f52b4,


In [16]:
result.shape

(6109, 2)

In [17]:
result.to_csv("Hackathon_2022/resources/result_HK_IZ.csv", index= False)

In [18]:
#model evaluation

In [19]:
%%time
dic =  {"Model":[],"R2_Train":[],"RMSE_Train":[],"MAPE_Train":[]}
for name, model in reg.items():
   
  model.fit(X_train, y_train)
  y_train_pred = model.predict(X_valid)
  r2 = r2_score(y_valid, y_train_pred)
  rmse  = np.sqrt(mean_squared_error(y_valid, y_train_pred))
  mape = np.mean(np.abs((y_valid - y_train_pred) / y_valid)) * 100
  print("--------------------------------------------------------------")
  print("Model:", name)
  print("-----Training Data Evaluation-----")
  print("R2 Value: ", r2_score(y_valid, y_train_pred))
  print("RMSE: ",np.sqrt(mean_squared_error(y_valid, y_train_pred)))
  print("MAPE:" , np.mean(np.abs((y_valid - y_train_pred) / y_valid)) * 100)
  dic["Model"].append(name)
  dic["R2_Train"].append(r2)
  dic["RMSE_Train"].append(rmse)
  dic["MAPE_Train"].append(mape)
  result["count"] = abs(model.predict(test_X))
  result.to_csv(name+"_HK_IZ.csv", index = False)

--------------------------------------------------------------
Model: LinearRegression
-----Training Data Evaluation-----
R2 Value:  0.008731244030999896
RMSE:  10.042700379404787
MAPE: 209.48729371574498
--------------------------------------------------------------
Model: Lasso
-----Training Data Evaluation-----
R2 Value:  -0.00032400172275792727
RMSE:  10.088466162036156
MAPE: 212.18136726994405
--------------------------------------------------------------
Model: Ridge
-----Training Data Evaluation-----
R2 Value:  0.00873286444388588
RMSE:  10.042692171071952
MAPE: 209.48722680335257
--------------------------------------------------------------
Model: ElasticNet
-----Training Data Evaluation-----
R2 Value:  0.00290343038819052
RMSE:  10.07217836693343
MAPE: 211.6153362468198
--------------------------------------------------------------
Model: KNeighborsRegressor
-----Training Data Evaluation-----
R2 Value:  -0.5029851268174477
RMSE:  12.366082468194378
MAPE: 227.29607619421824
--

In [20]:
score_data = pd.DataFrame(dic)
score_data.sort_values("RMSE_Train", axis = 0, ascending = True)

Unnamed: 0,Model,R2_Train,RMSE_Train,MAPE_Train
9,GradientBoostingRegressor,0.033666,9.915586,200.186071
2,Ridge,0.008733,10.042692,209.487227
0,LinearRegression,0.008731,10.0427,209.487294
3,ElasticNet,0.002903,10.072178,211.615336
1,Lasso,-0.000324,10.088466,212.181367
8,RandomForestRegressor,-0.06424,10.405777,241.452625
10,XGBRegressor,-0.088762,10.524979,213.102821
7,ExtraTreesRegressor,-0.110623,10.63012,246.391864
4,KNeighborsRegressor,-0.502985,12.366082,227.296076
5,AdaBoostRegressor,-1.850008,17.028556,646.578638


In [21]:
#increasing the features with
poly = PolynomialFeatures(degree=2)

In [22]:
new_X_train=poly.fit_transform(X_train)
new_X_valid=poly.fit_transform(X_valid)
new_test_X=poly.fit_transform(test_X)

In [23]:
print(new_X_train.shape)
print(new_X_valid.shape)
print(new_test_X.shape)

(15945, 136)
(6834, 136)
(6109, 136)


In [24]:
%%time
dic_poly =  {"Model_poly":[],"R2_Train_poly":[],"RMSE_Train_poly":[],"MAPE_Train_poly":[]}
for name, model in reg.items():
   
  model.fit(new_X_train, y_train)
  y_train_pred_poly = model.predict(new_X_valid)
  r2_poly = r2_score(y_valid, y_train_pred_poly)
  rmse_poly  = np.sqrt(mean_squared_error(y_valid, y_train_pred_poly))
  mape_poly = np.mean(np.abs((y_valid - y_train_pred_poly) / y_valid)) * 100
  print("--------------------------------------------------------------")
  print("Model_poly:", name)
  print("-----Training Data Evaluation with polynomial features-----")
  print("R2 Value_poly: ", r2_score(y_valid, y_train_pred_poly))
  print("RMSE_poly: ",np.sqrt(mean_squared_error(y_valid, y_train_pred_poly)))
  print("MAPE_poly:" , np.mean(np.abs((y_valid - y_train_pred_poly) / y_valid)) * 100)
  dic_poly["Model_poly"].append(name)
  dic_poly["R2_Train_poly"].append(r2_poly)
  dic_poly["RMSE_Train_poly"].append(rmse_poly)
  dic_poly["MAPE_Train_poly"].append(mape_poly)
  result["count"] = abs(model.predict(new_test_X))
  result.to_csv( name+"_poly_HK_IZ.csv", index = False)

--------------------------------------------------------------
Model_poly: LinearRegression
-----Training Data Evaluation with polynomial features-----
R2 Value_poly:  -2611859534410251.0
RMSE_poly:  515501229.48817205
MAPE_poly: 17014118151.364183
--------------------------------------------------------------
Model_poly: Lasso
-----Training Data Evaluation with polynomial features-----
R2 Value_poly:  0.0006378854446240645
RMSE_poly:  10.08361458393763
MAPE_poly: 212.03525726462212
--------------------------------------------------------------
Model_poly: Ridge
-----Training Data Evaluation with polynomial features-----
R2 Value_poly:  -0.0016989398294469105
RMSE_poly:  10.095397043135797
MAPE_poly: 206.46778539660951
--------------------------------------------------------------
Model_poly: ElasticNet
-----Training Data Evaluation with polynomial features-----
R2 Value_poly:  0.004029450364682496
RMSE_poly:  10.066489510817943
MAPE_poly: 211.42541864548326
---------------------------

In [25]:
score_poly_data = pd.DataFrame(dic_poly)
score_poly_data.sort_values("RMSE_Train_poly", axis = 0, ascending = True)

Unnamed: 0,Model_poly,R2_Train_poly,RMSE_Train_poly,MAPE_Train_poly
3,ElasticNet,0.00402945,10.06649,211.4254
1,Lasso,0.0006378854,10.08361,212.0353
2,Ridge,-0.00169894,10.0954,206.4678
8,RandomForestRegressor,-0.11124,10.63307,260.0668
7,ExtraTreesRegressor,-0.1327968,10.73571,260.1806
9,GradientBoostingRegressor,-0.2842976,11.43109,319.0985
10,XGBRegressor,-0.5012231,12.35883,346.9438
4,KNeighborsRegressor,-0.5062121,12.37935,225.782
6,DecisionTreeRegressor,-1.619729,16.32612,290.1459
5,AdaBoostRegressor,-7.08623,28.68322,1153.988


# comparing model reults with and without the implementation of polynomial features

In [26]:
score_data.sort_values("RMSE_Train", axis = 0, ascending = True)

Unnamed: 0,Model,R2_Train,RMSE_Train,MAPE_Train
9,GradientBoostingRegressor,0.033666,9.915586,200.186071
2,Ridge,0.008733,10.042692,209.487227
0,LinearRegression,0.008731,10.0427,209.487294
3,ElasticNet,0.002903,10.072178,211.615336
1,Lasso,-0.000324,10.088466,212.181367
8,RandomForestRegressor,-0.06424,10.405777,241.452625
10,XGBRegressor,-0.088762,10.524979,213.102821
7,ExtraTreesRegressor,-0.110623,10.63012,246.391864
4,KNeighborsRegressor,-0.502985,12.366082,227.296076
5,AdaBoostRegressor,-1.850008,17.028556,646.578638


### By using polynomial features it seems that the R2 score  is deccreasing and increase in RMSE so we use the dataset without using polynmial features

### Another insight we can observe that the ensembles of decision tree model like Gradient boosting regressor and XGB regressor were performing well on the dataset. so by much tuning the model parameters we may achieve good RMSE score.

# From above reulsts we can consolidate good model for final evaluations Gradient Boosting Regressor, Ridge, Linear Regression.

# lets do Hypertuning for the model parameters and see the model performanace

# Gradient Boosting Regressor Model

In [29]:
# create regressor object 
gbr = GradientBoostingRegressor(learning_rate = 0.01)
# fit the regressor with x and y data 
gbr.fit(X_train, y_train)
pred_gbr = gbr.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_gbr))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_gbr)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_bgr) / y_valid)) * 100)

R2 Value:  0.02205935653247182
RMSE:  9.974957291986495


NameError: name 'pred_bgr' is not defined

In [None]:
# create regressor object 
gbr = GradientBoostingRegressor(learning_rate = 0.001)
# fit the regressor with x and y data 
gbr.fit(X_train, y_train)
pred_gbr = gbr.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_gbr))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_gbr)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_gbr) / y_valid)) * 100)

# we infer that there is very low differnece even after Hypertuning Gradient Boosting


# Ridge Regressor Model

In [30]:
# create regressor object 
rdge = Ridge(alpha=0.1)
# fit the regressor with x and y data 
rdge.fit(X_train, y_train)
pred_rdge = rdge.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_rdge))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_rdge)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_rdge) / y_valid)) * 100)

R2 Value:  0.008731406160949029
RMSE:  10.042699558122687
MAPE: 209.48728689680243


In [31]:
# create regressor object 
rdge = Ridge(alpha=100)
# fit the regressor with x and y data 
rdge.fit(X_train, y_train)
pred_rdge = rdge.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_rdge))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_rdge)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_rdge) / y_valid)) * 100)

R2 Value:  0.008883942094457975
RMSE:  10.041926845511263
MAPE: 209.4831357828701


In [32]:
# create regressor object 
rdge = Ridge(alpha=0.001)
# fit the regressor with x and y data 
rdge.fit(X_train, y_train)
pred_rdge = rdge.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_rdge))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_rdge)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_rdge) / y_valid)) * 100)

R2 Value:  0.008731245652397024
RMSE:  10.04270037119147
MAPE: 209.48729364755116


# Linear Regression Model

In [33]:
# create regressor object 
lr = LinearRegression(n_jobs=-1, normalize = True)
# fit the regressor with x and y data 
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_lr))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_lr)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_lr) / y_valid)) * 100)

R2 Value:  0.008731244030999896
RMSE:  10.042700379404787
MAPE: 209.48729371574495


In [34]:
# create regressor object 
lr = LinearRegression(n_jobs=-1, normalize = False)
# fit the regressor with x and y data 
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_lr))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_lr)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_lr) / y_valid)) * 100)

R2 Value:  0.008731244030999896
RMSE:  10.042700379404787
MAPE: 209.48729371574498


# There is no improvemnt LR model results

# we infer that there is very low differnece even after Hypertuning Ridge

## Even after moving to other model the RMSE doesnt differ much. so we can use some advanced model for predicting the data.

## I used VotingRegressor with base regressors as GradientBoostingRegressor and XGBRegressor as they both gave the best predictions as described in previous cells

In [35]:
#applying best hypertuned params
gbr = GradientBoostingRegressor(learning_rate=0.01)
rdge = Ridge(alpha=100)
lr = LinearRegression(n_jobs=-1, normalize = False)

In [36]:
from sklearn.ensemble import VotingRegressor
model_vr = VotingRegressor([('gbr', gbr), ('rdge', rdge), ('lr', lr)])
model_vr.fit(X_train, y_train)
pred_vr = model_vr.predict(X_valid)
print("R2 Value: ", r2_score(y_valid, pred_vr))
print("RMSE: ",np.sqrt(mean_squared_error(y_valid, pred_vr)))
print("MAPE:" , np.mean(np.abs((y_valid - pred_vr) / y_valid)) * 100)

R2 Value:  0.01615649528471652
RMSE:  10.005016480702148
MAPE: 208.65056320052116


# The above results are good than the previous model evaluation

## Using the model for predicting the test data

In [37]:
result["count"] = abs(model_vr.predict(test_X))
result.to_csv("Hackathon_2022/resources/VotingRegressor_HK_IZ_finalML.csv", index = False)