# Hybrid Regression

In [1]:
!pip install plotly



## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
dataset = pd.read_csv('train.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(X[:,:36])
X[:,:36] = imputer.transform(X[:,:36])

imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer.fit(X[:,37:])
X[:,37:] = imputer.transform(X[:,37:])

In [5]:
y = y.reshape(len(y),1)

### Encoding the Independent Variable

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
for i in range(37,80):
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
    X = np.array(ct.fit_transform(X))

In [7]:
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=9)
pls.fit(X,y)

PLSRegression(copy=True, max_iter=500, n_components=9, scale=True, tol=1e-06)

## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)
xtr=X_train[:,252]
xte=X_test[:,252]
ytr=y_train[:,0]
yte=y_test[:,0]

In [9]:
X_train = pls.transform(X_train)
X_test = pls.transform(X_test)

## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)
print(X_train.shape)

(973, 9)


## Training the Hybrid Regression model on the Training set

In [11]:
from xgboost import XGBRegressor
regressor1 = XGBRegressor(n_estimators=115)
regressor1.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=115,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [12]:
from sklearn.svm import SVR
regressor2 = SVR(kernel = 'linear')
regressor2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Predicting the Test set results

In [13]:
y_pred1 = sc_y.inverse_transform(regressor1.predict(sc_X.transform(X_test)))
np.set_printoptions(precision=2)
y_pred2 = sc_y.inverse_transform(regressor2.predict(sc_X.transform(X_test)))
np.set_printoptions(precision=2)
y_pred=((y_pred1)*10+(y_pred2)*7.5)/17.5
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[230604.28 200624.  ]
 [134327.8  133000.  ]
 [115261.12 110000.  ]
 [208550.37 192000.  ]
 [105085.05  88000.  ]
 [ 83835.13  85000.  ]
 [251618.16 282922.  ]
 [127238.65 141000.  ]
 [686000.8  745000.  ]
 [155422.17 148800.  ]
 [200771.04 208900.  ]
 [136800.03 136905.  ]
 [232226.45 225000.  ]
 [120930.56 123000.  ]
 [112639.13 119200.  ]
 [135501.05 145000.  ]
 [260725.   190000.  ]
 [129202.77 123600.  ]
 [144171.52 149350.  ]
 [178024.13 155000.  ]
 [142672.93 166000.  ]
 [172088.77 144500.  ]
 [120725.99 110000.  ]
 [158416.21 174000.  ]
 [178039.64 185000.  ]
 [176012.53 168000.  ]
 [164061.54 177500.  ]
 [ 77350.92  84500.  ]
 [323631.96 320000.  ]
 [116866.2  118500.  ]
 [152014.71 110000.  ]
 [204316.22 213000.  ]
 [161928.28 156000.  ]
 [288452.97 250000.  ]
 [348213.96 372500.  ]
 [208602.34 175000.  ]
 [294060.47 277500.  ]
 [123821.95 112500.  ]
 [221376.98 263000.  ]
 [346220.29 325000.  ]
 [198192.27 243000.  ]
 [120481.36 130000.  ]
 [201968.77 164990.  ]
 [290730.14

## Visualising the Training set results

In [14]:
dt3={'id':xtr,'price':ytr}
dt3=pd.DataFrame(dt3)
yp2=(sc_y.inverse_transform(regressor1.predict(X_train))*10+sc_y.inverse_transform(regressor2.predict(X_train))*7.5)/17.5
dt4={'id':xtr,'price':yp2}
dt4=pd.DataFrame(dt4)
dt3=dt3.sort_values(by=['id'])
dt4=dt4.sort_values(by=['id'])

In [19]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=dt3.id,y=dt3.price,mode='markers',name='Original Points'))
fig.add_trace(go.Scatter(x=dt4.id, y=dt4.price, mode='lines',name='Traning Model'))
fig.update_layout(
    title={
        'text': "Property Price Prediction",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Property ID",
    yaxis_title="Price",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

## Visualising the Test set results

In [16]:
dt1={'id':xte,'price':yte}
dt1=pd.DataFrame(dt1)
yp1=y_pred
dt2={'id':xte,'price':yp1}
dt2=pd.DataFrame(dt2)
dt1=dt1.sort_values(by=['id'])
dt2=dt2.sort_values(by=['id'])

In [20]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=dt1.id,y=dt1.price,mode='markers',name='Original Points'))
fig.add_trace(go.Scatter(x=dt2.id, y=dt2.price, mode='lines',name='Testing'))
fig.update_layout(
    title={
        'text': "Property Price Prediction",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Property ID",
    yaxis_title="Price",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

In [18]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
print(r2_score(y_test, y_pred))
print(1-(1-r2_score(y_test, y_pred))*((len(X_test)-1)/(len(X_test)-len(X_test[0])-1)))
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
np.mean((y_pred-yte))

0.9372405566329644
0.9360564161920769
13470.691385392302
404077646.81038624


303.13369270635275