In [6]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from uszipcode import SearchEngine, SimpleZipcode
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBRegressor

# Read the dataset 

In [7]:
df=pd.read_csv('kc_house_data.csv')
df = df.drop(df[df['id'] == 1225069038].index)


# change the date format

In [8]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df['year']= df['date'].dt.year
df['month']= df['date'].dt.month
df['day']= df['date'].dt.day
df['Date'] = df[['day', 'month', 'year']].apply(lambda x: '/'.join(x.astype(str)), axis=1)

# Drop houses with zero bedrooms or bathrooms

In [9]:
df=df.drop(df[df['bathrooms']==0].index, axis=0)
df=df.drop(df[df['bedrooms']==0].index, axis=0)


# Dropping duplicates 

In [10]:
df=df.drop_duplicates(subset=["id","price"])


# Rounding number of bathrooms

In [11]:
def round_bathrooms(df):
     # Round the 'bathrooms' column and convert to integers
    df['bathrooms'] = df['bathrooms'].round().astype(int)
    return df

In [12]:
df = round_bathrooms(df)


#  Changing the zipcode for city name

In [13]:
search = SearchEngine()
def zco(x):
    city = search.by_zipcode(x).major_city
    return city if city else 'None'

In [14]:
df['city'] = df['zipcode'].apply(zco)

 # creating a new column named house age from yr_built feature

In [15]:
df['House_Age']=df['year']-df['yr_built']

#  Adding new feature




In [16]:
df['living_to_lot_ratio']=df['sqft_living']/df['sqft_lot']

# Dropping non-relevant columns

In [17]:
df=df.drop(columns=['sqft_living15', 'sqft_lot15','zipcode','date','day','month','year','yr_built'])

#  removing price outliers

In [18]:
#using upper limit and lower limit to remove price outliers

def iqr_method(df):
    perc_75 = np.percentile(df, 75)
    perc_25 = np.percentile(df, 25)
    iqr_range = perc_75 - perc_25
    iqr_upper = perc_75 + 1.5 * iqr_range
    iqr_lower = perc_25 - 1.5 * iqr_range
    return(iqr_lower,iqr_upper)

In [19]:
lower,upper=iqr_method(df['price'])
print('upper limit for price   = ', upper )
print('lower limit for price   = ', lower)
df=df[(df['price']>lower)&(df['price']<upper)]

upper limit for price   =  1129500.0
lower limit for price   =  -162500.0


# Removing outliers for bedrooms , bathrooms and sqft_living

In [20]:
#using z-score to remove the outliers

def remove_outliers(df, threshold=3):
    cols = ['bedrooms', 'bathrooms', 'sqft_living']
    df_clean = df.copy()  
            # initialize a new dataframe to avoid modifying the original
    for col in cols:
        zscore = (df_clean[col] - df_clean[col].mean()) / df_clean[col].std()
        df_clean = df_clean[abs(zscore) <= threshold]
    return df_clean

In [21]:
df=remove_outliers(df)

In [22]:
df.to_csv('house_price_Group1.csv')

In [23]:
df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_renovated,lat,long,Date,city,House_Age,living_to_lot_ratio
0,7129300520,221900.0,3,1,1180,5650,1.0,0,0,3,7,1180,0,0,47.5112,-122.257,13/10/2014,Seattle,59,0.208850
1,6414100192,538000.0,3,2,2570,7242,2.0,0,0,3,7,2170,400,1991,47.7210,-122.319,9/12/2014,Seattle,63,0.354874
2,5631500400,180000.0,2,1,770,10000,1.0,0,0,3,6,770,0,0,47.7379,-122.233,25/2/2015,Kenmore,82,0.077000
3,2487200875,604000.0,4,3,1960,5000,1.0,0,0,5,7,1050,910,0,47.5208,-122.393,9/12/2014,Seattle,49,0.392000
4,1954400510,510000.0,3,2,1680,8080,1.0,0,0,3,8,1680,0,0,47.6168,-122.045,18/2/2015,Sammamish,28,0.207921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,360000.0,3,2,1530,1131,3.0,0,0,3,8,1530,0,0,47.6993,-122.346,21/5/2014,Seattle,5,1.352785
21609,6600060120,400000.0,4,2,2310,5813,2.0,0,0,3,8,2310,0,0,47.5107,-122.362,23/2/2015,Seattle,1,0.397385
21610,1523300141,402101.0,2,1,1020,1350,2.0,0,0,3,7,1020,0,0,47.5944,-122.299,23/6/2014,Seattle,5,0.755556
21611,291310100,400000.0,3,2,1600,2388,2.0,0,0,3,8,1600,0,0,47.5345,-122.069,16/1/2015,Issaquah,11,0.670017


# Splitting the dataset for training and testing data

In [24]:
X = df.drop(['price','Date','city','id'], axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature selection:

In [25]:
# Selection of best features based on correlations :
corr = df.corr()
corr_target = abs(corr["price"])

# select features with correlation coefficient > 0.1
best_features_corr= corr_target[corr_target > 0.1].index.tolist()
print('The best features based on correlations: ' , best_features_corr[1:] )
print('The number of features was selected based on correlations is : ' ,len( best_features_corr[1:]) )

The best features based on correlations:  ['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'living_to_lot_ratio']
The number of features was selected based on correlations is :  10


In [26]:
# Selection of best features Based on RandomForestRegressor :
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

# select the best features
best_features_rfr = X_train.columns[rfr.feature_importances_ > 0.01].tolist()
print('The best feature based on RFR model :',best_features_rfr)
print('The number of features was selected based on RandomForestRegressor is: ' ,len(best_features_rfr ) )

The best feature based on RFR model : ['sqft_living', 'sqft_lot', 'view', 'grade', 'sqft_above', 'lat', 'long', 'House_Age', 'living_to_lot_ratio']
The number of features was selected based on RandomForestRegressor is:  9


In [27]:
#now we want to build a model to compare these two feature lists:
#LGBM Regressor:
def lgbm(X_train,X_test,y_train,y_test):
    lgbm = LGBMRegressor()
    lgbm_model = lgbm.fit(X_train, y_train)
    y_pred_lgbm = lgbm_model.predict(X_test, num_iteration = lgbm_model.best_iteration_)
    r2=r2_score(y_test,y_pred_lgbm)
    rmse= mean_squared_error(y_test, y_pred_lgbm, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))

    return (r2,rmse)

In [28]:
lgbm(X_train,X_test,y_train,y_test)
#r2 score from the original dataset to see how it will affected when reduce features

R^2: 0.875
RMSE: 72715.7


(0.8746963619849638, 72715.70171183874)

In [29]:
#split the data but only using the selected features based on correlations
XX=df[['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'view', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'living_to_lot_ratio']]
yy=df['price']
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.3, random_state=42)
lgbm(XX_train, XX_test, yy_train, yy_test)

R^2: 0.82
RMSE: 87129.0


(0.8200993514575892, 87128.99884111501)

In [30]:
#split the data but only using the selected features based on RFR
XX1=df[['sqft_living', 'sqft_lot', 'view', 'grade', 'sqft_above', 'lat', 'long', 'House_Age', 'living_to_lot_ratio']]
yy1=df['price']
XX1_train, XX1_test, yy1_train, yy1_test = train_test_split(XX1, yy1, test_size=0.3, random_state=42)
lgbm(XX1_train, XX1_test, yy1_train, yy1_test)

R^2: 0.868
RMSE: 74610.73


(0.8680802268825689, 74610.73308467474)

In [31]:
# if we try to add more features and compare the score:
XX2=df[['sqft_living', 'sqft_lot', 'view', 'grade', 'sqft_above', 'lat', 'long', 'House_Age', 'living_to_lot_ratio','bedrooms', 'bathrooms']]
yy2=df['price']
XX2_train, XX2_test, yy2_train, yy2_test = train_test_split(XX2, yy2, test_size=0.3, random_state=42)
lgbm(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.869
RMSE: 74377.72


(0.8689029077240731, 74377.72473848712)

as we can see that the feature set (XX1) and (XX2 after adding some features)from the RFR feature selection model gave us a higher score so we chose it to build our model 

# Models to be tested:

# 1-linear regression:

In [32]:
# Define a LinearRegression Model without regularisation(using LinearRegression())
def linearregression(X_train, X_test, y_train, y_test):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    r2 = r2_score(y_test, y_pred_lr)
    rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
    print('R^2:', r2.round(3))
    print('RMSE:', rmse.round(2))
    return (r2,rmse)

In [33]:
r01 , R01=  linearregression(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.696
RMSE: 113189.45


#  2-LinearRegression Model with L1 regularisation(using Lasso()):

In [34]:
def lasso(X_train, X_test, y_train, y_test):
    lasso = Lasso()
    lasso.fit(X_train,y_train)
    y_pred_lasso=lasso.predict(X_test)
    r2=r2_score(y_test, y_pred_lasso)
    rmse= mean_squared_error(y_test, y_pred_lasso, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))
    
    return (r2,rmse)


In [35]:
r02,R02= lasso(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.696
RMSE: 113189.58


# 3-DecisionTreeRegressor

In [36]:
def dtr(X_train, X_test, y_train, y_test):
    dtr = DecisionTreeRegressor()
    dtr.fit(X_train,y_train)
    y_pred_dtr=dtr.predict(X_test)
    r2=r2_score(y_test, y_pred_dtr)
    rmse= mean_squared_error(y_test, y_pred_dtr, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))
    
    return (r2,rmse)

In [37]:
r03 , R03= dtr(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.716
RMSE: 109424.64


# 4-RandomForestRegressor

In [38]:
def rfr(X_train, X_test, y_train, y_test):
    rfr=RandomForestRegressor()
    rfr.fit(X_train,y_train)
    y_pred_rfr=rfr.predict(X_test)
    r2=r2_score(y_test, y_pred_rfr)
    rmse= mean_squared_error(y_test, y_pred_rfr, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))
    return (r2,rmse)

In [39]:
r04 , R04= rfr(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.857
RMSE: 77694.56


# 5-GradientBoostingRegressor

In [40]:
def GBR(X_train, X_test, y_train, y_test):
    gbr = GradientBoostingRegressor(max_depth=7,random_state=42)
    gbr.fit(X_train, y_train)
    y_pred_gbr= gbr.predict(X_test)
    r2=r2_score(y_test, y_pred_gbr)
    rmse= mean_squared_error(y_test, y_pred_gbr, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))
    return (r2,rmse)

In [41]:
r05 , R05 = GBR(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.866
RMSE: 75163.22


# 6- XGB Regressor 

In [42]:
import xgboost as xgb
from xgboost import XGBRegressor
def xgbb(X_train, X_test, y_train, y_test):
    DM_train = xgb.DMatrix(data = X_train, label = y_train)
    DM_test = xgb.DMatrix(data = X_test, label = y_test)
    xgb_model = XGBRegressor(max_depth=4,random_state=42).fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    r2=r2_score(y_test, y_pred_xgb)
    rmse= mean_squared_error(y_test, y_pred_xgb, squared = False) 
    print('R^2:', r2.round(3) )
    print('RMSE:',rmse.round(2))
    return (r2,rmse)

In [43]:
r06 , R06= xgbb(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.859
RMSE: 77018.1


# 7-LGBM Regressor 

In [44]:
#only call the function lgbm() because its already defined before
r07 , R07 =lgbm(XX2_train, XX2_test, yy2_train, yy2_test)

R^2: 0.869
RMSE: 74377.72


In [50]:
data = {
    'Model': ['Linear Regression', 'Lasso', 'DecisionTreeRegressor', 'RandomForestRegressor', 'GradientBoostingRegressor', 
              'XGB Regressor', 'LGBM Regressor'],
    'R2 Score': [r01.round(3), r02.round(3), r03.round(3), r04.round(3),r05.round(3), r06.round(3), r07.round(3)],
    'RMSE': [R01.round(2), R02.round(2), R03.round(2), R04.round(2), R05.round(2), R06.round(2), R07.round(2)]
}

# Create a DataFrame from the data dictionary
comp = pd.DataFrame(data)

# Set the index to start from 1
comp.index = comp.index + 1

# Print the DataFrame
comp

Unnamed: 0,Model,R2 Score,RMSE
1,Linear Regression,0.696,113189.45
2,Lasso,0.696,113189.58
3,DecisionTreeRegressor,0.716,109424.64
4,RandomForestRegressor,0.857,77694.56
5,GradientBoostingRegressor,0.866,75163.22
6,XGB Regressor,0.859,77018.1
7,LGBM Regressor,0.869,74377.72


# we were confusing about which one we are going use between GradientBoostingRegressor and LGBM Regressor , so we used grid search to help us to choose the best one

#Gridsearch with GradientBoostingRegressor




In [51]:
from sklearn.model_selection import GridSearchCV
# Define the parameter grid for GridSearchCV
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [100,200,500]
}
model1 = GradientBoostingRegressor()
# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=model1, param_grid=param_grid, cv=5)
grid_search.fit(XX2_train, yy2_train)
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best parameters: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 500}
Best score: 0.8610094028470121


GradientBoostingRegressor after using best parameters

In [52]:
gbr1 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=7)
gbr1.fit(XX2_train, yy2_train)
yy2_pred_gbr1= gbr1.predict(XX2_test)
r21=r2_score(yy2_test, yy2_pred_gbr1)
rmse2= mean_squared_error(yy2_test, yy2_pred_gbr1, squared = False) 
print('R^2:', r21.round(3) )
print('RMSE:',rmse2.round(2))

R^2: 0.869
RMSE: 74302.93


#Gridsearch with LGBM Regressor

In [53]:
param_grid = {
    'max_depth': [-1,2, 3, 4,5,6,7,8],
    'learning_rate': [0.01, 0.1, 0.05, 1],
    'n_estimators': [100, 200, 500]
}
lgbm = LGBMRegressor()
# Initialize GridSearchCV object with the defined parameter grid
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(XX2,yy2)
# Print the best parameters and score
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best parameters:  {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 500}
Best score:  0.8701295499461953


LGBM Regressor after using best parameters





In [54]:
lgbm1 = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=-1)
lgbm_model1 = lgbm1.fit(XX2_train, yy2_train)
yy2_pred_lgbm1 = lgbm_model1.predict(XX2_test, num_iteration = lgbm_model1.best_iteration_)
r21=r2_score(yy2_test,yy2_pred_lgbm1)
rmse1= mean_squared_error(yy2_test, yy2_pred_lgbm1, squared = False) 
print('R^2:', r21.round(3) )
print('RMSE:',rmse1.round(2))

R^2: 0.87
RMSE: 74040.01


In [58]:
parameters = {
    'Model': [ 'GradientBoostingRegressor',  'LGBM Regressor'],
    'learning_rate': [0.05 ,0.05],
    'max_depth': [7,-1 ],
    'n_estimators':[500, 500],
    'R^2':[0.869,0.87],
    'RMSE':[74302.93,74040.01]
}

# Create a DataFrame from the data dictionary
table = pd.DataFrame(parameters)

# Set the index to start from 1

table.index = table.index + 1
# Print the DataFrame
table

Unnamed: 0,Model,learning_rate,max_depth,n_estimators,R^2,RMSE
1,GradientBoostingRegressor,0.05,7,500,0.869,74302.93
2,LGBM Regressor,0.05,-1,500,0.87,74040.01


so after the grid search and applying the best parameters for both models we can see that the LGBM gave us the best performance

In [59]:
# we will use lgbm , after comparing between them
import pickle

In [60]:
# save a model to a pikle file
pickle.dump(lgbm_model1, open('sk.pkl', 'wb'))