The required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Loading the dataset

In [2]:
df=pd.read_csv('datasets_20710_26737_Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
print('the train data has {} rows and {} features'.format(df.shape[0],df.shape[1]))


th train data has 13320 rows and 9 features


In [None]:
Explatory data analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
area_type       13320 non-null object
availability    13320 non-null object
location        13319 non-null object
size            13304 non-null object
society         7818 non-null object
total_sqft      13320 non-null object
bath            13247 non-null float64
balcony         12711 non-null float64
price           13320 non-null float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


Divide the data into numerical and categorical data  

In [7]:
num_features=df.select_dtypes(include=['int64','float64'])
categorical_features=df.select_dtypes(include='object')


In [8]:
num_features.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [9]:
categorical_features.describe()

Unnamed: 0,area_type,availability,location,size,society,total_sqft
count,13320,13320,13319,13304,7818,13320
unique,4,81,1305,31,2688,2117
top,Super built-up Area,Ready To Move,Whitefield,2 BHK,GrrvaGr,1200
freq,8790,10581,540,5199,80,843


Display the columns with number of missing values

In [10]:
df.isnull().sum().sort_values(ascending = False)[:34]

society         5502
balcony          609
bath              73
size              16
location           1
price              0
total_sqft         0
availability       0
area_type          0
dtype: int64

Check their names and data types

In [11]:
df.columns


Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

Imput and cleaning the missing values 

In [12]:
df.dropna(inplace = True)
df = df.drop(['area_type', 'availability', 'location', 'size', 'society'], axis = 1)
df.columns

Index(['total_sqft', 'bath', 'balcony', 'price'], dtype='object')

Create a new class "other"

In [13]:
features=['total_sqft', 'bath', 'balcony', 'price']

for name in features:
    df[name].fillna('Other', inplace = True)

In [14]:
df[features].isnull().sum()

total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [16]:
object_features = df.select_dtypes(include='object').columns
object_features

Index(['total_sqft'], dtype='object')

In [17]:
def dummies(d):
    dummies_df=pd.DataFrame()
    object_features = d.select_dtypes(include='object').columns
    for name in object_features:
        dummies = pd.get_dummies(d[name], drop_first=False)
        dummies = dummies.add_prefix("{}_".format(name))
        dummies_df=pd.concat([dummies_df,dummies],axis=1)
    return dummies_df

In [18]:
dummies_data=dummies(df)
dummies_data.shape


(7496, 1682)

In [19]:
data=df.drop(columns=object_features,axis=1)
data.columns

Index(['bath', 'balcony', 'price'], dtype='object')

In [20]:
final_data=pd.concat([data,dummies_data],axis=1)
final_data.shape

(7496, 1685)

Re-spliting the data into train and test datasets

In [45]:
train_data=final_data.iloc[:3748,:]
test_data=final_data.iloc[3748:,:]
print(train_data.shape)
test_data.shape

(3748, 1685)


(3748, 1685)

X: independent variables and y: target variable

In [53]:
X = train_data
y = train_data.loc[:,'price']

With Machine Learning and Regression algorithms

In [54]:
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet

In [56]:
model_las_cv = LassoCV(alphas=(0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))

model_las_cv.fit(X,y)

las_cv_preds=model_las_cv.predict(test_data)



In [58]:
model_ridge_cv = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10))

model_ridge_cv.fit(X, y)

ridge_cv_preds=model_ridge_cv.predict(test_data)

In [61]:
model_ridge = Ridge(alpha=10, solver='auto')

model_ridge.fit(X, y)
ridge_preds=model_ridge.predict(test_data)


In [64]:
model_en = ElasticNet(random_state=1, alpha=0.00065, max_iter=3000)

model_en.fit(X, y)

en_preds=model_en.predict(test_data)

In [66]:
import xgboost as xgb

In [68]:
model_xgb = xgb.XGBRegressor(learning_rate=0.01,n_estimators=3460,
                             max_depth=3, min_child_weight=0, gamma=0, subsample=0.7,
                             colsample_bytree=0.7, objective='reg:linear', nthread=-1,
                              scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

model_xgb.fit(X, y)
                             



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=0, missing=nan, monotone_constraints='()',
             n_estimators=3460, n_jobs=-1, nthread=-1, num_parallel_tree=1,
             objective='reg:linear', random_state=27, reg_alpha=6e-05,
             reg_lambda=1, scale_pos_weight=1, seed=27, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
xgb_preds=model_xgb.predict(test_data)

In [71]:
print(xgb_preds)

[ 57.851112  53.178818  55.355858 ... 229.66873   60.02441  489.7601  ]


In [74]:
from sklearn.ensemble import GradientBoostingRegressor

model_gbr = GradientBoostingRegressor(n_estimators=3000, 
                                learning_rate=0.05, 
                                max_depth=4, 
                                max_features='sqrt', 
                                min_samples_leaf=15, 
                                min_samples_split=10, 
                                loss='huber', 
                                random_state =42)

model_gbr.fit(X, y)
gbr_preds=model_gbr.predict(test_data)

In [76]:
from lightgbm import LGBMRegressor

In [77]:
model_lgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       #min_data_in_leaf=2,
                                       #min_sum_hessian_in_leaf=11
                                       )

In [78]:
model_lgbm.fit(X, y)
lgbm_preds=model_lgbm.predict(test_data)

In [79]:
print(lgbm_preds)

[ 57.6913184   53.0111463   54.70711546 ... 218.45188506  60.03637824
 457.94703379]
