In [162]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('bengaluru_house_prices.csv')

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.shape

(13320, 9)

In [5]:
# society has high null value percentage

In [6]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
df.drop('society',axis=1,inplace=True)


In [8]:
df.isna().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [9]:
df.balcony.mode()

0    2.0
Name: balcony, dtype: float64

In [10]:
df.balcony = df.balcony.fillna(2.0)

In [11]:
df.bath.mode()

0    2.0
Name: bath, dtype: float64

In [12]:
df.bath = df.bath.fillna(2.0)

In [13]:
df['size'].mode()

0    2 BHK
Name: size, dtype: object

In [14]:
df.size = df['size'].fillna('2 BHK')

In [15]:
df.location.mode()

0    Whitefield
Name: location, dtype: object

In [16]:
df.location = df.location.fillna('Whitefield')

In [17]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath',
       'balcony', 'price'],
      dtype='object')

In [18]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [19]:
df['bhk'] = df['size'].apply(lambda x:x.split(' ')[0])

In [20]:
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [21]:
df.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True
    

In [23]:
def spl(x):
    token = x.split('-')
    if len(token) == 2:
             
            return (float(token[0]) + float(token[1]))/2
    try:
        return float(x)
    except:
        return None   #if we give x we get other values ex:34sqm we ignore them rightnow
    
        

In [24]:
df.total_sqft = df.total_sqft.apply(spl)

In [25]:
df.total_sqft.unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [26]:
len(df[~df.total_sqft.apply(is_float)])

0

In [27]:
# these are the columns we are ignoring temporarily

In [28]:
df_new = df[~df.total_sqft.apply(is_float)].head(46)


In [29]:
df.isna().sum()

area_type        0
availability     0
location         0
size             0
total_sqft      46
bath             0
balcony          0
price            0
bhk              0
dtype: int64

In [30]:
df = df.dropna()

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13274 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13274 non-null  object 
 1   availability  13274 non-null  object 
 2   location      13274 non-null  object 
 3   size          13274 non-null  object 
 4   total_sqft    13274 non-null  float64
 5   bath          13274 non-null  float64
 6   balcony       13274 non-null  float64
 7   price         13274 non-null  float64
 8   bhk           13274 non-null  object 
dtypes: float64(4), object(5)
memory usage: 1.0+ MB


In [32]:
df = df.drop('availability',axis=1)

In [33]:
df.bhk = df.bhk.astype(int)

In [34]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13274 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13274 non-null  object 
 1   location    13274 non-null  object 
 2   size        13274 non-null  object 
 3   total_sqft  13274 non-null  float64
 4   bath        13274 non-null  float64
 5   balcony     13274 non-null  float64
 6   price       13274 non-null  float64
 7   bhk         13274 non-null  int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 933.3+ KB


In [36]:
df['price_per_sqft'] = round((df['price'] * 100000 )/(df['total_sqft']),2)

In [37]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.81
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.38
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.56
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.89
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [38]:
a = df.location.value_counts()

In [39]:
a_top_10 = a.sort_values(ascending=False).head(10)

In [40]:
a_top_10

Whitefield               539
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           271
Thanisandra              233
Yelahanka                213
Uttarahalli              186
Hebbal                   177
Marathahalli             175
Raja Rajeshwari Nagar    171
Name: location, dtype: int64

In [41]:
def loc_f(x):
    if x in a_top_10:
        return x
    else:
        return 'other'
    

In [42]:
df.location = df.location.apply(loc_f)

In [43]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,other,2 BHK,1056.0,2.0,1.0,39.07,2,3699.81
1,Plot Area,other,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.38
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.56
3,Super built-up Area,other,3 BHK,1521.0,3.0,1.0,95.0,3,6245.89
4,Super built-up Area,other,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [44]:
df.location.value_counts()

other                    10608
Whitefield                 539
Sarjapur  Road             399
Electronic City            302
Kanakpura Road             271
Thanisandra                233
Yelahanka                  213
Uttarahalli                186
Hebbal                     177
Marathahalli               175
Raja Rajeshwari Nagar      171
Name: location, dtype: int64

In [45]:
df.shape

(13274, 9)

In [46]:
df[df.total_sqft/df.bhk<300].head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
9,Plot Area,other,6 Bedroom,1020.0,6.0,2.0,370.0,6,36274.51
45,Plot Area,other,8 Bedroom,600.0,9.0,2.0,200.0,8,33333.33
58,Plot Area,other,6 Bedroom,1407.0,4.0,1.0,150.0,6,10660.98
68,Plot Area,other,8 Bedroom,1350.0,7.0,0.0,85.0,8,6296.3
70,Plot Area,other,3 Bedroom,500.0,3.0,2.0,100.0,3,20000.0


In [47]:
df.shape

(13274, 9)

In [48]:
df1 = df[~(df.total_sqft/df.bhk<300)]

In [49]:
df1.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,other,2 BHK,1056.0,2.0,1.0,39.07,2,3699.81
1,Plot Area,other,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.38
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.56
3,Super built-up Area,other,3 BHK,1521.0,3.0,1.0,95.0,3,6245.89
4,Super built-up Area,other,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [50]:
df1.shape

(12530, 9)

In [51]:
a = df1.groupby(df.location)
df1.groupby(df.location)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f448d4cec20>

In [52]:
def remove_outlier(df):
    df_out = pd.DataFrame()
    for key,subdf in df1.groupby(df1.location):
        m = np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft > (m-std)) & (subdf.price_per_sqft <= (m+std))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
        

In [53]:
df2 = remove_outlier(df1)

In [54]:
df2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City,2 BHK,660.0,1.0,1.0,23.1,2,3500.0
1,Super built-up Area,Electronic City,3 BHK,1025.0,2.0,1.0,47.0,3,4585.37
2,Super built-up Area,Electronic City,2 BHK,1000.0,2.0,1.0,28.88,2,2888.0
3,Super built-up Area,Electronic City,3 BHK,1530.0,2.0,1.0,45.9,3,3000.0
4,Super built-up Area,Electronic City,3 BHK,1500.0,2.0,1.0,64.5,3,4300.0


In [55]:
df3 = df2[~(df2.bath > (df2.bhk))]

In [56]:
df3.shape

(10557, 9)

In [57]:
len(df3[df3.total_sqft > 20000])

2

In [58]:
df4 = df3[~(df3.total_sqft > 9000)]

In [59]:
df4.shape

(10550, 9)

In [60]:
df5 = df4.drop('price_per_sqft',axis=1)

In [61]:
df6 = df5.drop(['size'],axis=1)

In [62]:
df7 = pd.get_dummies(df6,drop_first=True)

In [63]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [64]:
X = df7.drop('price',axis=1)
y = df7['price']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=0)

In [66]:
scl = StandardScaler()

X_train_scaled = scl.fit_transform(X_train)

X_test_scales = scl.transform(X_test) 

In [153]:
lr = LinearRegression()
lasso_rig = Lasso()
ridge = Ridge()
dt = DecisionTreeRegressor()
ada = AdaBoostRegressor()
xgb = XGBRFRegressor()
rf = RandomForestRegressor()


In [154]:
lst = [lr,lasso_rig,ridge,dt,ada,rf,xgb]
dc = {0:'linear_regression',1:'lasso_reg',2:'ridge',3:'Decision_tree',4:'Ada_boost',5:'Random_Forest',6:'Xg_boost'}

In [155]:
def model_score(xtr,xt,ytr,yt):
    dict = {}
    for i,model in enumerate(lst):
        model.fit(xtr,ytr)
        y_pred = model.predict(xt)
        scr = r2_score(yt,y_pred)
        dict[dc[i]] = f'{round(scr*100,2)}%'
    scr_df = pd.DataFrame(dict,index=['R^2 Score'])
        
    return scr_df
        
        
        

In [156]:
model_score(X_train_scaled,X_test_scales,y_train,y_test)

Unnamed: 0,linear_regression,lasso_reg,ridge,Decision_tree,Ada_boost,Random_Forest,Xg_boost
R^2 Score,78.63%,77.7%,78.63%,62.57%,77.39%,75.51%,78.89%


In [132]:
# best_scores using cv

In [159]:
def cross_val(xtr,ytr):
    dict = {}
    for i,model in enumerate(lst):
        cv_score = cross_val_score(model,xtr,ytr,cv=5,n_jobs=-1,scoring='r2')
        best_cv = cv_score.mean()
        dict[dc[i]] = (f'{round(best_cv *100,2)}%')
    cv_df = pd.DataFrame(dict,index=['Cv_scores'])
    return cv_df
        

In [160]:
cross_val(X_train,y_train)

Unnamed: 0,linear_regression,lasso_reg,ridge,Decision_tree,Ada_boost,Random_Forest,Xg_boost
Cv_scores,72.47%,71.11%,72.47%,56.34%,70.0%,71.79%,74.79%


In [161]:
cross_val(X_train_scaled,y_train)

Unnamed: 0,linear_regression,lasso_reg,ridge,Decision_tree,Ada_boost,Random_Forest,Xg_boost
Cv_scores,72.47%,72.01%,72.47%,55.43%,69.86%,71.93%,74.69%


In [180]:
params = {
    "learning_rate": [0.1, 0.2, 0.3],
    # "max_depth": [3, 4, 5],
}

In [181]:
def hyp_params(xtr,ytr,model,parameter):
    hyp = GridSearchCV(model,parameter,cv=5,n_jobs=-1,scoring='r2')
    hyp.fit(xtr,ytr)
    print(hyp.best_score_)
    print(hyp.best_params_)

In [184]:
hyp_params(X_train_scaled,y_train,AdaBoostRegressor(),params)

0.7184544427167945
{'learning_rate': 0.2}


In [None]:
hyp_par