In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Bengaluru_House_Data.csv')

In [3]:
data

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [4]:
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
data.drop(['area_type', 'availability', 'society', 'balcony'], axis = 1, inplace=True)

In [6]:
data['location'].value_counts()

Whitefield                      540
Sarjapur  Road                  399
Electronic City                 302
Kanakpura Road                  273
Thanisandra                     234
                               ... 
Chikbasavanapura                  1
2nd phase jp nagar, jp nagar      1
Chikkaballapur                    1
Bhattarahalli                     1
M C Layout                        1
Name: location, Length: 1305, dtype: int64

In [7]:
data['location'].fillna(value = 'Sarjapur Road', inplace=True)

In [8]:
data['bath'].fillna(np.median(data['bath']), inplace=True)

In [9]:
data['size'].fillna(value = '2 BHK', inplace=True)

In [10]:
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [11]:
def convert(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    try :
        return float(x)
    except :
        return None

In [12]:
data['total_sqft'] = data['total_sqft'].apply(convert)

In [13]:
data['location'] = data['location'].apply(lambda x : x.strip())

In [14]:
location_value_counts = data['location'].value_counts()
location_value_counts_less_10 = location_value_counts[location_value_counts < 10]

In [15]:
data['location'] = data['location'].apply(lambda x : 'other' if x in location_value_counts_less_10 else x)

In [16]:
 data = data[data['total_sqft']/data['bhk'] >= 300]

In [17]:
data['price_per_sqft'] = data['price'] * 100000/data['total_sqft']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price_per_sqft'] = data['price'] * 100000/data['total_sqft']


In [18]:
def outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, sub_df in df.groupby('location'):
        mean = np.mean(sub_df.price_per_sqft)
        std = np.std(sub_df.price_per_sqft)
        gen_df = sub_df[(sub_df.price_per_sqft > (mean - std)) & (sub_df.price_per_sqft <= (mean + std))]
        #print(df_output)
        df_output = pd.concat([df_output, gen_df], ignore_index=True)
    return df_output    

In [19]:
data = outliers_sqft(data)

In [20]:
def outliers_bhk(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.price_per_sqft),
                'std' : np.std(bhk_df.price_per_sqft),
                'count' : bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
    return df.drop(exclude_indices, axis = 'index')      

In [21]:
data = outliers_bhk(data)

In [23]:
data.drop(['size', 'price_per_sqft'], axis = 1, inplace=True)

In [24]:
data['bath'] = data['bath'].fillna(np.mean(data['bath']))

In [25]:
X = data.drop('price', axis = 1)
y = data['price']

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [49]:
X_train

Unnamed: 0,location,total_sqft,bath,bhk
7207,Vidyaranyapura,1200.0,2.0,3
1141,Bellandur,982.0,2.0,2
9558,other,1600.0,3.0,3
3492,Hulimavu,1818.0,3.0,3
9528,other,420.0,1.0,1
...,...,...,...,...
5451,Old Madras Road,1065.0,2.0,2
4455,Kengeri,1200.0,2.0,2
1689,Choodasandra,1530.0,3.0,3
4234,Kanakpura Road,1240.0,2.0,2


In [51]:
X_test

Unnamed: 0,location,total_sqft,bath,bhk
1807,Devarachikkanahalli,947.0,2.0,2
3750,Jigani,927.0,2.0,2
3927,Kaggadasapura,1140.0,2.0,2
4994,Malleshwaram,2000.0,3.0,3
4413,Kathriguppe,1350.0,3.0,3
...,...,...,...,...
9784,other,1020.0,2.0,2
1841,Doddaballapur,1690.0,3.0,4
4562,Kodigehaali,1166.0,2.0,2
4081,Kanakapura,711.0,1.0,1


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [77]:
X_train.drop('location', axis =1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [79]:
X_test.drop('location', axis =1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [28]:
column_trans = make_column_transformer((OneHotEncoder(sparse = False), ['location']), remainder = 'passthrough')

In [29]:
scaler = StandardScaler()

In [30]:
lr = LinearRegression(normalize=True)

In [80]:
pipe = make_pipeline(scaler, lr)

In [81]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [82]:
predict = pipe.predict(X_test)

In [83]:
r2_score(y_test, predict)

0.7520391570884885

In [84]:
lasso = Lasso()

In [85]:
pipe_lasso = make_pipeline(scaler, lasso)

In [86]:
pipe_lasso.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [87]:
predict_lasso = pipe_lasso.predict(X_test)

In [88]:
r2_score(y_test, predict_lasso)

0.7532136005282466

In [89]:
ridge = Ridge()

In [90]:
pipe_ridge = make_pipeline(scaler, ridge)

In [91]:
pipe_ridge.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [92]:
X_test =pd.DataFrame(X_test, columns=X_train.columns)

In [93]:
predict_ridge = pipe_ridge.predict(X_test)

In [94]:
r2_score(y_test, predict_ridge)

0.7520723613164065

In [95]:
import pickle

In [96]:
pickle_out = open('classifier.pkl', 'wb')
pickle.dump(pipe_ridge, pickle_out)
pickle_out.close()

In [102]:
X_test.to_csv(r'Test file.csv')

In [98]:
X_train.to_csv(r'Train file.csv')

In [101]:
X_test.reset_index(drop=True, inplace=True)

In [103]:
X_test

Unnamed: 0,total_sqft,bath,bhk
0,999.00,2.0,2
1,1300.00,2.0,3
2,1211.00,2.0,2
3,980.00,2.0,2
4,1260.00,5.0,3
...,...,...,...
2438,1584.01,3.0,3
2439,1672.00,3.0,3
2440,1200.00,4.0,4
2441,2775.00,3.0,4
