In [None]:
import numpy as np  #use for mathematical operations
import pandas as pd   #import for data analysis
import matplotlib.pyplot as plt   #import for plotting the data
%matplotlib inline  

In [None]:
df1=pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [None]:
df1.head()  #first five row of DataFrame

In [None]:
df1.shape  #Number of rows and columns

**Data Cleaning Processes**

In [None]:
df1.groupby('area_type')['area_type'].agg('count')  #area types and their count

In [None]:
# Drop the "area_type", "balcony", "society" and "availability" columns from the DataFrame;
# the result is a new frame (df2), df1 is left untouched.
df2=df1.drop(['area_type','balcony','society','availability'],axis='columns') 

In [None]:
df2.head()  #check new DataFrame first five rows

In [None]:
df2.isnull().sum()  #check the Null values in new DataFrames

In [None]:
df3=df2.dropna()  #drop Null values
df3.isnull().sum() #check again

In [None]:
df3.shape  #new DataFrame shape

In [None]:
df3['size'].unique()   #finding no. of unique value of size

In [None]:
# Add a "bhk" column holding the integer that precedes the first space in "size".
# assign() builds a new frame instead of writing a column into the result of
# df2.dropna(), which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
df3 = df3.assign(bhk=df3['size'].apply(lambda x: int(x.split(' ')[0])))

In [None]:
df3.head() #Again check first five rows

In [None]:
df3['bhk'].unique()  #unique values in "bhk" column

In [None]:
df3[df3.bhk>10] #check 'bhk' column how much value is more than 10

In [None]:
df3.total_sqft.unique()  

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted by ``float()``, otherwise False.

    Used to spot "total_sqft" entries that are not plain numbers.
    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the first rows whose "total_sqft" is NOT a plain number.
# `~` is the logical-NOT operator for boolean masks (the same form is used for df6 below);
# unary minus on booleans is not a supported negation in NumPy.
df3[~df3['total_sqft'].apply(is_float)].head(10)

In [None]:
#again use function method to take avgerage
def convert_sqft_to_num(x):
    """Convert a "total_sqft" entry to a float.

    * A range written as "low - high" (exactly one '-') becomes the average
      of the two bounds.
    * Anything else is passed to ``float()``.
    * Values that cannot be parsed either way yield ``None``.

    Unlike the previous version, a malformed range (e.g. "a - b") or a
    non-string input no longer raises; it falls back to ``float(x)`` and
    finally to ``None``.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range after all; try the whole string below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None
    
    

In [None]:
df4=df3.copy()
df4['total_sqft']=df4['total_sqft'].apply(convert_sqft_to_num)
df4.head()

In [None]:
df4.loc[30]  #show row 30

> **Feature Engineering**

In [None]:
# Derive a price-per-square-foot feature on a copy of df4.
# NOTE(review): the factor 100000 presumably converts "price" from lakhs to rupees
# (a plot label later in this notebook reads "Price(Lakhs in Indian Rupees)") — confirm.
# Rows whose total_sqft could not be parsed (None) end up with NaN here.

df5=df4.copy()
df5['price_per_sq_ft']=df5['price']*100000/df5['total_sqft']

In [None]:
df5.head()

In [None]:
len(df5.location.unique())   # check the no.unique location

In [None]:
df5.location = df5.location.apply(lambda x: x.strip())

location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

In [None]:
len(location_stats[location_stats<=10])    # number of locations that have 10 or fewer rows

In [None]:
location_stats_less_than_10=location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
len(df5.location.unique())

In [None]:
# Replace every location that appears in location_stats_less_than_10
# (i.e. has 10 or fewer rows) with the single label 'other',
# then show how many distinct locations remain.
df5['location'] = df5['location'].where(
    ~df5['location'].isin(location_stats_less_than_10.index),
    'other',
)
len(df5.location.unique())

In [None]:
df5.head()

# Outlier removal

In [None]:
df5[df5.total_sqft/df5.bhk<300]

In [None]:
df5.shape

In [None]:
df6=df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape

In [None]:
def remove_pps_outliers(df):
    """Drop price-per-square-foot outliers separately for each location.

    For every ``location`` group, only rows whose ``price_per_sq_ft`` lies in
    the interval ``(mean - std, mean + std]`` are kept, where ``std`` is the
    population standard deviation (ddof=0) of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sq_ft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location (groupby order), with a fresh
        0..n-1 index. A location with a single row has std 0 and is therefore
        dropped entirely.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sq_ft
        m = pps.mean()
        # ddof=0 matches what np.std() computes by default.
        st = pps.std(ddof=0)
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # No groups at all (e.g. empty input): return an empty frame that
        # still carries the input's columns.
        return df.iloc[0:0].copy()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape

Here we check how the prices of 2 BHK and 3 BHK properties compare at a specific location.

In [None]:
def plot_scatter_chart(df,location):
    """Scatter-plot price against total area for 2-bhk and 3-bhk rows of one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk``, ``total_sqft`` and ``price`` columns.
    location : str
        Value of ``location`` to plot; also used as the figure title.

    Notes
    -----
    The previous version did ``plt.xlabel = ("...")`` / ``plt.ylabel = ("...")``,
    which *assigned* strings over the pyplot functions: no axis labels were
    drawn and every later ``plt.xlabel(...)`` call in the session failed with
    "'str' object is not callable". Labels are now set on the Axes object.
    """
    bhk2 = df[(df.location == location) & (df.bhk == 2)]
    bhk3 = df[(df.location == location) & (df.bhk == 3)]
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.scatter(bhk2.total_sqft, bhk2.price, color='blue', label='2 bhk', s=50)
    ax.scatter(bhk3.total_sqft, bhk3.price, color='green', label='3 bhk', s=50, marker='+')
    ax.set_xlabel("Total square feet area")
    ax.set_ylabel("Price(Lakhs in Indian Rupees)")
    ax.set_title(location)
    ax.legend()
plot_scatter_chart(df7,"Rajaji Nagar")

In [None]:
plot_scatter_chart(df7,"Hebbal")

We should also remove properties where, for the same location, an apartment with fewer rooms is priced higher than one with more rooms (for example, a 2 BHK whose price is greater than a 3 BHK). To do this, we build a dictionary of statistics per BHK for each location, like the following:


```
{
    '1': {
        'mean': 4000,
        'std': 2000,
        'count': 34
    },
    '2': {
        'mean': 4300,
        'std': 2300,
        'count': 22
    },
}
```
    
          
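Now remove those 2 BHK apartments whose price per square foot is less than the mean price per square foot of 1 BHK apartments in the same location (and likewise for every other BHK count compared with the next smaller one).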

In [None]:
def remove_bhk_outliers(df, min_count=5):
    """Drop rows that are cheaper per sq. ft. than the typical smaller home nearby.

    Within each ``location``, a row with ``bhk == k`` is removed when its
    ``price_per_sq_ft`` is below the mean ``price_per_sq_ft`` of the rows with
    ``bhk == k - 1`` in the same location, provided that smaller group has
    more than ``min_count`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sq_ft`` columns.
    min_count : int, default 5
        The (k-1)-bhk group is only used as a reference when it has strictly
        more than this many rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the original index labels are kept.
    """
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse for both the statistics and the filtering pass.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {'mean': bhk_df.price_per_sq_ft.mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > min_count:
                # Collect labels in a plain list so they keep their own dtype
                # (np.append onto a float array coerced them to float).
                exclude_labels.extend(
                    bhk_df[bhk_df.price_per_sq_ft < stats['mean']].index
                )
    return df.drop(index=exclude_labels)
df8 = remove_bhk_outliers(df7)
# df8 = df7.copy()
df8.shape

In [None]:
plot_scatter_chart(df8,'Rajaji Nagar')

In [None]:
plot_scatter_chart(df8,"Hebbal")

In [None]:
plt.figure(figsize=(20,10))
plt.hist(df8.price_per_sq_ft,rwidth=0.8)
plt.xlabel('price per square feet')
plt.ylabel('count')

# Outliers using bathroom features

In [None]:
df8.bath.unique()

In [None]:
# Create the figure BEFORE drawing: calling plt.figure() after plt.hist()
# opened a second, empty figure and the axis labels landed on that one.
plt.figure(figsize=(15,10))
plt.hist(df8.bath,rwidth=0.8)
# The histogram is over the "bath" column, so label the axis accordingly.
plt.xlabel('number of bathrooms')
plt.ylabel('count')

In [None]:
df8[df8.bath>df8.bhk+2]

Here, some rows have more bathrooms than bhk+2. These are also outliers, and we need to remove them.

In [None]:
# Keep rows with at most bhk+2 bathrooms. The outliers shown in the cell above are
# `bath > bhk+2`, so the complement is `<=`; the previous `<` also discarded rows
# with exactly bhk+2 bathrooms. Note this keeps slightly more rows than before.
df9=df8[df8.bath<=df8.bhk+2]
# Preview only the first rows instead of rendering the whole frame.
df9.head()

In [None]:
df9.shape

In [None]:
df10=df9.drop(['price_per_sq_ft','size'],axis='columns')
df10.head()

# Use One Hot Encoding for location

In [None]:
dummies=pd.get_dummies(df10.location)
dummies.head(5)

In [None]:
df11=pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')
df11.head()

In [None]:
df12=df11.drop('location',axis='columns')
df12.head()

# Build a model

In [None]:
df12.shape

In [None]:
X=df12.drop('price',axis='columns')
X.head()

In [None]:
y=df12.price
y.head()

In [None]:
len(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf=LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)


Use cross-validation (ShuffleSplit with 5 random 80/20 splits) to measure the accuracy of our linear regression model

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
cross_val_score(LinearRegression(),X,y,cv=cv)

Based on the results above, we go with the score of the first model, which is 0.845227769787456.

# Test the model for few properties

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of a property with the fitted ``lr_clf`` model.

    Parameters
    ----------
    location : str
        Name of a one-hot location column of ``X``. A location that has no
        column (an unknown name, or the dropped 'other' bucket) leaves all
        location indicators at 0.
    sqft, bath, bhk : numbers
        Written to the first three feature positions of the input row
        (positions 0, 1 and 2 of ``X.columns``).

    Returns
    -------
    The single prediction returned by ``lr_clf.predict``.

    Notes
    -----
    The previous version indexed ``np.where(...)[0][0]`` unconditionally, so
    an unknown location raised IndexError and the ``loc_index >= 0`` guard
    could never be False.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    matches = np.where(X.columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a DataFrame carrying the training column names so the model
    # receives input in the same form it was fitted on.
    features = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(features)[0]
    

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

In [None]:
predict_price('Indira Nagar',1000, 2, 2)

Export the tested model to a pickle file

In [None]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [None]:
import json

# Save the lower-cased feature names, in the column order of X, to columns.json.
columns = {'data_columns': [col.lower() for col in X.columns]}
with open("columns.json", "w") as f:
    json.dump(columns, f)