In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

In [None]:
df=pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# One value_counts() Series per column, in column order.
l=[df[col].value_counts() for col in df.columns]

In [None]:
l

In [None]:
df.describe()

In [None]:
df.describe(include='O')

In [None]:
#lets check the relationship of price wrt area_type

In [None]:
df[['area_type','price']].groupby(['area_type'],as_index=False).mean().sort_values(by='price',ascending=False)

Here we can see that Plot Area listings have the highest average price compared to the other area types.

In [None]:
df[['availability','price']].groupby(['availability'],as_index=False).mean().sort_values(by='price',ascending=False)

Availability has a huge variance and doesn't seem very useful, so let's drop this column.

In [None]:
df[['location','price']].groupby(['location'],as_index=False).mean().sort_values(by='price',ascending=False)

Although there are a large number of distinct locations, we should keep this column because the price differs a lot between them. We can do so by grouping the locations into ranges.

In [None]:
df[['size','price']].groupby(['size'],as_index=False).mean().sort_values(by='price',ascending=False)

Size will definitely impact the price, so again we can convert it into appropriate ranges.

In [None]:
df[['society','price']].groupby(['society'],as_index=False).mean().sort_values(by='price',ascending=False)

Society has a large number of distinct entries and also a huge price difference, so we could add it.

In [None]:
df[['total_sqft','price']].groupby(['total_sqft'],as_index=False).mean().sort_values(by='price',ascending=False)

Again, total_sqft has to be included.


**Now that we finished looking at categorical variables lets understand the numeric ones**

In [None]:
df.describe()

In [None]:
plt.scatter(df.balcony,df.price)
plt.show()

With respect to balcony, we can see that there is hardly any difference in price: the majority falls in the 0-1000 range apart from a few outliers,
so we drop it.

In [None]:
plt.scatter(df.bath,df.price)
plt.show()

We should keep this column because there is a lot of difference in price with respect to the number of baths.

In [None]:
# Correlate only the numeric columns: since pandas 2.0, DataFrame.corr no longer
# silently skips string columns and raises on them instead.
df.select_dtypes(include='number').corr(method='pearson')

**Finally we decided to drop the following cols:**


1.   society
2.   availability
3.   balcony




In [None]:
df2=df.drop(['society','availability','balcony'],axis='columns')

In [None]:
df2.head()

**Lets look for null values now**

In [None]:
df2.isnull().sum()

Now you have two options here:


1.   drop the na rows because we have 13k rows and 73 is insignificant
2.   fill the missing entries with mean ,median,interpolate ...etc



**Well... I'm dropping them**

In [None]:
df3=df2.dropna()

In [None]:
df3.isnull().sum()

Now lets go further into data exploration

In [None]:
df3.columns

**SIZE COLUMN**

In [None]:
df3['size'].unique()

Did you notice the unclean data and the multiple redundancies?
Let's clean it by grouping the relevant ones together.

In [None]:
# Take the leading token of `size` as the bedroom count (still a string here).
# assign() builds a new frame rather than writing into the result of dropna(),
# which avoids pandas' SettingWithCopyWarning.
df3=df3.assign(bhk=df3['size'].apply(lambda x: x.split(' ')[0]))

In [None]:
df3.head()

In [None]:
df3['bhk'].unique()

lets look at total_sqft

In [None]:
# Inspect the cleaned frame (df3), which is the one the following cells transform.
df3['total_sqft'].unique()

In [None]:
#we seem to have a range here. lets replace with mean.

In [None]:
def not_range(x):
  """Return True when `x` can be converted to a single float, else False.

  Anything that float() rejects (for example a 'low - high' range string,
  a value with trailing text, or None) yields False.
  """
  try:
    float(x)
  # Catch only conversion failures; a bare `except:` would also swallow
  # KeyboardInterrupt and hide unrelated bugs.
  except (TypeError,ValueError,OverflowError):
    return False
  return True

Show the rows whose total_sqft value cannot be parsed as a single number (for example, a range).

In [None]:
df3[~df3['total_sqft'].apply(not_range)]

In [None]:
def convert_range(x):
  """Convert a total_sqft entry to a float.

  A string of the form 'low-high' (exactly one dash, both sides numeric)
  becomes the midpoint of the two numbers. Anything else is passed to
  float(). Returns None when the value cannot be interpreted either way.
  """
  if isinstance(x,str):
    tokens=x.split('-')
    if len(tokens)==2:
      try:
        return (float(tokens[0])+float(tokens[1]))/2
      except ValueError:
        # One dash but not a numeric range (e.g. '-5' or '1e-3'):
        # fall through and try the whole string as one number.
        pass
  try:
    return float(x)
  except (TypeError,ValueError,OverflowError):
    return None


In [None]:
df4=df3.copy()

In [None]:
df4['total_sqft']=df4['total_sqft'].apply(convert_range)
df4.head()

In [None]:
df4.isnull().sum()

In [None]:
df4=df4.dropna()

In [None]:
#feature engineering: price per square foot
# (price is scaled by 100000 -- presumably it is recorded in lakhs; TODO confirm)
df5=df4.copy()
# Use df5's own price column instead of reaching back to the raw `df` and
# relying on index alignment to pick the matching rows.
df5['price_per_sqft']=df5['price']*100000/df5['total_sqft']
df5.head()

In [None]:
#lets work on locations

In [None]:
len(df5['location'].unique())

In [None]:
#let's strip extra spaces from location and then group the rarely occurring locations together

In [None]:
df5.location=df5.location.apply(lambda x: x.strip())

In [None]:
location_stats=df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

In [None]:
#a lot of locations appear only once, so relabel every location with fewer than 10 rows as 'other'
location_stats_less_than_10=location_stats[location_stats<10]
# `x in Series` tests the index labels, not the values; make that explicit
# with a set of the rare location names (also gives O(1) lookups).
rare_locations=set(location_stats_less_than_10.index)
df5.location=df5.location.apply(lambda x: 'other' if x in rare_locations else x)

In [None]:
df5['location'].unique()

In [None]:
location_stats=df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

Now lets look at outliers and try and remove any

In [None]:
df5.head(10)

Upon research, a standard home has around 300 sq ft per room,
so let's check for unusual instances.


In [None]:
df5.bhk=df5.bhk.apply(lambda x: float(x))

In [None]:
df5[df5.total_sqft/df5.bhk<300]

In [None]:
df6=df5[df5.total_sqft/df5.bhk<300]
df6.shape

remove these

In [None]:
df6=df5[~(df5.total_sqft/df5.bhk<300)]

In [None]:
df6.shape

lets check price per squarefeet

In [None]:
df6.price_per_sqft.describe()

This is definitely an outlier.

We keep only the rows whose price_per_sqft lies within one standard deviation
of the mean, computed separately within each location.

In [None]:
def remove_outs(df):
  """Keep rows whose price_per_sqft is within one std of their location's mean.

  For every `location` group the mean and population standard deviation
  (ddof=0, what np.std computes) of `price_per_sqft` are taken, and only rows
  inside [mean - std, mean + std] are kept. Both bounds are inclusive, so a
  location with a single row or with identical prices (std == 0) is not wiped
  out. The result has a fresh 0..n-1 index.
  """
  kept=[]
  for _,subdf in df.groupby('location'):
    ppsf=subdf.price_per_sqft
    m=ppsf.mean()
    st=ppsf.std(ddof=0)
    kept.append(subdf[ppsf.between(m-st,m+st)])
  if not kept:
    # No groups at all: return an empty frame that still has df's columns.
    return df.iloc[0:0].reset_index(drop=True)
  # Concatenate once instead of growing a frame inside the loop (quadratic).
  return pd.concat(kept,ignore_index=True)
df7=remove_outs(df6)    

In [None]:
df7

In [None]:
def remove_bhk_outliers(df):
  """Drop listings priced below the average of smaller homes in the same location.

  Within each `location`, a row with `bhk` bedrooms is removed when its
  price_per_sqft is lower than the mean price_per_sqft of the (bhk - 1)
  group in that location, provided that smaller group has more than 5 rows.
  Returns a new frame; `df` itself is not modified.
  """
  excluded_labels=[]
  for _,location_df in df.groupby('location'):
    bhk_groups=list(location_df.groupby('bhk'))
    bhk_stats={
        bhk:{'mean':bhk_df.price_per_sqft.mean(),'count':bhk_df.shape[0]}
        for bhk,bhk_df in bhk_groups
    }
    for bhk,bhk_df in bhk_groups:
      smaller=bhk_stats.get(bhk-1)
      if smaller is not None and smaller['count']>5:
        too_cheap=bhk_df.price_per_sqft<smaller['mean']
        # Collect labels in a list: np.append in a loop re-copies the array
        # each time and casts the index labels to float.
        excluded_labels.extend(bhk_df.index[too_cheap])
  return df.drop(index=excluded_labels)
df8=remove_bhk_outliers(df7)
df8.shape          

In [None]:
#now bathrooms
# unique is a method: without the parentheses the cell only shows the bound method object.
df8.bath.unique()

In [None]:
#how can the number of baths be greater than bhk?


In [None]:
df8=df8[df8.bath<df8.bhk+2]

In [None]:
df8.head()

In [None]:
df9=df8.copy()

In [None]:
#area_type
df9['area_type']=df9['area_type'].replace({'Super built-up  Area':1,'Plot  Area':2,'Built-up  Area':3,'Carpet  Area':4})

In [None]:
df9.drop(['size','price_per_sqft'],axis='columns',inplace=True)

In [None]:
df9

In [None]:
dummies=pd.get_dummies(df9.location)
dummies.head()

In [None]:
df9

In [None]:
df10=pd.concat([df9,dummies.drop('Anekal',axis='columns')],axis='columns')
df10

In [None]:
df10=df10.drop('location',axis='columns')

In [None]:
df10

In [None]:
X=df10.drop('price',axis='columns')
X

In [None]:
y=df10.price

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
model=LinearRegression()
model.fit(xtrain,ytrain)
model.score(xtest,ytest)

In [None]:
from sklearn import linear_model
lasso_reg=linear_model.Lasso(alpha=50,max_iter=1000,tol=0.1)
lasso_reg.fit(xtrain,ytrain)
lasso_reg.score(xtest,ytest)

In [None]:
from sklearn import linear_model
ridge_reg=linear_model.Ridge(alpha=50,max_iter=1000,tol=0.1)
ridge_reg.fit(xtrain,ytrain)
ridge_reg.score(xtest,ytest)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# Candidate regressors and the hyper-parameter grid searched for each.
# random_state is fixed on the stochastic estimators so re-runs give the same scores.
model_params={
    'linear':{
        'model':LinearRegression(),
        # `normalize` was removed from LinearRegression in scikit-learn 1.2;
        # search over fit_intercept instead.
        'params':{
            'fit_intercept':[True,False]
        }
    },
    'lasso':{
        'model':Lasso(random_state=0),
        'params':{
            'alpha':[1,2],
            'selection':['random','cyclic']
        }
    },
    'dec_tree':{
        'model':DecisionTreeRegressor(random_state=0),
        # criterion 'mse' was renamed 'squared_error' in scikit-learn 1.0
        # and the old name was removed in 1.2.
        'params':{
            'criterion':['squared_error','friedman_mse'],
            'splitter':['best','random']
        }
    }
}

# 5 random 80/20 splits of the training data for cross-validation.
cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)

scores=[]
for model_name,mp in model_params.items():
  clf=GridSearchCV(mp['model'],mp['params'],cv=cv)
  clf.fit(xtrain,ytrain)
  scores.append({'model':model_name,'best_score':clf.best_score_,'best_params':clf.best_params_})

scores


In [None]:
#linear regression wins, so refit it on the training split
# `normalize` was removed from LinearRegression in scikit-learn 1.2;
# normalize=False was the default, so plain LinearRegression() is equivalent.
model=LinearRegression()
model.fit(xtrain,ytrain)
model.score(xtest,ytest)

In [None]:
X.columns

In [None]:
#prediction function
def predict(area,loc,sqft,bath,bhk):
  """Predict a price with the fitted global `model` for one listing.

  Builds a single feature row with the same columns as the training matrix
  `X`: the area type code, total_sqft, bath, bhk and a 1 in the dummy column
  named `loc`.

  area -- one of the area_type strings in the mapping below
          (raises KeyError otherwise)
  loc  -- location name; if `X` has no column with that name, all location
          dummies stay 0
  """
  ar={'Super built-up  Area':1,'Plot  Area':2,'Built-up  Area':3,'Carpet  Area':4}

  x=pd.Series(0.0,index=X.columns)
  # The original np.where(...)[0][0] lookup raised IndexError for any location
  # without a dummy column, so its `if loc_index>=0` guard never applied.
  # NOTE(review): unknown locations are scored with all dummies at 0 (the
  # dropped baseline column) -- confirm this is the intended fallback.
  if loc in X.columns:
    x[loc]=1
  # Fill features by column name rather than by hard-coded position.
  x['area_type']=ar[area]
  x['total_sqft']=sqft
  x['bath']=bath
  x['bhk']=bhk
  # Predict from a one-row DataFrame so the feature names match those seen in fit.
  return model.predict(x.to_frame().T)[0]

In [None]:
predict('Super built-up  Area','1st Phase JP Nagar',1000,2,2)

In [None]:
predict('Super built-up  Area','Whitefield',1000,3,3)

In [None]:
import pickle
# Use a context manager so the file is flushed and closed after writing.
with open('BangloreHousePrice.pickle','wb') as f:
  pickle.dump(model,f)

In [None]:
# Read the model back; the context manager closes the file handle.
with open('BangloreHousePrice.pickle','rb') as f:
  load=pickle.load(f)

In [None]:
load.score(xtest,ytest)

In [None]:
import json
columns={
    'data_columns':[col.lower() for col in X.columns]
}
json.dumps(columns)