In [None]:
import pandas as pd
import numpy as np

In [None]:
df=pd.read_csv("../input/bengaluru-house-price-data/Bengaluru_House_Data.csv")

In [None]:
df.head()

In [None]:
df.groupby("area_type")["area_type"].agg("count")

In [None]:
df.shape

 **While filling NA values: (1) use mean() when the values are continuous (float values), and (2) use median() when the values are discrete counts (like the number of balconies, which cannot be a fractional value).**

In [None]:
df.drop(["area_type","availability","society","balcony"],axis=1,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)    #drops all the Rows which has NA values

In [None]:
df.shape    

In [None]:
df["size"].unique()    #BHK and bedroooms are same

In [None]:
# Now we will be needing the first numeric value only

df["bhk"]=df["size"].apply(lambda x: int(x.split(" ")[0]))  

 **x.split(" ") splits the string on spaces; indexing with [0] grabs the first token (which is the numeric value); then int() converts that string into an int, which is returned.**

In [None]:
df.head()

In [None]:
df["bhk"].unique()

In [None]:
df[df.bhk>20]    # its not possible to have house with 43 bedroooms in just area of 2400 sqft

In [None]:
df.total_sqft.unique()   # here you can see a sqft in range-> so we will take avg of min and max  

In [None]:
#this function checks whether the value is convertible to float or not
#values that contain a range or other text cannot be converted to float -> so the function returns False for them

def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only ``TypeError`` and ``ValueError`` (the errors ``float()`` raises for
    unconvertible input such as ``None`` or ``"1133 - 1384"``) are treated as
    "not a float". Any other exception propagates instead of being silently
    swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
df.total_sqft.apply(is_float)   #returns Boolean value

In [None]:
df[df.total_sqft.apply(is_float)].head() #returns the rows with only True boolean values-> but we need the opposite

In [None]:
df[~df.total_sqft.apply(is_float)].tail(10)   #putting the negate symbol-> converts Flase to True and returns them

In [None]:
def convert_to_sqft(x):
    """Convert a raw ``total_sqft`` value to a float.

    * ``"a - b"`` (two numeric tokens separated by a hyphen) -> average of a and b.
    * Anything ``float()`` accepts -> that float.
    * Everything else (e.g. ``"200yard"``, ``None``) -> ``None``.
    """
    if isinstance(x, str):
        tokens = x.split("-")
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "abc - def" or a negative number
                # such as "-5"); fall through to the plain conversion below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [None]:
convert_to_sqft("4000 - 4450")

In [None]:
convert_to_sqft("2000")

In [None]:
print(convert_to_sqft("200yard"))  #returns none

In [None]:
df["sqft"]=df["total_sqft"].apply(convert_to_sqft)

In [None]:
df.head()

In [None]:
df.drop(["total_sqft","size"],axis=1,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

### Feature engineering -> adding new Feature

In [None]:
#adding price_per_sqft

df["price_per_sqft"]= (df["price"]*100000)/df["sqft"]

In [None]:
df.head(3)

In [None]:
# working on the location feature

len(df.location.unique())   #a lot of location so we will define a Other category

In [None]:
# Assign the result back: .apply()/.str.strip() return a new Series, so without
# the assignment the stripped values are discarded and the grouping below still
# treats "X" and "X " as different locations.
df["location"]=df["location"].str.strip()

In [None]:
loc_count=df.groupby("location")["location"].agg("count")
len(loc_count)

In [None]:
other=loc_count[loc_count<=10]
len(other)

In [None]:
##applying it 

df.location=df.location.apply(lambda x: "other" if x in other else x)

In [None]:
len(df.location.unique())

In [None]:
df.tail()    #2nd location is converted now to other

### Outlier removal -> data errors or extreme variation in data

In [None]:
#on seeing the data you can say that sqft per room must be around 600

df[df["sqft"]/df["bhk"]<300].head()  #these are anomalies -> we need to remove them

In [None]:
df=df[~(df["sqft"]/df["bhk"]<300)]   # Negate~ will filter out the outliers

In [None]:
df.price_per_sqft.describe()   #min value is very low and unlikely , same case with max

In [None]:
a=[]
for key ,subdf in df.groupby("location"): 
    a.append(np.mean(subdf.price_per_sqft))
len(a)

In [None]:
def remove_outliers(var):
    """Keep, per location, only rows within one standard deviation of the mean.

    For every ``location`` group in ``var`` the mean and population standard
    deviation (ddof=0, which is what ``np.std`` yields) of ``price_per_sqft``
    are computed, and only rows strictly inside ``(mean - std, mean + std)``
    are kept. A group with a single row therefore contributes nothing
    (std is 0 and the bounds are strict).

    Returns a new DataFrame with a fresh 0..n-1 index; ``var`` is not modified.
    """
    kept_parts = []
    for key, subdf in var.groupby("location"):
        m = subdf.price_per_sqft.mean()
        st = subdf.price_per_sqft.std(ddof=0)
        reduced_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft < (m+st))]
        # Skip empty pieces so they cannot influence dtype inference in concat.
        if not reduced_df.empty:
            kept_parts.append(reduced_df)
    if not kept_parts:
        # Nothing survived (or the input was empty): return an empty frame that
        # still carries the input's columns.
        return var.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of re-copying the accumulated frame per group.
    return pd.concat(kept_parts, ignore_index=True)

In [None]:
df1=remove_outliers(df)

In [None]:
df1.shape   #around 2000 outliers have been removed

In [None]:
df.shape

**Let's check if for a given location how does the 2 BHK and 3 BHK property prices look like**


In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

In [None]:

def plot_scatter_chart(df,location):
    """Scatter-plot price against area for the 2 BHK and 3 BHK rows of one location.

    Rows of ``df`` whose ``location`` equals the given value are split by
    ``bhk`` (2 and 3) and drawn on the current pyplot figure with ``sqft`` on
    the x axis and ``price`` on the y axis. The global default figure size is
    set to (15, 10) before drawing.
    """
    in_location = df.location==location
    two_bed = df[in_location & (df.bhk==2)]
    three_bed = df[in_location & (df.bhk==3)]

    matplotlib.rcParams['figure.figsize'] = (15,10)

    # 2 BHK: default marker in blue; 3 BHK: green '+' so the series are distinguishable.
    plt.scatter(two_bed.sqft, two_bed.price, color='blue', label='2 BHK', s=50)
    plt.scatter(three_bed.sqft, three_bed.price, marker='+', color='green', label='3 BHK', s=50)

    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df1,"Rajaji Nagar")

#between 1600 and 1800 we can see a vertical line which shows for same area price for 2 bedroom is higher than of 3

**We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.**

{
    '1' : {
        'mean': 4000,
        'std': 2000,
        'count': 34
    },
    '2' : {
        'mean': 4300,
        'std': 2300,
        'count': 22
    },    
}
Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment

In [None]:
def remove_bhk_outliers(df):
    """Drop rows priced below the mean price_per_sqft of the next-smaller bhk.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows in the same location, provided that smaller group
    has more than 5 rows.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    # Plain list of index labels: keeps the labels' original type (a float
    # ndarray accumulator would coerce them) and avoids re-copying on append.
    exclude_indices = []
    for location, location_df in df.groupby('location'):
        # Group once and reuse for both the stats pass and the filter pass.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0]
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices.extend(bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index)
    return df.drop(index=exclude_indices)

In [None]:
df2 = remove_bhk_outliers(df1)
df2.shape

In [None]:
plot_scatter_chart(df2,"Rajaji Nagar") #you can notice the diff between this and previous

In [None]:
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df2.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

#looks perfect -> bell curve

**It is unusual to have 2 more bathrooms than number of bedrooms in a home**

In [None]:
df2[df2.bath>df2.bhk+2]

In [None]:
df3=df2[df2.bath<df2.bhk+2]   #removing the outliers

In [None]:
df3.shape

**We will drop unnecessary columns like price_per_sqft, as it has no use other than removing the outliers.**

In [None]:
# df3 was produced by boolean-mask filtering of df2, so dropping in place would
# mutate a possible view and raise SettingWithCopyWarning; rebind to a new frame instead.
df3=df3.drop("price_per_sqft",axis=1)

In [None]:
len(df3)

In [None]:
#One hot encoding

dummies=pd.get_dummies(df3.location)
dummies.head()
len(dummies)

In [None]:
df4=pd.concat([df3,dummies.drop("other",axis=1)],axis=1)   #remember to specify the axis
df4.head()

In [None]:
df4.drop("location",axis=1,inplace=True)

## Model Training

In [None]:
X=df4.drop("price",axis=1)
y=df4.price

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr=LinearRegression()
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV,ShuffleSplit

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors on (X, y) and summarise the best of each.

    Runs ``GridSearchCV`` over LinearRegression, Lasso and
    DecisionTreeRegressor using 5 shuffled 80/20 splits, and returns a
    DataFrame with one row per algorithm and the columns ``model``,
    ``best_score`` and ``best_params``.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # The `normalize` option was removed from LinearRegression in
                # scikit-learn 1.2; `fit_intercept` exists in every release.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            # Seeded because selection='random' is stochastic.
            'model': Lasso(random_state=0),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            # Seeded because splitter='random' is stochastic.
            'model': DecisionTreeRegressor(random_state=0),
            'params': {
                # 'mse' was renamed to 'squared_error' (old name removed in scikit-learn 1.2).
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

In [None]:
from sklearn.linear_model import LinearRegression

# `normalize` was removed from LinearRegression in scikit-learn 1.2 and raises
# TypeError there; ordinary least squares predictions do not depend on feature
# scaling, so the default estimator is used.
lr=LinearRegression()
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

### Exporting the model

In [None]:
import joblib

joblib.dump(lr,"Bangalore_House_prices")

### Export location and column information to a file that will be useful later on in our prediction application


In [None]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))