In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

**LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

**Reading and importing the file**

In [None]:
# Load the raw house-price CSV from the Kaggle input directory.
df1=pd.read_csv("/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv")
# Preview the first rows.
df1.head()

In [None]:
# Look at the number of rows and columns.
df1.shape
# Result: 13320 rows, 9 columns.
# Knowing the size up front matters: we may later drop rows or columns with many NaN (missing) values, and this tells us how much data we started with.

> **Observing and cleaning each column**

In [None]:
# List the column names of the raw data.
df1.columns


In [None]:
# Count how many rows fall under each category of the "area_type" column.
df1.groupby("area_type")["area_type"].agg("count")

> BY BASIC OBSERVATION WE CAN CONCLUDE THAT AREA_TYPE DOES NOT AFFECT THE PRICE SINCE THE TOTAL SQFT IS GIVEN.

 > ALSO SOCIETY,BALCONY,AVAILABILITY DOES NOT AFFECT

In [None]:
# Do not confuse area_type with the size of the house: it is only a description of the space, which total_sqft already conveys.
# society is dropped because location is already given; keeping it only makes the data set harder to handle.

df2=df1.drop(["area_type","society","balcony","availability"],axis="columns")
df2.head()
# The result goes into a new data frame so the earlier one stays available, e.g. to compare accuracies at the end.

In [None]:
# Number of missing entries in each column.
df2.isnull().sum()


HERE WE CAN SEE THAT AT MOST 1+16+73 = 90 ROWS HAVE A MISSING VALUE (the worst case, in which every missing value is in a different row), WHICH IS STILL A NEGLIGIBLE LOSS OF DATA. THEREFORE WE DROP THESE ROWS ENTIRELY INSTEAD OF FILLING THEM.
> We wouldn't have taken this step if the loss of data had been large, since it would have affected our accuracy.

In [None]:
# Drop every row that contains at least one missing value.
# The result is stored in a new data frame, as after every major change.
df3=df2.dropna()
# Verify that no missing values remain.
df3.isnull().sum()

**Size column**

In [None]:
# Distinct raw values of the "size" column.
df3["size"].unique()

**WE SEE DATA IS ENTERED IN BHK OR BEDROOM FORM, BOTH OF THEM MEAN THE SAME SO WE WOULD PICK JUST THE NUMBER FROM THIS COLUMN AND PUT IT IN NEW COLUMN FOR EASY PREDICTION.**

In [None]:
# Extract the leading number from entries such as "2 BHK" or "4 Bedroom" into a numeric "bhk" column.
# df3 is the result of dropna(), so assigning a column on it directly makes pandas emit a
# SettingWithCopyWarning; assign() returns a new frame instead and keeps this cell safe to re-run.
# A lambda is a one-line anonymous function, handy for a small single-purpose transformation like this.
df3=df3.assign(bhk=df3["size"].apply(lambda x:int(x.split(" ")[0])))
df3.head()

In [None]:
# Next, look for outliers among the distinct bhk values.
df3["bhk"].unique()
# We see 43 and 24 bhk, which may be wrong.

In [None]:
# Show the rows that have more than 20 bhk.
df3[df3.bhk>20]
# These look like data errors: the sqft is very small while the bhk count is very high.

**MAJOR PROBLEM IN TOTAL_SQFT BECAUSE IT HAS DATA IN DIFFERENT UNITS AND RANGE:
EG- 8000-7500, 1331-1350, 35.56 perch......**

In [None]:
# Distinct raw values of the "total_sqft" column.
df3["total_sqft"].unique()

In [None]:
#WE WOULD DROP ROWS WITH SQ METER AND PERCH
#FOR DATA IN RANGE WE WOULD REPLACE IT WITH AVERAGE.
#USING THIS FUNCTION WE ARE CREATING A CLEANER VERSION OF total_sqft

def convert_sqft_to_num(x):
  """Convert a raw ``total_sqft`` entry to a float.

  A range written as "low - high" is replaced by the average of its two ends.
  A plain number (string or numeric) is converted with ``float``.
  Anything that cannot be parsed (e.g. "34.46Sq. Meter", "4125Perch", ``None``)
  yields ``None`` instead of raising.

  Args:
    x: the raw cell value.

  Returns:
    The numeric area as a float, or ``None`` when the value cannot be parsed.
  """
  if isinstance(x,str):
    tokens=x.split("-")
    if len(tokens)==2:
      try:
        return (float(tokens[0])+float(tokens[1]))/2
      except ValueError:
        # Not a numeric range after all (e.g. "-5" or "1e-3"); fall through to the plain conversion.
        pass
  try:
    return float(x)
  except (TypeError,ValueError):
    return None

In [None]:
#CREATING COPY AND CONTINUING:
# convert_sqft_to_num returns None for entries it cannot parse (other units such as sq. meter or perch);
# pandas stores those as NaN, so drop them here to keep them out of the later steps.

df4=df3.copy()
df4["total_sqft"]=df4["total_sqft"].apply(convert_sqft_to_num)
df4=df4[df4["total_sqft"].notnull()]

**Creating price_per_sqft**
> Later this will help remove the outliers (we saw two outliers in bhk: 24 and 43).

In [None]:
# Work on a fresh copy and derive a price-per-square-foot column from price and total_sqft.
df5=df4.copy()
# NOTE(review): the 1000000 multiplier assumes a particular unit for "price" — if the column is in lakhs (1 lakh = 100,000) the factor should be 100000; confirm against the dataset description.
df5["price_per_sqft"]=df5["price"]*1000000/df5["total_sqft"]
df5.head()

**Location**

*This column has a large number of unique values (1304), and it is impractical to treat every one of them as a separate feature for prediction.*
So we check how often each area (LOCATION) is repeated in our data.
Any location that appears 10 times or fewer can be replaced with "other".

AFTER DOING THIS WE WOULD HAVE ONLY 242 UNIQUE VALUES IN LOCATION.

In [None]:
# Number of distinct location values before any cleaning.
len(df5.location.unique())

In [None]:
#Stripping any whitespaces
# The stripped Series must be assigned back to the column; otherwise the result is discarded and df5 is unchanged.
df5["location"]=df5["location"].str.strip()
df5["location"].head()

In [None]:
# Count the rows per location, sorted with the most frequent locations first.
location_stats=df5.groupby("location")["location"].agg("count").sort_values(ascending=False)
print(location_stats)

In [None]:
# Number of locations that appear 10 times or fewer.
len(location_stats[location_stats<=10])
# Out of 1304 distinct locations, about 1063 appear 10 times or fewer; rather than losing this data we group them under "other".

In [None]:
# A Series (indexed by location) holding the locations with 10 or fewer rows.
# Note the condition is <=10, so despite the variable name a count of exactly 10 is included.
location_stats_less_than_10=location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
# Using a lambda, relabel every location listed in location_stats_less_than_10 as "other".
# The location names are the index of that Series, so membership is tested against .index explicitly.
df5["location"]=df5["location"].apply(lambda x:"other" if x in location_stats_less_than_10.index else x)
len(df5.location.unique())

In [None]:
# Preview the first ten rows after relabelling the locations.
df5.head(10)

**OUTLIERS**

In [None]:
# This data set has both BHK and TOTAL_SQFT; the two are connected, so together they expose outliers.
# Divide TOTAL_SQFT by BHK: if the result is unusually small (<300) the row is an outlier.
# E.g. TOTAL_SQFT = 1000 and BHK = 6 gives 1000/6 < 300, an outlier, since a single bedroom takes about 300 sqft minimum.
df5[df5.total_sqft/df5.bhk<300].head() #this would give us first five outliers

In [None]:
# Size before removing the sqft-per-bhk outliers.
df5.shape

In [None]:
# Remove the rows with less than 300 sqft per bhk (the mask is negated with ~ to keep everything else).
df6=df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape

In [None]:
# Look for more outliers in the summary statistics of price_per_sqft.
df6.price_per_sqft.describe()
# The minimum is very low (order of e+03).

**THERE MAY BE OUTLIERS WITHIN SOME LOCATIONS, E.G. AN AREA WITH A LOW PRICE RATE MAY CONTAIN A HOUSE WITH LOW SQFT AND A HIGH PRICE.**

THEREFORE WE WILL CALCULATE STANDARD DEVIATION OF PRICE_PER_SQFT W.R.T LOCATION

In [None]:
# Function to remove price_per_sqft outliers location by location.
# A row is KEPT only when mean-std < value <= mean+std for its location; everything outside is dropped.
def remove_pps_outliers(df):
  """Keep, per location, only rows whose price_per_sqft lies within one standard deviation of the mean.

  For every ``location`` group the mean ``m`` and standard deviation ``st``
  (``np.std``, i.e. with NumPy's default ddof of 0) of ``price_per_sqft`` are
  computed, and only rows with ``m - st < price_per_sqft <= m + st`` are kept.
  A group in which all values are equal has ``st == 0`` and is dropped entirely
  because of the strict lower bound. Rows with a NaN ``price_per_sqft`` never
  satisfy the condition and are dropped as well.

  Args:
    df: DataFrame with ``location`` and ``price_per_sqft`` columns.

  Returns:
    A new DataFrame with the kept rows and a fresh 0..n-1 index.
  """
  kept=[]
  for key,subdf in df.groupby("location"):
    m=np.mean(subdf.price_per_sqft)
    st=np.std(subdf.price_per_sqft)
    kept.append(subdf[(subdf.price_per_sqft>(m-st))&(subdf.price_per_sqft<=(m+st))])
  if not kept:
    # No groups at all (empty input): return an empty frame that still has the input's columns.
    return df.head(0).reset_index(drop=True)
  # Concatenate once at the end instead of growing a frame inside the loop.
  return pd.concat(kept,ignore_index=True)


In [None]:
# Apply the per-location price_per_sqft outlier filter.
df7=remove_pps_outliers(df6)
df7.shape
# Therefore we removed approximately 2000 outliers.
# For any data scientist it is very important to remove outliers and clean the data.
# Getting a good accuracy becomes relatively easier when your data is clean and easy to read.

In [None]:
#NOW CHECKING IF 2BHK AND 3BHK HAVE SAME PRICE IN SIMILAR LOCATION OF SAME TOTAL_SQFT
# WE PLOT A SCATTER PLOT
#THE PLOT WOULD TAKE A SPECIFIC LOCATION AND DATAFRAME AS INPUT
#IT WOULD COMPARE 2BHK AND 3BHK PRICE WITH SQFT AS A PARAMETER
def plot_scatter_chart(df,location):
  """Scatter-plot price against total_sqft for the 2 BHK and 3 BHK rows of one location.

  Args:
    df: DataFrame with ``location``, ``bhk``, ``total_sqft`` and ``price`` columns.
    location: the location value whose rows are plotted; also used as the plot title.
  """
  bhk2=df[(df.location==location)&(df.bhk==2)]
  bhk3=df[(df.location==location)&(df.bhk==3)]

  plt.scatter(bhk2.total_sqft,bhk2.price,color="blue",label="2 BHK",s=50)
  plt.scatter(bhk3.total_sqft,bhk3.price,marker="+",color="green",label="3 BHK",s=50)
  plt.xlabel("TOTAL_SQFT_AREA")
  plt.ylabel("PRICE")
  # Title the chart with the actual location so that several charts can be told apart.
  plt.title(location)
  # Without a legend the "2 BHK"/"3 BHK" labels passed above are never displayed.
  plt.legend()

In [None]:
plot_scatter_chart(df7,"Hebbal")
# For the same sqft some 2BHK flats are priced higher than 3BHK flats. This could cause problems in prediction, so we remove such 3BHK rows (similar sqft as a 2BHK but a lower price).

In [None]:
# Same comparison for a second location.
plot_scatter_chart(df7,"Rajaji Nagar")

*WE WILL REMOVE ALL 3BHK FLATS THAT, IN THE SAME LOCATION, ARE PRICED LOWER (PER SQFT) THAN THE AVERAGE 2BHK.*

**We will do the same with any 2bhk flat priced lower than the average 1bhk in the same location.**

In [None]:
def remove_bhk_outliers(df):
  """Drop rows that are cheaper per sqft than the average home with one bedroom fewer.

  Within each location, for every bhk value the mean ``price_per_sqft`` of the
  (bhk-1) group is looked up. If that smaller group has more than 5 rows, every
  row of the current group whose ``price_per_sqft`` is below that mean is removed.

  Args:
    df: DataFrame with ``location``, ``bhk`` and ``price_per_sqft`` columns.

  Returns:
    A new DataFrame without the flagged rows; the index labels of the remaining
    rows are unchanged.
  """
  # Collect the index labels in a plain list so that they keep their original type
  # (accumulating them in a float numpy array would coerce the labels to float).
  exclude_indices=[]
  for location,location_df in df.groupby("location"):
    # Group once and reuse the groups for both the statistics and the comparison pass.
    bhk_groups=list(location_df.groupby("bhk"))
    bhk_stats={
        bhk:{
            "mean":np.mean(bhk_df.price_per_sqft),
            "count":bhk_df.shape[0]
        }
        for bhk,bhk_df in bhk_groups
    }
    for bhk,bhk_df in bhk_groups:
      stats=bhk_stats.get(bhk-1)
      # Only compare against the smaller size when it has enough rows for a meaningful mean.
      if stats and stats["count"]>5:
        exclude_indices.extend(bhk_df[bhk_df.price_per_sqft<(stats["mean"])].index.tolist())
  return df.drop(exclude_indices,axis="index")
# You may find this a little difficult but give it a 5 min read.

In [None]:
# Apply the bhk-based outlier filter and check how many rows remain.
df8=remove_bhk_outliers(df7)
df8.shape

In [None]:
plot_scatter_chart(df8,"Hebbal")
# Now almost all of the coinciding points are removed.

In [None]:
import matplotlib
# Histogram of price_per_sqft after outlier removal.
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("PRICE_PER_SQFT")
plt.ylabel("COUNT")

In [None]:
# In the histogram above, most data points have a price_per_sqft between 0 and 10,000.
# Next, explore the bathroom feature: distinct values of "bath".
df8.bath.unique()

In [None]:
# Rows with more than 10 bathrooms.
df8[df8.bath>10]

In [None]:
# We notice that some data points have more bathrooms than bhk.
# Histogram of the number of bathrooms:
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("NO.OF BATHROOMS")
plt.ylabel("COUNT")

In [None]:
# Most values lie between 2 and 4.
# Rows where the bathroom count exceeds bhk+2:
df8[df8.bath>df8.bhk+2]
# All of these are treated as outliers.

In [None]:
# Keep only rows with fewer than bhk+2 bathrooms.
# NOTE(review): this also drops rows where bath == bhk+2, whereas the preceding cell flagged only bath > bhk+2 — confirm which threshold is intended.
df9=df8[df8.bath<df8.bhk+2]
df9.shape

In [None]:
# Drop features that are no longer needed: "size" is replaced by the numeric bhk column, and price_per_sqft was derived from price and total_sqft.
df10=df9.drop(["size","price_per_sqft"],axis="columns")
df10.head()

***MODEL BUILDING***

In [None]:
# All remaining columns are numerical except "location", so we use one-hot encoding (dummy variables) to convert the categorical column to numerical ones.
dummies=pd.get_dummies(df10.location)
dummies.head()
# Always encode categorical data before modelling.
# The locations have no natural order (unlike e.g. high/medium/low), so one-hot encoding is used rather than ordered integer codes.

In [None]:
# Append the dummy columns to the data frame, leaving out the "other" column.
df11=pd.concat([df10,dummies.drop("other",axis="columns")],axis="columns")
df11.head()

In [None]:
# One dummy column is usually dropped when concatenating, which is why "other" was left out of the dummies above.
# Now drop the original text "location" column.
df12=df11.drop("location",axis="columns")
df12.head()

In [None]:
# Observe that the number of rows stayed the same; only the number of columns increased.
df12.shape

***TRAIN / TEST SPLIT***

In [None]:
# Feature matrix: every column except the target "price".
x=df12.drop("price",axis="columns")
x.head()

In [None]:
# Target vector: the "price" column.
y=df12.price
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the split so the result is reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and score it on the held-out test split.
lr_clf=LinearRegression()
lr_clf.fit(x_train,y_train)
lr_clf.score(x_test,y_test)

CROSS VALIDATION (5 RANDOM SHUFFLE SPLITS)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# Evaluate on 5 random shuffle splits, each holding out 20% of the rows; random_state makes the splits reproducible.
cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
cross_val_score(LinearRegression(),x,y,cv=cv)

GRID SEARCH CV(ALL IN ONE FUNCTION)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso

def find_best_model_using_gridsearchcv(x,y):
  """Grid-search three regressors and report the best score and parameters of each.

  Linear regression, lasso and a decision tree are each tuned with
  ``GridSearchCV`` over a small parameter grid, using 5 shuffle splits with a
  20% test share. The parameter names below require scikit-learn >= 1.0.

  Args:
    x: feature matrix.
    y: target values.

  Returns:
    A DataFrame with one row per algorithm and the columns
    ``model``, ``best_score`` and ``best_parms``.
  """
  algos={
      "linear_regressor":{
          "model":LinearRegression(),
          "parms":{
              # LinearRegression no longer accepts "normalize" (removed in scikit-learn 1.2),
              # so the intercept setting is tuned instead.
              "fit_intercept":[True,False]
           }
       },
      "lasso":{
          # random_state only matters for selection="random"; fixed for reproducible scores
          "model":Lasso(random_state=0),
          "parms":{
              "alpha":[1,2],
              "selection":["random","cyclic"]
           }
       },
      "decision_tree":{
          # random_state fixed so that splitter="random" gives reproducible scores
          "model":DecisionTreeRegressor(random_state=0),
          "parms":{
              # "mse" was renamed to "squared_error"; "freidman_mse" was a typo for "friedman_mse"
              "criterion":["squared_error","friedman_mse"],
              "splitter":["best","random"]
           }
       }
   }
  scores=[]
  cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
  for algo_name,config in algos.items():
    gs=GridSearchCV(config["model"],config["parms"],cv=cv,return_train_score=False)
    gs.fit(x,y)
    scores.append({
        "model":algo_name,
        "best_score":gs.best_score_,
        "best_parms":gs.best_params_
    })
  return pd.DataFrame(scores,columns=["model","best_score","best_parms"])

# Compare the three algorithms on the full feature matrix and target.
find_best_model_using_gridsearchcv(x,y)

PARAMETER SELECTION FROM ABOVE

In [None]:
#WE SEE LINEAR REGRESSION IS THE BEST WITH NORMALIZE=FALSE
def predict_price(location,sqft,bath,bhk):
  """Predict the price of a home with the fitted linear model ``lr_clf``.

  Builds a single feature row shaped like ``x``: the first three positions take
  sqft, bath and bhk (assumes the first three columns of ``x`` are total_sqft,
  bath and bhk, in that order — TODO confirm against ``x.columns``), and the
  dummy column matching ``location`` is set to 1.

  A location without its own dummy column (for example one grouped under
  "other", whose column was dropped) leaves all location dummies at 0 instead
  of raising an IndexError.

  Args:
    location: location name, matched against the column names of ``x``.
    sqft: total area in square feet.
    bath: number of bathrooms.
    bhk: number of bedrooms.

  Returns:
    The predicted price for the single row, as returned by ``lr_clf.predict``.
  """
  z=np.zeros(len(x.columns))
  z[0]=sqft
  z[1]=bath
  z[2]=bhk
  matches=np.where(x.columns==location)[0]
  if matches.size>0:
    z[matches[0]]=1
  # Pass a frame carrying the training column names so the model sees the same feature names it was fitted with.
  return lr_clf.predict(pd.DataFrame([z],columns=x.columns))[0]
# Example: a 1000 sqft home with 2 bathrooms and 2 bedrooms in "1st Phase JP Nagar".
predict_price("1st Phase JP Nagar",1000,2,2)

**Hence a high-accuracy model was built with basic data cleaning steps and machine learning algorithms.**