# "Real Estate price prediction" project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)

# Real estate price prediction is done on 'BENGALURU CITY' dataset. 

In [None]:
df1 = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
df1.head()

# Rows and columns in the dataset taken. 

In [None]:
df1.shape

# Count of the data sample in each of the category in area_type column.

In [None]:
df1.groupby('area_type')['area_type'].agg('count')

In [None]:
df2 = df1.drop(['area_type','society','balcony','availability'],axis = 'columns')
df2.head()

In [None]:
df2.shape

# 1)Data cleaning  
## finding out the total null values in each column

In [None]:
df2.isnull().sum()

## Removing the null values with in the dataset

In [None]:
df3 = df2.dropna()
df3.isnull().sum()

In [None]:
df3.shape

## Finding the unique values in the size column of dataset 

In [None]:
df3['size'].unique()

## As it is seen in size column it have BHK and bedroom values
## Which makes the data little uneven

In [None]:
# df3 is the result of df2.dropna(); take an explicit copy before adding a
# column so the assignment does not trigger pandas' chained-assignment warning.
df3 = df3.copy()
# The leading token of 'size' (e.g. '2 BHK', '4 Bedroom') is the room count.
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))

In [None]:
df3.head()

In [None]:
df3['bhk'].unique()

In [None]:
df3[df3.bhk>20]

##  In total_sqft column the values are not even

In [None]:
 df3['total_sqft'].unique() # it is observed in total_sqft column there are uncertain values like '1133 - 1384'

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted by ``float()``, otherwise False.

    Only the errors ``float()`` raises for unconvertible input are caught, so
    unrelated exceptions (e.g. ``KeyboardInterrupt``) are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
df3[~df3['total_sqft'].apply(is_float)].head(10)
# here it is observed that there are characters in total_sqft columns where it require only int

# function to get avg of numbers in total_sqft column in each row and if any char just return int in the cell

In [None]:
def convert_sqft_int(x):
    """Convert a ``total_sqft`` cell to a float.

    * A range such as ``'1133 - 1384'`` becomes the average of its two bounds.
    * A plain number (string or numeric) becomes ``float(x)``.
    * Anything else (e.g. ``'34.46Sq. Meter'``, ``None``) becomes ``None``.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. '1e-3' or 'abc-def'); fall back to
                # parsing the whole value below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_int)
df4 = df4[df4.total_sqft.notnull()]
df4.head(2)

In [None]:
df4.loc[30] # returns the row whose index label is 30 (.loc is label-based row lookup, not a column lookup)

####  -----------------------------------------------End of data cleaning---------------------------------------------

# 2)Feature engineering 

In [None]:
df5 = df4.copy()
# finding the price per sqft: 'price' is in lakhs, so price * 100000 / total_sqft gives rupees per square foot
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

In [None]:
df5_stats = df5['price_per_sqft'].describe()
df5_stats

In [None]:
# when it comes to location coloum there are different location so finding total location
df5['location'].unique()

In [None]:
len(df5['location'].unique())

In [None]:
# Trim surrounding whitespace from every location name, then count how many
# listings each location has.
df5['location'] = df5['location'].map(lambda name: name.strip())
location_stats = df5.groupby('location')['location'].count()
location_stats

In [None]:
#sort to find maximum location_stats
location_stats.sort_values(ascending= False)

In [None]:
location_stats.values.sum()

In [None]:
len(location_stats[location_stats>10])

In [None]:
len(location_stats)

In [None]:
# locations having 10 or fewer data points
len(location_stats[location_stats<=10])

In [None]:
location_stats_lessthan_10 = location_stats[location_stats<=10]

location_stats_lessthan_10

In [None]:
len(df5.location.unique())

In [None]:
# locations with 10 or fewer data points are relabelled 'other' to reduce the number of distinct categories
# note: `x in <Series>` tests the Series index, which here holds the location names
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_lessthan_10 else x)
len(df5.location.unique())

In [None]:
# checking the the above conditions is applied or not
df5.head(10)

# 3)Outlier removal.

In [None]:
# For a given total_sqft some room counts are implausible: a home of that size cannot realistically hold that many rooms.
# Assume a minimum of about 300 sqft per bedroom (so about 600 sqft for two bedrooms); a two-bedroom house of 400 sqft looks suspicious.
# Rows that match the condition described above can be removed.
df5[df5.total_sqft/df5.bhk<300].head(10)

In [None]:
df5.shape

In [None]:
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape

###  Lets check for other outlier in the dataset

In [None]:
df6.describe()

### lets only consider price_per_sqft 

In [None]:
df6.price_per_sqft.describe()

###### As observed, the minimum price per sqft is 'Rs.267' and the maximum is 'Rs.176470'. These prices do not look genuine: in a city like Bengaluru prices won't be that cheap, even though there might be a chance of reaching the maximum price. So this should be corrected.

In [None]:
def remove_pps_outliers(df):
    """Drop rows whose price_per_sqft is far from their location's mean.

    For every ``location`` group, only rows with
    ``mean - std < price_per_sqft <= mean + std`` are kept, where ``std`` is
    the population standard deviation (``ddof=0``) of that group.  Groups are
    processed in sorted location order and the result gets a fresh
    ``0..n-1`` index.

    Note: a location with a single listing has ``std == 0``, so the strict
    lower bound removes that listing.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # No groups at all (empty input): return an empty frame that still
        # carries the input's columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape

### According to dataset taken ,check the price for 2bhk and 3bhk by ploting the price values.

In [None]:
def plot_scatter_chart(df,location):
    """Scatter-plot price against total_sqft for 2 BHK and 3 BHK homes in ``location``.

    2 BHK rows are drawn in blue with the default marker and 3 BHK rows as
    green '+' markers.  The default figure size in ``matplotlib.rcParams`` is
    set to (15, 10) before drawing.
    """
    at_location = df.location == location
    # (bhk value, scatter keyword arguments) for each series, in draw order.
    series_specs = (
        (2, {'color': 'blue', 'label': '2 BHK', 's': 50}),
        (3, {'marker': '+', 'color': 'green', 'label': '3 BHK', 's': 50}),
    )
    matplotlib.rcParams['figure.figsize'] = (15,10)
    for rooms, style in series_specs:
        subset = df[at_location & (df.bhk == rooms)]
        plt.scatter(subset.total_sqft, subset.price, **style)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(df7,"Rajaji Nagar")

###  Lets check another location in the df.

In [None]:
plot_scatter_chart(df7,"Hebbal")

#### We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.

{
    '1' : {
        'mean': 4000,
        'std': 2000,
        'count': 34
    },
    '2' : {
        'mean': 4300,
        'std': 2300,
        'count': 22
    },    
}

#### Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment

In [None]:
def remove_bhk_outliers(df):
    """Drop homes priced below the mean of the next-smaller BHK in the same location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the ``n - 1``
    BHK rows of that location, provided that ``n - 1`` group has more than
    5 rows.

    Returns a new DataFrame with the offending index labels dropped; ``df``
    itself is not modified.
    """
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # Group once per location and reuse the groups for both passes.
        bhk_groups = {bhk: bhk_df for bhk, bhk_df in location_df.groupby('bhk')}
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups.items()
        }
        for bhk, bhk_df in bhk_groups.items():
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                below_smaller_mean = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                # Keep the labels in their original dtype (no float cast).
                exclude_indices.extend(below_smaller_mean.index)
    return df.drop(index=exclude_indices)
df8 = remove_bhk_outliers(df7)
# df8 = df7.copy()
df8.shape

####  Once again lets check the plotting for 2bhk and 3bhk prices

In [None]:
plot_scatter_chart(df8,"Rajaji Nagar")

#### Hence the outliers have been removed from the data set. Cross-check the previous plot against the new plot above.

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

#### it is observed from above histogram majority of the points will be in range of 0 to 1000.

#  Lets now consider bathroom feature in the dataset.

#### Generally a house won't have many more bathrooms than bedrooms, so let's remove the rows where the number of bathrooms far exceeds the number of bedrooms.

In [None]:
df8.bath.unique()

In [None]:
df8[df8.bath>10]

In [None]:
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

#### It is common to have extra 2 bathrooms in a house other than in bedroom

In [None]:
df8[df8.bath>df8.bhk+2]

In [None]:
# Keep homes with at most two more bathrooms than bedrooms. This is the exact
# complement of the outlier condition displayed above (bath > bhk + 2), so
# rows with bath == bhk + 2 are kept rather than silently dropped.
df9 = df8[df8.bath <= df8.bhk + 2]
df9.shape

In [None]:
df10 = df9.drop(['size','price_per_sqft'],axis = 'columns')
df10.head()

# 4)Model building 

#### A machine learning model can't interpret text data. The location column contains text, which needs to be converted into numeric columns using pandas.get_dummies.

In [None]:
dummies = pd.get_dummies(df10.location)
# as we can observe for each datapoint like '1st block jayanagar' except for that remaining are filled with zeros in that row.

In [None]:
dummies.head()

In [None]:
df11 = pd.concat([df10,dummies.drop('other',axis = 'columns')],axis='columns')
df11.head()

#### As we are having dummies for location columns we now drop the location column

In [None]:
df12 = df11.drop('location',axis = 'columns')
df12.head()

In [None]:
df12.shape

#### Modeling....

#### 'price' is the dependent variable in my data set so i want to assign table to 'X' without price and 'y' as price table

In [None]:
X = df12.drop(['price'],axis='columns')
X.head(3)

In [None]:
y = df12.price
y.head(3)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)
# .score returns the model's R^2 on the held-out test split; a value around 0.86 (86%) is a decent score.
# The ultimate aim is to get a good score for the model we are building.

### I have used ShuffleSplit cross-validation to measure my LinearRegression model score

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

#### 5 splits are giving 82% plus score , lets try with 10 splits and find the score

In [None]:
# Repeat the shuffle-split cross-validation with 10 splits instead of 5.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

#### well the score finds decent with 10 splits which is more than 79% all the time.

# I am going to find the better model using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best score of each.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` over a small parameter grid, using a 5-split
    ``ShuffleSplit`` (20% test size, fixed seed) for cross-validation.

    Returns a DataFrame with one row per algorithm and the columns
    ``model``, ``best_score`` and ``best_params``.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # The 'normalize' option was removed from LinearRegression in
                # scikit-learn 1.2, so tune 'fit_intercept' instead.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # 'mse' was renamed to 'squared_error' (old name removed in
                # scikit-learn 1.2).
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

## 

## As the results shows linear_regression is having high score of 84% 

# Model testing using the parameters.

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of one home with the fitted ``lr_clf`` model.

    Builds a single feature row laid out like the training matrix ``X``:
    the first three columns carry ``sqft``, ``bath`` and ``bhk``, and the
    one-hot column named ``location`` (if there is one) is set to 1.

    A location without its own column in ``X`` -- for example 'other', whose
    dummy column is dropped before training -- leaves every location column
    at 0 instead of raising ``IndexError``.

    Returns the prediction in the same unit as the ``price`` target.
    """
    x = np.zeros(len(X.columns))
    # Filled positionally; assumes X's first three columns are total_sqft,
    # bath and bhk, in that order -- TODO confirm if X's columns change.
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    matches = np.flatnonzero(X.columns == location)
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a DataFrame with the training column names so the model receives
    # the same feature names it was fitted with.
    features = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(features)[0]

# Predictions of prices

In [None]:
df10.location.unique()

In [None]:
len(df10.location.unique())

# Prediction test 1 

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

# Prediction test 2 

In [None]:
predict_price('Nehru Nagar',1400,3,2)

# Prediction test 3 

In [None]:
predict_price('Padmanabhanagar',1250,2,3)