# I. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import time 
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn import metrics

from scipy import stats

import requests
import pickle
import joblib

import re
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

# II. Function that will be called by Preprocess methods

In [3]:
def get_average_area(x):
    regx_numbers = re.compile(r"[-+]?(\d*\.\d+|\d+)")
    x = regx_numbers.findall(x)
    if len(x) == 1:
        return (np.float(x[0]))
    elif len(x) == 2:
        return (np.float(x[0]) + np.float(x[1]))/2
    else:
        return -1

In [4]:
def get_outlier_range(df,cname):
    sorted(cname)
    llimit = df[cname].quantile(0.25)
    hlimit = df[cname].quantile(0.95)
    return llimit, hlimit

In [5]:
# Setting up stopwords for Text Processing
stopwords_list = set(stopwords.words('english'))

#Custom Stoplist
custome_stopwords= ["i","project","living","home",'apartment',"pune","me","my","myself","we","our","ours","ourselves","you",
                  "you're","you've","you'll","you'd","your","yours","yourself","yourselves","he","him","his","himself","she",
                  "she's","her","hers","herself","it","it's","its","itself","they","them","their","theirs","themselves",
                  "what","which","who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been",
                  "being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because",
                  "as","until","while","of","at","by","for","with","about","against","between","into","through","during",
                  "before","after","above","below","to","from","up","down","in","out","on","off","over","under","again",
                  "further","then","once","here","there","when","where","why","all","any","both","each","few","more","most",
                  "other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will",
                  "just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain",
                  "aren","couldn","didn","doesn","hadn","hasn","haven","isn","ma","mightn","mustn","needn","shan","shan't",
                  "shouldn","wasn","weren","won","rt","rt","qt","for","the","with","in","of","and","its","it","this","i",
                  "have","has","would","could","you","a","an","be","am","can","edushopper","will","to","on","is","by","ive",
                  "im","your","we","are","at","as","any","ebay","thank","hello","know","need","want","look","hi","sorry",
                  "http", "https","body","dear","hello","hi","thanks","sir","tomorrow","sent","send","see","there","welcome",
                  "what","well","us"]

stopwords_list.update(custome_stopwords)

In [6]:
# Function to preprocess the text
def text_preprocess(text):
    """
        text: a string    
        return: modified initial string
    """
    text = text.replace("\d+"," ") #removing digits
    text = re.sub(r"(?:\@|https?\://)\S+",'',text) #removing mentions and urls
    text = text.lower()
    text = re.sub('[0-9]+','',text) #removing numeric characters
    text = re.sub('[/(){}\[\]\|@,;!]',' ',text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub('[^0-9a-z #+_]',' ',text) # replace symbols which are in BAD_SYMBOLS_RE from text
    
    text = ' '.join([word for word in text.split() if word not in stopwords_list])
    text = text.strip()
    return text

In [7]:
def get_pos_counter(text,part_of_speech):
    """
    Returns the count for the given parts of speech tag
    
    NN - Noun
    VB - Verb
    JJ - Adjective
    RB - Adverb
    """
    word_list = nltk.word_tokenize(text.lower())
    clean_word_list = [word for word in word_list if word not in stopwords_list]
    text = nltk.Text(clean_word_list)
    tag_pos = nltk.pos_tag(text)
    counts = Counter(tag for word,tag in tag_pos)
    return counts[part_of_speech]

In [8]:
def get_interval(train_actual_values, train_predicted_values, pi=.60):
    '''
    Get a prediction interval for the regression model.
    
    INPUTS: 
        - actual_values (y_train)
        - predicted_values (prediction from x_train)
        - Prediction interval threshold (default = .95) 
    OUTPUT: 
        - Interval estimate
    '''
    
    #get standard deviation of prediction on the train dataset
    sum_of_square_error = np.sum((train_actual_values - train_predicted_values)**2)
    stdev = np.sqrt(sum_of_square_error / (len(train_actual_values) - 1))
    
    #get interval from standard deviation
    one_minus_pi = 1 - pi
    ppf_lookup = 1 - (one_minus_pi / 2) # If we need to calculate a 'Two-tail test' (i.e. We're concerned with values both greater and less than our mean) then we need to split the significance (i.e. our alpha value) because we're still using a calculation method for one-tail. The split in half symbolizes the significance level being appropriated to both tails. A 95% significance level has a 5% alpha; splitting the 5% alpha across both tails returns 2.5%. Taking 2.5% from 100% returns 97.5% as an input for the significance level.
    z_score = stats.norm.ppf(ppf_lookup) # This will return a value (that functions as a 'standard-deviation multiplier') marking where 95% (pi%) of data points would be contained if our data is a normal distribution.
    interval_value = z_score * stdev
    
    return interval_value

In [9]:
def get_prediction_interval(y_predicted_value, interval_value):
    
    #generate prediction interval lower and upper bound cs_24
    lower, upper = y_predicted_value - interval_value, y_predicted_value + interval_value
    return lower, upper

# III. Preprocess methods

In [10]:
def preprocess_data(df):
    df_final = pd.DataFrame()
    df_final['City'] = df['Location'].apply(lambda x : x.split(',')[0].strip())
    df_final['State'] = df['Location'].apply(lambda x : x.split(',')[1].strip())
    df_final['Country'] = df['Location'].apply(lambda x : x.split(',')[2].strip())
    
    regx_numbers = re.compile(r"[-+]?(\d*\.\d+|\d+)")
    df_final['PropertyType'] = df['Propert Type'].apply(lambda x : regx_numbers.findall(x)[0] 
                                                     if len(regx_numbers.findall(x)) > 0 else 0)
    
    df_final['SubArea'] = df['Sub-Area'].apply(lambda x: x.capitalize().strip())
    df_final['CompanyName'] = df['Company Name'].apply(lambda x : x.capitalize())
    df_final['TownshipSocietyName'] = df['TownShip Name/ Society Name'].apply(lambda x: x.capitalize())
    df_final['Description'] = df['Description'].apply(lambda x : x.capitalize())
    
    regx_numbers = re.compile(r"[-+]?(\d*\.\d+|\d+)")
    df_final['PropertyAreainSqFt'] = df['Property Area in Sq. Ft.'].apply(lambda x : get_average_area(str(x)))
    
    df_final['ClubHouse'] = df['ClubHouse'].apply(lambda x: x.lower().strip())
    df_final['School/UniversityInTownship'] = df['School / University in Township '].apply(lambda x: x.lower().strip())
    df_final['HospitalInTownShip'] = df['Hospital in TownShip'].apply(lambda x: x.lower().strip())
    df_final['MallInTownShip'] = df['Mall in TownShip'].apply(lambda x: x.lower().strip())
    df_final['ParkJoggingTrack'] = df['Park / Jogging track'].apply(lambda x: x.lower().strip())
    df_final['SwimmingPool'] = df['Swimming Pool'].apply(lambda x: x.lower().strip())
    df_final['Gym'] = df['Gym'].apply(lambda x: x.lower().strip())
        
    df_final['ClubHouse'] = df['ClubHouse'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['School/UniversityInTownship'] = df['School / University in Township '].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['HospitalInTownShip'] = df['Hospital in TownShip'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['MallInTownShip'] = df['Mall in TownShip'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['ParkJoggingTrack'] = df['Park / Jogging track'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['SwimmingPool'] = df['Swimming Pool'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    df_final['Gym'] = df['Gym'].apply(lambda x: x.lower().strip()).map({'yes': 1, 'no': 0})
    
    df_final = df_final.dropna()
    
    return df_final

In [13]:
def create_features(df):
    # Treating outliers in the numeric columns
    clist = ['PropertyAreainSqFt']
    
    for col in clist:
        lval, hval = get_outlier_range(df,col)
        df[col] = np.where(df[col] >hval, hval, df[col])
        df[col] = np.where(df[col] <lval, lval, df[col])
    
    
    filename = 'model/price_by_sub_area.pkl'
    with open(filename,'rb') as f:
        Price_by_Sub_Area = pickle.load(f)
        
    df['PriceBySubArea']= df['SubArea'].map(Price_by_Sub_Area)
    
    
    amenities_cloumns = ['ClubHouse',
                         'School/UniversityInTownship',
                         'HospitalInTownShip',
                         'MallInTownShip',
                         'ParkJoggingTrack',
                         'SwimmingPool',
                         'Gym']
    
    temp_df = df[amenities_cloumns]
    temp_df['AmenitiesScore'] = temp_df.sum(axis=1)
    df['AmenitiesScore'] = temp_df['AmenitiesScore']
    
    filename = 'model/price_by_amenities_score.pkl'
    with open(filename,'rb') as f:
        price_by_amenities_score = pickle.load(f)
        
    df['PriceByAmenitiesScore'] = df['AmenitiesScore'].map(price_by_amenities_score)
    
    '''
        NLP Text Processing to extract new features
    '''
    
    df['Description'] = df["Description"].astype(str).apply(text_preprocess)
    df['Noun_Counts'] = df['Description'].apply(lambda x: get_pos_counter(x,'NN'))
    df['Verb_Counts'] = df['Description'].apply(lambda x: (get_pos_counter(x,'VB')+get_pos_counter(x,'RB')))
    df['Adjective_Counts'] = df['Description'].apply(lambda x: get_pos_counter(x,'JJ'))
    
    fileName = 'model/count_vectorizer.pkl'
    with open(fileName,'rb') as f:
        count_vectorizer = pickle.load(f)
        
    X = count_vectorizer.transform(df['Description'])
    ngram_df = pd.DataFrame(X.toarray(),columns=count_vectorizer.get_feature_names_out())
    df_final = pd.concat([df.reset_index(drop=True),ngram_df.reset_index(drop=True)],axis=1)

    # selecting the final model ready features
    fileName = 'model/raw_features_mapping.pkl'
    with open(fileName,'rb') as f:
        feature_mapping = pickle.load(f)   
        
    fileName = 'model/feature_mapping.pkl'
    with open(fileName,'rb') as f:
        final_feature_list = pickle.load(f)
        
    # Removing price column as it is not available in test data
    final_feature_list.remove('Price_In_Lakhs')

    df_final = df_final.rename(columns=feature_mapping)
    df_final = df_final[final_feature_list]
    
    return df_final

# Test Inference:

In [42]:
# Loading the data
data= pd.read_excel('data/Pune Real Estate Data.xlsx')
data = data.drop(['Price in Millions','Price in lakhs'],axis=1)
print(data.shape)
data.head()

(200, 16)


Unnamed: 0,Sr. No.,Location,Sub-Area,Propert Type,Property Area in Sq. Ft.,Company Name,TownShip Name/ Society Name,Total TownShip Area in Acres,ClubHouse,School / University in Township,Hospital in TownShip,Mall in TownShip,Park / Jogging track,Swimming Pool,Gym,Description
0,1,"Pune, Maharashtra, India",Bavdhan,1 BHK,492,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Shapoorji Paloonji comunity located in the sub...
1,2,"Pune, Maharashtra, India",Bavdhan,2 BHK,774,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Township located near the lonavala hill...
2,3,"Pune, Maharashtra, India",Bavdhan,3 BHK,889,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Society is suitable for all aged group ...
3,4,"Pune, Maharashtra, India",Bavdhan,3 BHK Grand,1018,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha township are offering 3BHK grand prpoer...
4,5,"Pune, Maharashtra, India",Mahalunge,2BHK,743,Godrej Properties,Godrej Hills retreat,100.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,The area is a hub of prestigious schools like ...


In [43]:
df_preprocess = preprocess_data(data)
df_preprocess.head()

Unnamed: 0,City,State,Country,PropertyType,SubArea,CompanyName,TownshipSocietyName,Description,PropertyAreainSqFt,ClubHouse,School/UniversityInTownship,HospitalInTownShip,MallInTownShip,ParkJoggingTrack,SwimmingPool,Gym
0,Pune,Maharashtra,India,1,Bavdhan,Shapoorji paloonji,Vanaha,Shapoorji paloonji comunity located in the sub...,492.0,1,1,1,1,1,1,1
1,Pune,Maharashtra,India,2,Bavdhan,Shapoorji paloonji,Vanaha,Vanaha township located near the lonavala hill...,774.0,1,1,1,1,1,1,1
2,Pune,Maharashtra,India,3,Bavdhan,Shapoorji paloonji,Vanaha,Vanaha society is suitable for all aged group ...,889.0,1,1,1,1,1,1,1
3,Pune,Maharashtra,India,3,Bavdhan,Shapoorji paloonji,Vanaha,Vanaha township are offering 3bhk grand prpoer...,1018.0,1,1,1,1,1,1,1
4,Pune,Maharashtra,India,2,Mahalunge,Godrej properties,Godrej hills retreat,The area is a hub of prestigious schools like ...,743.0,1,1,1,1,1,1,1


In [44]:
df_preprocess.isnull().sum()

City                           0
State                          0
Country                        0
PropertyType                   0
SubArea                        0
CompanyName                    0
TownshipSocietyName            0
Description                    0
PropertyAreainSqFt             0
ClubHouse                      0
School/UniversityInTownship    0
HospitalInTownShip             0
MallInTownShip                 0
ParkJoggingTrack               0
SwimmingPool                   0
Gym                            0
dtype: int64

In [45]:
df_features = create_features(df_preprocess)
df_features.head()

Unnamed: 0,Property_Type,Club_House,School_University_In_Township,Hospital_In_Township,Mall_In_Township,Park_Jogging_Track,Swimming_Pool,Gym,Property_Area_in_SqFt,Price_By_SubArea,Amenities_Score,Price_By_Amenities_Score,Noun_Counts,Verb_Counts,Adjective_Counts,boasts_elegant,elegant_towers,every_day,great_community,mantra_gold,offering_bedroom,quality_specification,stories_offering,towers_stories,world_class
0,1,1,1,1,1,1,1,1,670.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
1,2,1,1,1,1,1,1,1,774.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
2,3,1,1,1,1,1,1,1,889.0,58.044,7,72.666667,9,1,3,0,0,0,0,0,0,0,0,0,0
3,3,1,1,1,1,1,1,1,1018.0,58.044,7,72.666667,8,1,3,0,0,0,0,0,0,0,0,0,0
4,2,1,1,1,1,1,1,1,743.0,73.555556,7,72.666667,12,1,6,0,0,0,0,0,0,0,0,0,0


In [71]:
df_features = df_features.drop([63,64])

In [46]:
payload = df_features.iloc[15].to_dict()
#Even if an integer of the type int64 is present in another object like a dictionary, 
#the TypeError exception will occur with the message “TypeError: Object of type int64 is not JSON serializable”
import json
# define a class to avoid that
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)
    
payload = json.dumps(payload,cls=NpEncoder)

In [47]:
payload

'{"Property_Type": "1", "Club_House": 1, "School_University_In_Township": 0, "Hospital_In_Township": 0, "Mall_In_Township": 0, "Park_Jogging_Track": 1, "Swimming_Pool": 1, "Gym": 1, "Property_Area_in_SqFt": 670.0, "Price_By_SubArea": 73.55555555555556, "Amenities_Score": 4, "Price_By_Amenities_Score": 97.87350515463918, "Noun_Counts": 9, "Verb_Counts": 1, "Adjective_Counts": 3, "boasts_elegant": 0, "elegant_towers": 0, "every_day": 0, "great_community": 0, "mantra_gold": 0, "offering_bedroom": 0, "quality_specification": 0, "stories_offering": 0, "towers_stories": 0, "world_class": 0}'

In [72]:
payload = df_features.iloc[64].to_dict()
payload = json.dumps(payload,cls=NpEncoder)

out =  requests.post(url='https://propoert-price-prediction-prod-fbbed88de872.herokuapp.com/PredictPrice',
                data=payload)
result = np.float(re.sub('[^A-Za-z0-9.]+', ' ', out.text)) 
result

81.98416164

In [73]:
output = []
for i in range(len(df_features)):
    payload = df_features.iloc[i].to_dict()
    payload = json.dumps(payload,cls=NpEncoder)
    
    out =  requests.post(url='https://propoert-price-prediction-prod-fbbed88de872.herokuapp.com/PredictPrice',
                    data=payload)
    result = np.float(re.sub('[^A-Za-z0-9.]+', ' ', out.text))  
    output.append(result)

ValueError: could not convert string to float: ' detail loc body Property Type msg value is not a valid integer type type error.integer '

In [74]:
import pickle
fileName = 'model/estimating_interval.pkl'
with open(fileName,'rb') as f:
    interval = pickle.load(f)

interval

15.038882262184757

In [75]:
# getting prediction intervals for the test data
lower_vet = []
upper_vet = []

for out in output:
    lower, upper =  get_prediction_interval(out, interval)
    lower_vet.append(lower)
    upper_vet.append(upper)

In [76]:
pd.DataFrame(zip(lower_vet,upper_vet,output),columns=['lower','upper','mean'])

Unnamed: 0,lower,upper,mean
0,26.009678,56.087443,41.048561
1,44.842812,74.920577,59.881695
2,64.325084,94.402849,79.363967
3,72.216431,102.294196,87.255314
4,55.401575,85.47934,70.440457
5,73.135544,103.213309,88.174427
6,68.742828,98.820593,83.781711
7,97.962169,128.039934,113.001052
8,64.038511,94.116275,79.077393
9,94.286532,124.364297,109.325415


In [77]:
df_features.loc[75]

Property_Type                            1
Club_House                               1
School_University_In_Township            0
Hospital_In_Township                     0
Mall_In_Township                         0
Park_Jogging_Track                       1
Swimming_Pool                            1
Gym                                      1
Property_Area_in_SqFt                670.0
Price_By_SubArea                      64.9
Amenities_Score                          4
Price_By_Amenities_Score         97.873505
Noun_Counts                              5
Verb_Counts                              0
Adjective_Counts                         3
boasts_elegant                           0
elegant_towers                           0
every_day                                0
great_community                          0
mantra_gold                              0
offering_bedroom                         0
quality_specification                    0
stories_offering                         0
towers_stor

In [69]:
df_preprocess.loc[63:64]

Unnamed: 0,City,State,Country,PropertyType,SubArea,CompanyName,TownshipSocietyName,Description,PropertyAreainSqFt,ClubHouse,School/UniversityInTownship,HospitalInTownShip,MallInTownShip,ParkJoggingTrack,SwimmingPool,Gym,PriceBySubArea,AmenitiesScore,PriceByAmenitiesScore,Noun_Counts,Verb_Counts,Adjective_Counts
63,Pune,Maharashtra,India,2.5,Kiwale,Unique properties,K ville,perfect blend luxury convenience connectivity ...,847.0,1,0,0,0,1,1,0,69.665,3,68.514348,11,0,8
64,Pune,Maharashtra,India,2.5,Kiwale,Unique properties,K ville,bhk residences every feature thoughtfully plan...,936.0,1,0,0,0,1,1,0,69.665,3,68.514348,10,2,4
