# Project Phase 1
### Zameen.com Price Predictor
L191101 - Mohammad Hasaan <br>
L192367 - Laiba Gohar <br>
L191003 - Saad Waseem
## Process Description
<ul>
    <li>Make clusters of AMENITIES, TYPE, and LOCATION</li>
    <ul>
        <li>Find the average of stdPRICE and stdAREA for each cluster</li>
        <li>Divide these values as stdPRICE / stdAREA to get PLOTVAL</li>
    </ul>
    <li>Use Multivariate regression using standardised variables AREA, PLOTVAL, and INTALLMENTVal to predict the PRICE of plots.</li>
</ul>

## Loading Data and Preprocessing
The following portion loads the cleaned data into a dataframe and then processes the frames so ML operations could be applied to them.
### Loading Data and Clustering
<b>L192367 - Laiba Gohar</b>

In [124]:
import pandas as pd
import os
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

In [138]:
path_clean_data = 'Data_Cleaned/Cleaned_Data.csv'
path_categories = [
    'Data_Cleaned/Categories/Category_Names_Amenities.csv',
    'Data_Cleaned/Categories/Category_Names_Type.csv',
    'Data_Cleaned/Categories/Category_Names_Location.csv'
]
clean_data_df = pd.read_csv(path_clean_data)
categories_names = {
    'Amenities': pd.read_csv(path_categories[0]),
    'Types': pd.read_csv(path_categories[1]),
    'Location': pd.read_csv(path_categories[2])
}

Define number of clusters needed for KMeans

In [139]:
n = len(clean_data_df['Location'].unique())
n

81

In [140]:
kmeans_clustering_obj = KMeans(n_clusters=n, random_state=42)
clean_data_df['Cluster'] = kmeans_clustering_obj.fit_predict(clean_data_df[['Location', 'Type', 'Amenities']])
clean_data_df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

clusters = [] # Clusters separation
for cluster in range(n):
    clusters.append(clean_data_df[clean_data_df['Cluster']==cluster])
    
# Create a place value of clusters
place_value = {}
for cluster in clusters:
    sum_price = cluster['Price'].sum()
    sum_area = cluster['Area'].sum()
    place_value[int(cluster['Cluster'].unique())] = sum_price / sum_area



In [142]:
place_value[0]

1450787.401574803

In [146]:
clean_data_df['Place_Val'] = [0] * len(clean_data_df)
clean_data_df.head()

Unnamed: 0,Type,Price,Location,Area,Creation date,Amenities,Installment value,Cluster,Place_Val
0,0,1900000,0,5.0,1 day ago,0,,27,0
1,0,4500000,0,5.0,1 day ago,0,,27,0
2,0,5900000,0,7.5,4 days ago,1,,27,0
3,0,5900000,0,7.5,4 days ago,1,,27,0
4,0,5900000,0,7.5,4 days ago,1,,27,0


In [147]:
for index, row in clean_data_df.iterrows():
    cluster_num = row['Cluster']
    clean_data_df.iloc[index, clean_data_df.columns.get_loc('Place_Val')] = place_value[cluster_num]

Standardising new values

In [168]:
pv_max = clean_data_df['Place_Val'].max()
pv_den = pv_max - clean_data_df['Place_Val'].min()
clean_data_df['Place_Val'] = (pv_max - clean_data_df['Place_Val']) / pv_den

In [169]:
stand_data_df = pd.read_csv('Data_Cleaned/Standardised_Data.csv')
stand_data_df['Place value'] = clean_data_df['Place_Val']
stand_data_df.drop(columns='Unnamed: 0', axis=1, inplace=True)
stand_data_df.to_csv('Data_Cleaned/Standardised_Data.csv')

### Multivariate Regression from Standardised Data
<b>L191003 - Saad Waseem</b>

        Price            Area
mean   2.326870e+07     22.419965
std    4.433047e+07    241.818415
min    1.000000e+05      0.000000
max    9.900000e+08  19200.000000


mean	1.422226e+06
std	6.614049e+05
min	6.244290e+04
max	2.644082e+06

In [192]:
reversals = {
    'price_values': {'max': 9.900000e+08, 'min': 1.000000e+05, 'std': 4.433047e+07, 'mean': 2.326870e+07},
    'area_values': {'max': 19200.000000, 'min': 0.000000, 'std': 241.818415, 'mean': 22.419965},
    'pv_values': {'max': 2.644082e+06, 'min': 6.244290e+04, 'std': 6.614049e+05, 'mean': 1.422226e+06}
}

def reverse(rev_name, prediction):
    u_std =  reversals[rev_name]['max'] - (prediction * (reversals[rev_name]['max'] - reversals[rev_name]['min'])) 
    return (u_std * reversals[rev_name]['std']) + reversals[rev_name]['mean']

def predict_price(A, PV, rm):
    return rm.coef_[0][0] * A + rm.coef_[0][1] * PV + rm.intercept_

In [179]:
from sklearn import linear_model

x = stand_data_df[['Area', 'Place value']]
y = stand_data_df[['Price']]

regression_model = linear_model.LinearRegression()
regression_model.fit(x, y)
print('Intercept: \n', regression_model.intercept_)
print('Coefficients: \n', regression_model.coef_)

Intercept: 
 [0.3459233]
Coefficients: 
 [[ 0.64923541 -0.03458693]]


In [197]:
standard_price = predict_price(0.999739583, 0.519754773, regression_model) # Area, Place Value
print('Predicted Value: ', reverse('price_values', standard_price))
print('Actual Value: ', reverse('price_values', 0.999739583))

Predicted Value:  [1.01316893e+15]
Actual Value:  15860879753831.158
Error:  [-9.97308047e+14]
