# Task: House Price Analysis and Prediction 
Submission: 15-07-2024 Submission date: 30-07-2024 

# Step-1  Data Cleaning and Exploration:


1.1 Import Libraries 

In [1]:
import numpy as np
import pandas as pd

1.2 Load the Dataset

In [3]:
# Path to the raw listings CSV, kept in one named place so it is easy to change.
dataset_path = r'D:\Data_science_Projects\Task-1\zameen-updated.csv'
df = pd.read_csv(dataset_path)

1.3 Inspect the Dataset


In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,property_id,location_id,page_url,property_type,price,location,city,province_name,latitude,longitude,baths,area,purpose,bedrooms,date_added,agency,agent,Area Type,Area Size,Area Category
0,237062,3325,https://www.zameen.com/Property/g_10_g_10_2_gr...,Flat,10000000,G-10,Islamabad,Islamabad Capital,33.67989,73.01264,2,4 Marla,For Sale,2,02-04-2019,,,Marla,4.0,0-5 Marla
1,346905,3236,https://www.zameen.com/Property/e_11_2_service...,Flat,6900000,E-11,Islamabad,Islamabad Capital,33.700993,72.971492,3,5.6 Marla,For Sale,3,05-04-2019,,,Marla,5.6,5-10 Marla
2,386513,764,https://www.zameen.com/Property/islamabad_g_15...,House,16500000,G-15,Islamabad,Islamabad Capital,33.631486,72.926559,6,8 Marla,For Sale,5,07-17-2019,,,Marla,8.0,5-10 Marla
3,656161,340,https://www.zameen.com/Property/islamabad_bani...,House,43500000,Bani Gala,Islamabad,Islamabad Capital,33.707573,73.151199,4,2 Kanal,For Sale,4,04-05-2019,,,Kanal,2.0,1-5 Kanal
4,841645,3226,https://www.zameen.com/Property/dha_valley_dha...,House,7000000,DHA Defence,Islamabad,Islamabad Capital,33.492591,73.301339,3,8 Marla,For Sale,3,07-10-2019,Easy Property,Muhammad Junaid Ceo Muhammad Shahid Director,Marla,8.0,5-10 Marla


In [5]:
# List the dtype of every column.
df.dtypes

property_id        int64
location_id        int64
page_url          object
property_type     object
price              int64
location          object
city              object
province_name     object
latitude         float64
longitude        float64
baths              int64
area              object
purpose           object
bedrooms           int64
date_added        object
agency            object
agent             object
Area Type         object
Area Size        float64
Area Category     object
dtype: object

1.4 Handle Missing Values:

In [6]:
# Count the missing values in each column.
df.isna().sum()

property_id          0
location_id          0
page_url             0
property_type        0
price                0
location             0
city                 0
province_name        0
latitude             0
longitude            0
baths                0
area                 0
purpose              0
bedrooms             0
date_added           0
agency           44071
agent            44072
Area Type            0
Area Size            0
Area Category        0
dtype: int64

1.5  Drop columns with many missing values

Drop the 'agency' and 'agent' columns because they have many missing values (44,071 and 44,072 respectively).

In [7]:
# Drop the two sparsely populated columns. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second run no longer raises KeyError because the columns are already gone.
df = df.drop(columns=['agency', 'agent'], errors='ignore')

In [8]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,property_id,location_id,page_url,property_type,price,location,city,province_name,latitude,longitude,baths,area,purpose,bedrooms,date_added,Area Type,Area Size,Area Category
0,237062,3325,https://www.zameen.com/Property/g_10_g_10_2_gr...,Flat,10000000,G-10,Islamabad,Islamabad Capital,33.67989,73.01264,2,4 Marla,For Sale,2,02-04-2019,Marla,4.0,0-5 Marla
1,346905,3236,https://www.zameen.com/Property/e_11_2_service...,Flat,6900000,E-11,Islamabad,Islamabad Capital,33.700993,72.971492,3,5.6 Marla,For Sale,3,05-04-2019,Marla,5.6,5-10 Marla
2,386513,764,https://www.zameen.com/Property/islamabad_g_15...,House,16500000,G-15,Islamabad,Islamabad Capital,33.631486,72.926559,6,8 Marla,For Sale,5,07-17-2019,Marla,8.0,5-10 Marla
3,656161,340,https://www.zameen.com/Property/islamabad_bani...,House,43500000,Bani Gala,Islamabad,Islamabad Capital,33.707573,73.151199,4,2 Kanal,For Sale,4,04-05-2019,Kanal,2.0,1-5 Kanal
4,841645,3226,https://www.zameen.com/Property/dha_valley_dha...,House,7000000,DHA Defence,Islamabad,Islamabad Capital,33.492591,73.301339,3,8 Marla,For Sale,3,07-10-2019,Marla,8.0,5-10 Marla


1.6 Fill missing values in numeric columns with median

In [9]:
# Fill gaps in every numeric column with that column's own median.
numeric_cols = df.select_dtypes(include='number').columns
# median() must be *called*: it returns a Series keyed by column name, which
# fillna uses to fill each column with its own median. Passing the bound
# method `median` (no parentheses) does not supply any median values.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

1.7 Drop rows with missing values in non-numeric columns

In [10]:
# Discard any row that has a missing value in a non-numeric column.
non_numeric_cols = df.select_dtypes(exclude='number').columns
df.dropna(subset=non_numeric_cols, inplace=True)

1.8 Identify and Handle Outliers

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering

Create New Features


In [84]:
# Age of the House:
# NOTE(review): this is the number of years between `current_year` and the
# listing's `date_added`, not years since construction — confirm that this is
# the intended meaning of "house age".

from datetime import datetime

current_year=2024

# The `.dt` accessor only works on datetime64 columns, and `date_added` is read
# from the CSV as plain strings such as '07-17-2019' (month-day-year). Convert
# it here when it is not already datetime so this cell does not depend on some
# other cell having been run first; the guard makes re-running a no-op.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'], format='%m-%d-%Y')

df['house_age'] = current_year - df['date_added'].dt.year
df.date_added

0        2019-02-04
1        2019-05-04
2        2019-07-17
4        2019-07-10
5        2019-04-05
            ...    
168441   2019-07-18
168442   2019-07-18
168443   2019-07-18
168444   2019-07-18
168445   2019-07-18
Name: date_added, Length: 145413, dtype: datetime64[ns]

In [81]:
# Price per unit of area (price divided by 'Area Size').
# NOTE(review): 'Area Size' is expressed in the unit named by 'Area Type'
# (both 'Marla' and 'Kanal' appear in the previewed rows), so this ratio mixes
# units across rows — consider converting to a single unit first.
# A zero area would produce +/-inf; treat it as missing so the ratio is NaN.
area_size = df['Area Size'].replace(0, np.nan)
df['price_per_area'] = df['price'] / area_size
df.price_per_area

0         2.500000e+06
1         1.232143e+06
2         2.062500e+06
4         8.750000e+05
5         2.156250e+07
              ...     
168441    2.760417e+06
168442    1.562500e+06
168443    2.812500e+06
168444    1.410256e+06
168445    9.574468e+05
Name: price_per_area, Length: 145413, dtype: float64

Encode Categorical Features

1. One-Hot Encoding:

In [85]:
# One-hot encode the nominal columns; drop_first=True omits the first level of
# each column so the indicator columns are not perfectly collinear.
one_hot_columns = ['property_type', 'location']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

2. Label Encoding:

In [87]:
from sklearn.preprocessing import LabelEncoder

# label_encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encode 'city' and 'province_name'
# Each column gets its own fitted encoder. Re-fitting one shared encoder
# overwrites its learned classes, so the integer codes of the first column
# could no longer be mapped back to their original labels.
label_encoders = {}
for column in ['city', 'province_name']:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward compatibility: the single shared encoder used previously ended up
# fitted on 'province_name', so keep that name bound to the same thing.
label_encoder = label_encoders['province_name']
