# **Equity Residential Apartment Price Model Building**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import warnings
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

%matplotlib inline

warnings.filterwarnings('ignore')

In [None]:
# Read the apartment listings CSV from the ../input directory into a DataFrame.
csv_path = '../input/equity-residential-apartment-data/Equity_Apartments_Data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (rows, columns) of the loaded data.
df.shape

### Data Cleaning

In [None]:
# Drop the 'Unnamed: 0' column.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Count missing values per column before any cleaning.
df.isnull().sum()

Removing unnecessary columns.

In [None]:
# Columns removed here; nothing later in the notebook reads them.
unused_columns = ['URL', 'unit_id', 'building_id', 'Unique_ID', 'Amenity']
df = df.drop(columns=unused_columns)

In [None]:
# Columns whose missing values are replaced by that column's own mean.
# NOTE(review): several of these look like 0/1 amenity flags; mean-imputing them
# yields fractional values -- confirm this is intended (mode or 0 may fit better).
mean_imputed_columns = [
    'Northern_Exposure',
    'Southern_Exposure',
    'Eastern_Exposure',
    'Western_Exposure',
    'Balcony',
    'Walk_In_Closet',
    'Fireplace',
    'City_Skyline',
    'Kitchen_Island',
    'Stainless_Appliances',
    'Renovated',
    'Office_Space',
    'Days_Till_Available',
]

# Each column is filled with ITS OWN mean. The previous copy-pasted version
# filled Office_Space with the Fireplace mean and imputed Fireplace twice.
for column in mean_imputed_columns:
    df[column] = df[column].fillna(df[column].mean())

# Parse the two date columns so that date arithmetic works in later cells.
for date_column in ['Move_in_date', 'Day_Recorded']:
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# Re-check missing values per column after the mean imputation above.
df.isnull().sum()

The date column `Move_in_date` has some missing values. However, there is another date column, `Day_Recorded`, which shows when the record was made, and a column `Days_Till_Available`, which shows how many days remain until move-in. If we add `Days_Till_Available` to `Day_Recorded`, we recover `Move_in_date` without any null values.

In [None]:
# Reconstruct the move-in date as Day_Recorded + Days_Till_Available (in days).
# This is a single vectorised operation instead of a Python loop over rows: it is
# much faster and does not rely on the index being the default 0..n-1 labels,
# which the previous `df.Day_Recorded[i]` lookups required.
New_date = df['Day_Recorded'] + pd.to_timedelta(df['Days_Till_Available'], unit='D')

In [None]:
# Store the reconstructed move-in dates as a new column next to the original one.
df['Move_in_date(new)'] = New_date

In [None]:
# Show the original and reconstructed move-in dates side by side for comparison.
df[['Move_in_date','Move_in_date(new)']]

Here we can see that, in the rows shown, the old `Move_in_date` column is identical to the new `Move_in_date(new)` column, and the new column has no null values, so we no longer need the old column.

In [None]:
# Drop the original Move_in_date column now that 'Move_in_date(new)' replaces it.
# Rebinding `df` with errors='ignore' (instead of inplace=True) makes the cell
# idempotent: re-running it no longer raises a KeyError.
df = df.drop(columns='Move_in_date', errors='ignore')

In [None]:
# Final check of missing values per column after cleaning.
df.isnull().sum()

### Feature Engineering

In [None]:
# Correlation heatmap of the cleaned data.
# Only numeric/bool columns are correlated: `df` still holds string and datetime
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (older versions
# silently dropped them, so the plotted result is the same).
plt.figure(figsize=(20,15))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True);

In [None]:
# Columns left out of the modelling frame `df2`.
dropped_for_model = [
    'Northern_Exposure', 'Southern_Exposure', 'Eastern_Exposure', 'Western_Exposure',
    'Balcony', 'Fireplace', 'City_Skyline', 'Kitchen_Island', 'Stainless_Appliances',
    'Renovated', 'Office_Space', 'Address', 'Days_Till_Available',
    'Day_of_the_week_recorded', 'Apartment Name', 'Walk_In_Closet',
    'Estiamted_Vacancy', 'Units',
]
df2 = df.drop(columns=dropped_for_model)

In [None]:
# Preview the reduced frame.
df2.head()

In [None]:
# Distribution of Price: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(stat='density', kde=True) is its
# axes-level replacement. kde_kws={'cut': 3} follows the seaborn migration
# guidance for reproducing distplot's KDE extent.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(df2.Price, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
ax.set_title('Distribution of Price');

In [None]:
# Distribution of Beds: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(stat='density', kde=True) is its
# axes-level replacement.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(df2.Beds, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
ax.set_title('Distribution of Beds');

In [None]:
# Distribution of Baths: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(stat='density', kde=True) is its
# axes-level replacement.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(df2.Baths, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
ax.set_title('Distribution of Baths');

In [None]:
# Distribution of sq.ft: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(stat='density', kde=True) is its
# axes-level replacement.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(df2['sq.ft'], kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
ax.set_title('Distribution of sq.ft');

In [None]:
# Preview df2 again before encoding the City column.
df2.head()

In [None]:
# One-hot encode City, leaving out one city as the reference category.
# The omitted city is the last entry of value_counts(), i.e. the least frequent
# one under its default descending sort.
reference_city = df.City.value_counts().tail(1).index[0]
city_dummies = pd.get_dummies(df.City)
cities = city_dummies.drop(reference_city, axis=1)
cities.head()

In [None]:
# Append the city indicator columns to the modelling frame, side by side.
frames_to_join = [df2, cities]
df3 = pd.concat(frames_to_join, axis='columns')

In [None]:
# Preview the frame with the city indicator columns attached.
df3.head()

In [None]:
# Correlation heatmap of the modelling frame, including the city indicators.
# Only numeric/bool columns are correlated: `df3` still carries the string City
# column and datetime columns, on which DataFrame.corr() raises in pandas >= 2.0.
# 'bool' is included so the city dummies stay in the plot when get_dummies
# returns boolean columns.
plt.figure(figsize=(15,10))
numeric_df3 = df3.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df3.corr(), annot=True);

### Model Building

In [None]:
# List the available columns to pick the model features from.
df3.columns

In [None]:
# Model inputs: three numeric apartment attributes plus the city indicator columns.
# NOTE(review): the city names are hard-coded and must match the columns produced
# by the one-hot encoding cell -- verify if the data changes.
feature_columns = [
    'Beds', 'sq.ft', 'Floor',
    'Boston', 'Denver', 'Los Angeles', 'New York City',
    'Orange County', 'San Diego', 'San Francisco', 'Seattle',
    'Washington DC',
]
x = df3[feature_columns]

# Regression target.
y = df3['Price']

In [None]:
# Split features and target into train and test subsets; random_state=0 fixes the shuffle.
# NOTE(review): no test_size is passed, so scikit-learn's default proportion applies -- confirm that is intended.
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

In [None]:
# Random forest regressor with 100 trees; random_state=0 fixes the seed.
RFR = RandomForestRegressor(n_estimators = 100,random_state=0)

In [None]:
# Fit the forest on the training split.
RFR.fit(x_train,y_train)

In [None]:
# Evaluate on the held-out test split. For scikit-learn regressors, .score()
# returns R^2 (coefficient of determination), not a classification accuracy.
RFR.score(x_test,y_test)

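On the held-out test split, our model achieves an $R^2$ score of **0.9904** (99.04%). Note that `RandomForestRegressor.score` returns the coefficient of determination, not a classification accuracy.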