### Part 1 - Create and evaluate an initial model


__EXPLANATION__
Given the dataset, we are going to train a random forest model to predict New York City rent prices. 
Before training the model, the dataset goes through a data-cleaning step in which we reshape it into a form that is appropriate for training a model. 

In [57]:
import pandas as pd
import numpy as np
# Load the rental listings from a CSV expected in the working directory,
# then report the frame's (rows, columns).
data = pd.read_csv('rent_8.csv')
print(data.shape)

(20000, 15)


In [58]:
# Preview the first two rows to see what each column holds.
data.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,longitude,manager_id,photos,price,street_address,interest_level,num_desc_words
0,1.0,1,ec0d62637d75f9d7207d9f9bbc939a28,2016-05-10 03:31:32,MARVELOUS one bedroom in Upper East Side...sho...,E 95 Street,['Hardwood Floors'],40.7827,-73.9461,ed17ada77c1b94b4fb00d126a0c2ea06,[],1850,332 E 95 Street,1,30
1,1.0,2,0,2016-06-17 01:22:53,Gorgeous 2 bedroom (Queen size bedrooms) and 1...,East 32nd Street,"['Doorman', 'Multi-Level', 'Pre-War', 'Dogs Al...",40.7471,-73.9846,ad2928226654cf28722d0dee31e54040,['https://photos.renthop.com/2/7174539_b4c0ff6...,3775,7-9 East 32nd Street,1,59


In [59]:
# Transpose the two-row preview so every column becomes a row,
# which is easier to read for a wide frame.
data.head(2).transpose()

Unnamed: 0,0,1
bathrooms,1.0,1.0
bedrooms,1,2
building_id,ec0d62637d75f9d7207d9f9bbc939a28,0
created,2016-05-10 03:31:32,2016-06-17 01:22:53
description,MARVELOUS one bedroom in Upper East Side...sho...,Gorgeous 2 bedroom (Queen size bedrooms) and 1...
display_address,E 95 Street,East 32nd Street
features,['Hardwood Floors'],"['Doorman', 'Multi-Level', 'Pre-War', 'Dogs Al..."
latitude,40.7827,40.7471
longitude,-73.9461,-73.9846
manager_id,ed17ada77c1b94b4fb00d126a0c2ea06,ad2928226654cf28722d0dee31e54040


In [60]:
data.info()
# The last column of the output specifies the dtype of each column.
# Here, "object" is the dtype pandas reports for the text (string) columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bathrooms        20000 non-null  float64
 1   bedrooms         20000 non-null  int64  
 2   building_id      20000 non-null  object 
 3   created          20000 non-null  object 
 4   description      19391 non-null  object 
 5   display_address  19934 non-null  object 
 6   features         20000 non-null  object 
 7   latitude         20000 non-null  float64
 8   longitude        20000 non-null  float64
 9   manager_id       20000 non-null  object 
 10  photos           20000 non-null  object 
 11  price            20000 non-null  int64  
 12  street_address   19996 non-null  object 
 13  interest_level   20000 non-null  int64  
 14  num_desc_words   20000 non-null  int64  
dtypes: float64(3), int64(4), object(8)
memory usage: 2.3+ MB


In [61]:
# Keep only the numeric columns used for modelling (four predictors plus the
# `price` target); the text and id columns are left out.
num_data = data.loc[:, ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']]
num_data.head(2)

Unnamed: 0,bathrooms,bedrooms,longitude,latitude,price
0,1.0,1,-73.9461,40.7827,1850
1,1.0,2,-73.9846,40.7471,3775


In [62]:
# Per column: True if at least one value is missing.
print(num_data.isna().any())

bathrooms    False
bedrooms     False
longitude    False
latitude     False
price        False
dtype: bool


__FEATURE AND TARGET SEPARATION__

In [63]:
# Target is the rent price; every other numeric column is a feature.
y_train = num_data['price']
x_train = num_data.drop(columns='price')

In [64]:
# Build a 100-tree random forest regressor with out-of-bag scoring enabled,
# so `rf.oob_score_` is available once the model has been fitted.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
)

In [65]:
# Fit the forest on all rows of the uncleaned numeric data (no hold-out split is made here).
rf.fit(x_train, y_train)

In [66]:
# Score on the very rows the forest was fitted on, so this is an optimistic
# figure rather than an estimate of performance on unseen data.
r2 = rf.score(x_train, y_train)
print(f"{r2:.4f}")

0.8534


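__NOTE! For a regressor, the OOB score is the R² (coefficient of determination) computed from out-of-bag predictions, where each row is predicted using only the trees that did not see it during training. It is not a count of correctly predicted rows.__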

In [67]:
# Out-of-bag score from the most recent fit; "noisy" refers to the
# still-uncleaned data the forest was trained on.
noisy_oob = rf.oob_score_
print(f"OOB Score {noisy_oob:.4f}")

OOB Score 0.0487


__AVERAGE OOB SCORE AFTER PERFORMING 10 RUNS__


In [68]:
# Refit the forest several times and record each run's out-of-bag score.
# The forest is built without a fixed random_state, so every fit draws
# different bootstrap samples and the score varies slightly from run to run.
n_runs = 10
scores = []
for _ in range(n_runs):
    rf.fit(x_train, y_train)
    scores.append(rf.oob_score_)

In [69]:
# Mean out-of-bag score over the repeated fits on the uncleaned data.
avg_oob_score = np.asarray(scores).mean()
print(avg_oob_score)

0.046721490195516004


### Part 2 - Denoise the data

__EXPLANATION__
We perform exploratory data analysis (EDA) to look for anomalies in the data, such as implausible apartment prices and listings that are not actually located in New York.  

For the price, we assume that a plausible rent is above 900 dollars, which should cover an apartment with all the basic amenities such as doors, windows, and so on. For the upper limit, we keep only prices below 9,000 dollars. For latitude and longitude, we keep only the range that covers New York and drop everything else, such as coordinates of 0 or coordinates that do not belong to New York. 

In [70]:
# Summary statistics, used to spot implausible values such as
# negative prices or latitude/longitude far away from New York.
num_data.describe()

Unnamed: 0,bathrooms,bedrooms,longitude,latitude,price
count,20000.0,20000.0,20000.0,20000.0,20000.0
mean,1.20985,1.5427,-67.830865,40.514548,3538.713
std,0.499575,1.119214,28.338739,4.98745,8549.412
min,0.0,0.0,-118.271,0.0,-17000.0
25%,1.0,1.0,-73.9909,40.7266,2450.0
50%,1.0,1.0,-73.976,40.7521,3100.0
75%,1.0,2.0,-73.9523,40.7754,4000.0
max,7.0,7.0,88.2892,58.3218,1150000.0


In [71]:
# How many listings have each bathroom count.
print(num_data['bathrooms'].value_counts())

bathrooms
1.0    16025
2.0     3088
3.0      284
1.5      253
0.0      123
2.5      104
4.0       71
3.5       28
5.0       11
4.5        7
5.5        3
7.0        1
6.0        1
6.5        1
Name: count, dtype: int64


In [72]:
# How many listings have each bedroom count.
print(num_data['bedrooms'].value_counts())

bedrooms
1    6383
2    5916
0    3846
3    2947
4     770
5     111
6      26
7       1
Name: count, dtype: int64


In [73]:
# Keep only listings priced strictly between 900 and 9,000 dollars.
clean_data = num_data.loc[num_data['price'].gt(900) & num_data['price'].lt(9_000)]


In [74]:
# Re-check the summary after the price filter; latitude/longitude
# have not been filtered yet at this point.
clean_data.describe()

Unnamed: 0,bathrooms,bedrooms,longitude,latitude,price
count,19044.0,19044.0,19044.0,19044.0,19044.0
mean,1.172758,1.505461,-67.789548,40.498787,3405.922968
std,0.417111,1.090709,28.47512,5.007054,1347.038277
min,0.0,0.0,-118.271,0.0,950.0
25%,1.0,1.0,-73.991,40.7264,2495.0
50%,1.0,1.0,-73.9761,40.7516,3100.0
75%,1.0,2.0,-73.952,40.7752,3995.0
max,4.5,6.0,88.2892,58.3218,8995.0


In [75]:
# Keep only listings inside a latitude/longitude bounding box around New York.
# Both bounds are exclusive. No separate "coordinates != 0" check is needed:
# a zero latitude or longitude already lies outside this box, as does every
# other out-of-area point.
clean_data = clean_data[
    clean_data['latitude'].between(40.55, 40.94, inclusive='neither')
    & clean_data['longitude'].between(-74.1, -73.67, inclusive='neither')
]

In [76]:
# Confirm that the remaining rows now sit inside both the price
# range and the coordinate bounds.
clean_data.describe()

Unnamed: 0,bathrooms,bedrooms,longitude,latitude,price
count,18100.0,18100.0,18100.0,18100.0,18100.0
mean,1.172928,1.506464,-73.972336,40.750725,3407.803591
std,0.415807,1.089869,0.029602,0.040048,1349.236917
min,0.0,0.0,-74.0852,40.5712,950.0
25%,1.0,1.0,-73.9918,40.7279,2495.0
50%,1.0,1.0,-73.9779,40.7517,3100.0
75%,1.0,2.0,-73.9546,40.7741,3995.0
max,4.0,6.0,-73.7001,40.9154,8995.0


In [77]:
# Retrain a fresh forest on the denoised data and report its out-of-bag score.
y = clean_data['price']
x = clean_data.drop(columns='price')
rf = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
)
rf.fit(x, y)
clean_oob_r2 = rf.oob_score_
print(f"Validation OOB score {clean_oob_r2:.4f}")


Validation OOB score 0.8270


__AVERAGE OOB SCORE AFTER PERFORMING 10 RUNS__

In [78]:
# Refit the forest on the cleaned data several times and record each run's
# out-of-bag score. The forest is built without a fixed random_state, so
# each fit can produce a slightly different score.
n_runs = 10
scores = []
for _ in range(n_runs):
    rf.fit(x, y)
    scores.append(rf.oob_score_)

In [79]:
# Mean out-of-bag score over the repeated fits on the cleaned data.
avg_clean_oob = np.asarray(scores).mean()
print(avg_clean_oob)

0.8269900191170165
