In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices raised by pandas / scikit-learn in later cells.
warnings.filterwarnings("ignore")

sns.set_style("whitegrid")
# Render floats with three decimal places in DataFrame output
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Load the apartment listings; the relative path is resolved against the working directory
df = pd.read_csv('apartments_final.csv')

In [3]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            871 non-null    object 
 1   listing_id     871 non-null    object 
 2   zipcode        871 non-null    int64  
 3   monthly_rent   871 non-null    int64  
 4   bedrooms       871 non-null    int64  
 5   bathrooms      871 non-null    float64
 6   square_feet    871 non-null    int64  
 7   walk_score     871 non-null    int64  
 8   transit_score  871 non-null    int64  
 9   deposit        871 non-null    int64  
 10  latitude       871 non-null    float64
 11  longitude      871 non-null    float64
 12  neighborhood   871 non-null    object 
 13  nbhd           871 non-null    object 
dtypes: float64(3), int64(7), object(4)
memory usage: 95.4+ KB


In [4]:
# Preview the first five rows
df.head()

Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.0,679,25,33,705,39.801,-86.126,Martindale-Brightwood,Martindale-Brightwood
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.0,2500,54,33,1795,39.89,-86.045,I-69 Fall Creek,I-69 Fall Creek
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.0,1439,29,32,1095,39.722,-86.121,University Heights,University Heights
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.5,1500,67,38,800,39.783,-86.113,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.0,1400,74,53,1600,39.781,-86.146,Chatham Arch,Chatham Arch


In [5]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Number of listings per neighborhood label
nbhd_counts = df.neighborhood.value_counts()

# 90th percentile of those per-neighborhood counts; used as the cut-off for
# deciding which neighborhoods keep their own label
top_10p = np.percentile(nbhd_counts.to_numpy(), 90)

def nbhd_count(nbhd, counts=None, threshold=None):
    """Return ``nbhd`` if it is a frequently listed neighborhood, else ``'Other'``.

    Parameters
    ----------
    nbhd : hashable
        Neighborhood label to classify.
    counts : pandas.Series, optional
        Number of listings per neighborhood, indexed by label. Defaults to
        the notebook-level ``nbhd_counts``.
    threshold : float, optional
        A label is kept only when its count is strictly greater than this
        value. Defaults to the notebook-level ``top_10p``.

    Returns
    -------
    object
        ``nbhd`` unchanged when its count exceeds ``threshold``; the string
        ``'Other'`` otherwise (including labels absent from ``counts``).
    """
    if counts is None:
        counts = nbhd_counts
    if threshold is None:
        threshold = top_10p

    # Look the count up in the precomputed table instead of re-filtering the
    # whole DataFrame on every call (this function is applied once per row).
    n_listings = counts.loc[nbhd] if nbhd in counts.index else 0
    if n_listings > threshold:
        return nbhd
    return 'Other'

In [7]:
# Recode the nbhd column with nbhd_count: labels above the top-10% count
# cut-off are kept, every other label becomes 'Other'
df['nbhd'] = df['nbhd'].map(nbhd_count)

In [8]:
# Preview the first five rows again to check the recoded nbhd column
df.head()

Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.0,679,25,33,705,39.801,-86.126,Martindale-Brightwood,Other
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.0,2500,54,33,1795,39.89,-86.045,I-69 Fall Creek,Other
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.0,1439,29,32,1095,39.722,-86.121,University Heights,Other
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.5,1500,67,38,800,39.783,-86.113,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.0,1400,74,53,1600,39.781,-86.146,Chatham Arch,Other


In [9]:
# List the distinct labels present in the nbhd column
df.nbhd.unique()

array(['Other', 'Near Eastside', 'East Warren', 'Far Eastside',
       'Indianapolis', 'Snacks Guion Creek', 'South Perry', 'Augusta',
       'South Emerson', 'Eagle Creek'], dtype=object)

In [10]:

# NOTE: the keys of this mapping are integer codes, while `nbhd` holds the
# neighborhood names themselves (see the unique values displayed above), so
# no value matches and the column content is left unchanged.
# The result is assigned back instead of calling replace(..., inplace=True)
# on the `df['nbhd']` selection: that chained in-place form is deprecated and
# does not update `df` when pandas Copy-on-Write is enabled.
df['nbhd'] = df['nbhd'].replace({1: 'Near Eastside',
                                 2: 'East Warren',
                                 3: 'Far Eastside',
                                 4: 'Indianapolis',
                                 5: 'Snacks Guion Creek',
                                 6: 'South Perry',
                                 7: 'Augusta',
                                 8: 'South Emerson',
                                 9: 'Eagle Creek',
                                 10: 'Other'})

In [11]:
# One indicator column per nbhd label. The dtype is pinned because the
# get_dummies default changed from uint8 to bool in pandas 2.0; uint8 keeps
# the 0/1 indicators regardless of the installed pandas version.
df_dummy = pd.get_dummies(df['nbhd'], dtype=np.uint8)

In [12]:
# Preview the first five rows of the indicator columns
df_dummy.head()

Unnamed: 0,Augusta,Eagle Creek,East Warren,Far Eastside,Indianapolis,Near Eastside,Other,Snacks Guion Creek,South Emerson,South Perry
0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0


In [13]:
# Append the indicator columns to the listings frame.
# Any indicator columns already present (from an earlier run of this cell)
# are dropped first, so re-executing the cell does not create duplicates.
df = pd.concat([df.drop(columns=df_dummy.columns, errors='ignore'), df_dummy],
               axis=1)

In [14]:
# Feature matrix: listing attributes followed by the neighborhood indicators
X = df.loc[:, [
    'zipcode', 'bedrooms', 'bathrooms', 'square_feet',
    'Augusta', 'Eagle Creek', 'East Warren', 'Far Eastside', 'Indianapolis',
    'Near Eastside', 'Other', 'Snacks Guion Creek', 'South Emerson',
    'South Perry',
]]
# Target: monthly rent, kept as a single-column DataFrame
y = df.loc[:, ['monthly_rent']]
X

Unnamed: 0,zipcode,bedrooms,bathrooms,square_feet,Augusta,Eagle Creek,East Warren,Far Eastside,Indianapolis,Near Eastside,Other,Snacks Guion Creek,South Emerson,South Perry
0,46218,2,1.000,679,0,0,0,0,0,0,1,0,0,0
1,46250,3,2.000,2500,0,0,0,0,0,0,1,0,0,0
2,46237,3,1.000,1439,0,0,0,0,0,0,1,0,0,0
3,46201,3,1.500,1500,0,0,0,0,0,1,0,0,0,0
4,46202,2,2.000,1400,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,46227,3,1.500,1398,0,0,0,0,0,0,1,0,0,0
867,46250,3,2.500,1759,0,0,0,0,0,0,1,0,0,0
868,46220,2,2.000,1177,0,0,0,0,0,0,1,0,0,0
869,46240,2,2.000,1084,0,0,0,0,0,0,1,0,0,0


In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:

## Create an instance of Random Forest Regressor
forest = RandomForestRegressor(n_estimators=100,
                               max_depth=5,
                               random_state=42)

# Fit the model.
# y_train is a single-column DataFrame (y was selected with double brackets),
# so it is flattened to 1-D here rather than relying on scikit-learn's
# implicit column-vector conversion and the warning that comes with it.
forest.fit(X_train, np.ravel(y_train))
 
#

#

RandomForestRegressor(max_depth=5, random_state=42)

In [19]:

#

## Measure model performance
y_pred = forest.predict(X_test)

# accuracy_score is a classification metric and rejects continuous targets,
# so the regression fit is summarised with R^2 and mean absolute error.
y_true = np.ravel(y_test)
print('R^2: %.3f' % forest.score(X_test, y_true))
print('MAE: %.3f' % np.mean(np.abs(y_true - y_pred)))