In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress all warnings for the rest of the session to keep cell output clean.
# NOTE(review): a blanket "ignore" also hides actionable library warnings
# (deprecations, dtype conversions); consider narrowing it by category.
warnings.filterwarnings("ignore")

# Plot styling and a three-decimal float format for DataFrame output.
sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
# Load the apartment listings from the CSV file into the working DataFrame.
df = pd.read_csv('apartments_final.csv')

In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            871 non-null    object 
 1   listing_id     871 non-null    object 
 2   zipcode        871 non-null    int64  
 3   monthly_rent   871 non-null    int64  
 4   bedrooms       871 non-null    int64  
 5   bathrooms      871 non-null    float64
 6   square_feet    871 non-null    float64
 7   walk_score     871 non-null    int64  
 8   transit_score  871 non-null    int64  
 9   deposit        871 non-null    int64  
 10  latitude       871 non-null    float64
 11  longitude      871 non-null    float64
 12  neighborhood   871 non-null    object 
 13  nbhd           871 non-null    object 
dtypes: float64(4), int64(6), object(4)
memory usage: 95.4+ KB


In [4]:
# Preview the first rows of the raw listings.
df.head()

Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.0,679.0,25,33,705,39.801,-86.126,Martindale-Brightwood,Martindale-Brightwood
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.0,2500.0,54,33,1795,39.89,-86.045,I-69 Fall Creek,I-69 Fall Creek
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.0,1439.0,29,32,1095,39.722,-86.121,University Heights,University Heights
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.5,1500.0,67,38,800,39.783,-86.113,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.0,1400.0,74,53,1600,39.781,-86.146,Chatham Arch,Chatham Arch


In [5]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Number of listings per neighborhood.
nbhd_counts = df.neighborhood.value_counts()

# 90th-percentile listing count across neighborhoods; used as the cut-off
# above which a neighborhood keeps its own label.
top_10p = np.percentile(nbhd_counts, q=90)

def nbhd_count(nbhd, counts=None, threshold=None):
    """Keep a neighborhood label only if it is among the most-listed ones.

    Parameters
    ----------
    nbhd : str
        Neighborhood label to classify.
    counts : pandas.Series, optional
        Listings per neighborhood, indexed by neighborhood label. Defaults to
        the notebook-level ``nbhd_counts``.
    threshold : float, optional
        Listing count a neighborhood must strictly exceed to keep its label.
        Defaults to the notebook-level ``top_10p``.

    Returns
    -------
    str
        ``nbhd`` when its listing count is strictly greater than the
        threshold, otherwise ``'Other'``.
    """
    if counts is None:
        counts = nbhd_counts
    if threshold is None:
        threshold = top_10p
    # Look the count up in the precomputed table instead of re-filtering the
    # whole DataFrame for every row; labels missing from the table count as 0.
    n_listings = counts.loc[nbhd] if nbhd in counts.index else 0
    return nbhd if n_listings > threshold else 'Other'

In [7]:
# Run every label in the nbhd column through nbhd_count: neighborhoods above
# the 90th-percentile listing count keep their name, the rest become 'Other'.
df['nbhd'] = df.nbhd.apply(nbhd_count)

In [8]:
# Preview the first rows again to check the regrouped nbhd column.
df.head()

Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.0,679.0,25,33,705,39.801,-86.126,Martindale-Brightwood,Other
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.0,2500.0,54,33,1795,39.89,-86.045,I-69 Fall Creek,Other
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.0,1439.0,29,32,1095,39.722,-86.121,University Heights,Other
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.5,1500.0,67,38,800,39.783,-86.113,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.0,1400.0,74,53,1600,39.781,-86.146,Chatham Arch,Other


In [9]:
# Re-check column dtypes and non-null counts after rewriting nbhd.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   url            871 non-null    object 
 1   listing_id     871 non-null    object 
 2   zipcode        871 non-null    int64  
 3   monthly_rent   871 non-null    int64  
 4   bedrooms       871 non-null    int64  
 5   bathrooms      871 non-null    float64
 6   square_feet    871 non-null    float64
 7   walk_score     871 non-null    int64  
 8   transit_score  871 non-null    int64  
 9   deposit        871 non-null    int64  
 10  latitude       871 non-null    float64
 11  longitude      871 non-null    float64
 12  neighborhood   871 non-null    object 
 13  nbhd           871 non-null    object 
dtypes: float64(4), int64(6), object(4)
memory usage: 95.4+ KB


In [10]:
# Display the frame (pandas abbreviates long frames to the first and last rows).
df

Unnamed: 0,url,listing_id,zipcode,monthly_rent,bedrooms,bathrooms,square_feet,walk_score,transit_score,deposit,latitude,longitude,neighborhood,nbhd
0,https://www.apartments.com/2372-beckwith-dr-in...,ze0jqwy,46218,751,2,1.000,679.000,25,33,705,39.801,-86.126,Martindale-Brightwood,Other
1,https://www.apartments.com/7491-n-shadeland-av...,s34dq64,46250,1795,3,2.000,2500.000,54,33,1795,39.890,-86.045,I-69 Fall Creek,Other
2,https://www.apartments.com/nice-3-bedroom-ranc...,n669z9m,46237,1095,3,1.000,1439.000,29,32,1095,39.722,-86.121,University Heights,Other
3,https://www.apartments.com/1102-n-oakland-ave-...,c4kr5zf,46201,1300,3,1.500,1500.000,67,38,800,39.783,-86.113,Near Eastside,Near Eastside
4,https://www.apartments.com/634-e-10th-st-india...,rl2dfp0,46202,1600,2,2.000,1400.000,74,53,1600,39.781,-86.146,Chatham Arch,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,https://www.apartments.com/regency-lihtc-india...,2zv7v1c,46227,1088,3,1.500,1398.000,38,22,1088,39.685,-86.155,Southdale,Other
867,https://www.apartments.com/scandia-apartments-...,9er4df5,46250,1679,3,2.500,1759.000,35,0,1679,39.922,-86.071,Castleton,Other
868,https://www.apartments.com/ayr-indianapolis-in...,fkdb8hv,46220,1954,2,2.000,1177.000,51,33,250,39.866,-86.117,Glendale,Other
869,https://www.apartments.com/the-lodge-at-trails...,1wp4yw6,46240,1395,2,2.000,1084.000,32,18,1395,39.925,-86.138,Nora Far Northside,Other


In [11]:
# List the distinct labels left in nbhd after grouping.
df.nbhd.unique()

array(['Other', 'Near Eastside', 'East Warren', 'Far Eastside',
       'Indianapolis', 'Snacks Guion Creek', 'South Perry', 'Augusta',
       'South Emerson', 'Eagle Creek'], dtype=object)

In [12]:

# NOTE(review): this mapping looks inert. Its keys are the integers 1-10, while
# the `df.nbhd.unique()` output shows the column holds only string labels, so
# nothing appears to be replaced. The one-hot encoding that follows works on
# the string labels directly. Confirm the intent before removing this cell.
# NOTE(review): `inplace=True` on the `df['nbhd']` selection may not write back
# to `df` under pandas Copy-on-Write; verify if this cell is ever made active.
df['nbhd'].replace({1: 'Near Eastside',
                      2: 'East Warren',
                      3: 'Far Eastside',
                      4: 'Indianapolis',
                      5: 'Snacks Guion Creek',
                      6: 'South Perry',
                      7: 'Augusta',
                      8: 'South Emerson',
                      9:  'Eagle Creek',
                      10: 'Other'}, inplace=True)

In [13]:
# One-hot encode the grouped neighborhood labels. The dtype is pinned because
# pandas >= 2.0 defaults to bool indicator columns, whereas this notebook's
# recorded outputs show 0/1 uint8 values.
df_dummy = pd.get_dummies(df['nbhd'], dtype='uint8')

In [14]:
# Preview the one-hot neighborhood indicator columns.
df_dummy.head()

Unnamed: 0,Augusta,Eagle Creek,East Warren,Far Eastside,Indianapolis,Near Eastside,Other,Snacks Guion Creek,South Emerson,South Perry
0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0


In [15]:
# Check the indicator columns' dtypes and non-null counts.
df_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Augusta             871 non-null    uint8
 1   Eagle Creek         871 non-null    uint8
 2   East Warren         871 non-null    uint8
 3   Far Eastside        871 non-null    uint8
 4   Indianapolis        871 non-null    uint8
 5   Near Eastside       871 non-null    uint8
 6   Other               871 non-null    uint8
 7   Snacks Guion Creek  871 non-null    uint8
 8   South Emerson       871 non-null    uint8
 9   South Perry         871 non-null    uint8
dtypes: uint8(10)
memory usage: 8.6 KB


In [16]:
# Append the indicator columns to the listings. Same-named columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# does not pile up duplicate dummy columns.
df = pd.concat(
    [df.drop(columns=df_dummy.columns, errors='ignore'), df_dummy],
    axis=1,
)

In [17]:
# List all columns, including the newly appended neighborhood indicators.
df.columns

Index(['url', 'listing_id', 'zipcode', 'monthly_rent', 'bedrooms', 'bathrooms',
       'square_feet', 'walk_score', 'transit_score', 'deposit', 'latitude',
       'longitude', 'neighborhood', 'nbhd', 'Augusta', 'Eagle Creek',
       'East Warren', 'Far Eastside', 'Indianapolis', 'Near Eastside', 'Other',
       'Snacks Guion Creek', 'South Emerson', 'South Perry'],
      dtype='object')

In [24]:
# Model inputs: listing attributes followed by the neighborhood indicators.
# NOTE(review): in the previewed rows `deposit` frequently equals
# `monthly_rent`, so it may leak the target; confirm it belongs here.
X = df.loc[:, [
    'zipcode', 'bedrooms', 'bathrooms', 'square_feet', 'deposit',
    'Augusta', 'Eagle Creek', 'East Warren', 'Far Eastside', 'Indianapolis',
    'Near Eastside', 'Other', 'Snacks Guion Creek', 'South Emerson',
    'South Perry',
]]
# Target: monthly rent, kept as a one-column DataFrame.
y = df.loc[:, ['monthly_rent']]
X

Unnamed: 0,zipcode,bedrooms,bathrooms,square_feet,deposit,Augusta,Eagle Creek,East Warren,Far Eastside,Indianapolis,Near Eastside,Other,Snacks Guion Creek,South Emerson,South Perry
0,46218,2,1.000,679.000,705,0,0,0,0,0,0,1,0,0,0
1,46250,3,2.000,2500.000,1795,0,0,0,0,0,0,1,0,0,0
2,46237,3,1.000,1439.000,1095,0,0,0,0,0,0,1,0,0,0
3,46201,3,1.500,1500.000,800,0,0,0,0,0,1,0,0,0,0
4,46202,2,2.000,1400.000,1600,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,46227,3,1.500,1398.000,1088,0,0,0,0,0,0,1,0,0,0
867,46250,3,2.500,1759.000,1679,0,0,0,0,0,0,1,0,0,0
868,46220,2,2.000,1177.000,250,0,0,0,0,0,0,1,0,0,0
869,46240,2,2.000,1084.000,1395,0,0,0,0,0,0,1,0,0,0


In [27]:
# Hold out 20% of the listings for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [29]:

# Random forest regressor: 5 trees, each at most 5 levels deep, with a fixed
# seed for reproducibility.
forest = RandomForestRegressor(n_estimators=5,
                               max_depth=5,
                               random_state=42)

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning, so flatten it before fitting.
forest.fit(X_train, y_train.values.ravel())

RandomForestRegressor(max_depth=5, n_estimators=5, random_state=42)

In [31]:
# Predict rents for the held-out listings.
y_pred = forest.predict(X_test)
# accuracy_score is a classification metric and does not apply to a regressor,
# so the call below stays disabled:
#print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [33]:

from sklearn import metrics

# Coefficient of determination (R^2) of the predictions on the held-out set.
print(metrics.r2_score(y_true=y_test, y_pred=y_pred))


0.949671746386456


In [39]:
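# NOTE: confusion_matrix is a classification metric. monthly_rent is a
# continuous target, so calling it on these predictions raises the ValueError
# shown below. The attempt is kept, disabled, for reference only.
# from sklearn.metrics import confusion_matrix
# confusion_matrix = confusion_matrix(y_test, y_pred)
# confusion_matrix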

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets