In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pathlib as Path
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Note - You must set up your own config file
from config import db_password

In [4]:
# Name the SQL database your are accessing
database = "RealLeads"

# Make your local connection to the database in PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database=database,
    user="postgres",
    password=db_password)

In [5]:
# Table selection Function
def table_select(table_name):
    table =f"select * from \"{table_name}\""
    dataFrame = pd.read_sql(table, conn)
    return dataFrame

In [6]:
# Names of the tables to be imported
prop_charac = table_select("prop_charac_clean")
pub_rec = table_select("pub_rec_clean")
sales_data = table_select("sales_data_clean")

In [7]:
prop_charac.head()

Unnamed: 0,MLSNumber,Address,BuildingName,Ownership,Senior_Community_YN,Condo/Coop_Assoc_YN,HOA_YN,AssociationFee,AssociationFeeFrequency,Structure_Type,...,Garage_YN,GarageSpaces,GarageFeatures,Parking,ExteriorFeatures,ExteriorMaterial,Main_Roof,Foundation,PorchDeck,SwimmingPoolType
0,DENC518086,2615 Pecksniff Rd,NONE AVAILABLE,FeeSimple,No,No,Yes,$15,Annually,Detached,...,Yes,1.0,,,"ExtensiveHardscape,Sidewalks,StoneRetainingWal...","BrickFront,VinylSiding",ArchitecturalShingle,,"Patios,Porches",
1,DENC518982,4938 S Tupelo Turn,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,"AluminumSiding,Brick,VinylSiding",Asphalt,,,
2,DENC512992,15 Kristina Ct,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,VinylSiding,,,,
3,DENC512104,3251 Champions Dr,,FeeSimple,No,No,Yes,$50,Annually,Interior Row/Townhouse,...,Yes,1.0,,,,Other,Shingle,,Patios,
4,DENC503480,3706 Lafayette St,,FeeSimple,No,No,No,,,Detached,...,No,,,,,Asbestos,,,,


In [8]:
prop_charac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 59 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   MLSNumber                  5576 non-null   object 
 1   Address                    5576 non-null   object 
 2   BuildingName               1139 non-null   object 
 3   Ownership                  5576 non-null   object 
 4   Senior_Community_YN        5576 non-null   object 
 5   Condo/Coop_Assoc_YN        5576 non-null   object 
 6   HOA_YN                     5576 non-null   object 
 7   AssociationFee             1582 non-null   object 
 8   AssociationFeeFrequency    1582 non-null   object 
 9   Structure_Type             5576 non-null   object 
 10  Acres                      5354 non-null   float64
 11  LotDimensions              4847 non-null   object 
 12  LotDescription             1455 non-null   object 
 13  FeeIncludes                661 non-null    objec

In [9]:
pub_rec.head()

Unnamed: 0,MLSNumber,Tax_ID,Address-truncated,PropertyCityState,Zip_Code,Zip4,CarrierRoute,PropDoNotMail,OwnerNames,OwnerLastName,...,BldgSqFtTotal,Stories,Bedrooms,Exterior,BsmtDesc,FireplaceTotal,GrgType,HeatDelivery,YearBuilt,YearRemod
0,DENC518086,08-038.30-119,2615 Pecksniff,"Wilmington, DE",19808,3026,C010,N,James Robinson,Robinson,...,1875.0,1.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,Att/BuiltIn/Bsmt,Hot Water/Steam,1958,0
1,DENC518982,08-036.10-081,4938 S Tupelo,"Wilmington, DE",19808,1026,C009,N,Xiaopeng Deng,Deng,...,1575.0,2.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,,Hot/Warm Air,1976,0
2,DENC512992,08-044.30-363,15 Kristina,"Wilmington, DE",19808,4063,C084,N,Robert F Walls,Walls,...,,2.0,2.0,"Aluminum, Vinyl",,0.0,,Heat Pump,1985,0
3,DENC512104,08-036.40-376,3251 Champions,"Wilmington, DE",19808,2601,C039,N,Michael J Downs,Downs,...,,2.0,2.0,Other,,1.0,Att/BuiltIn/Bsmt,Heat Pump,1985,0
4,DENC503480,07-041.10-071,3706 Lafayette,"Wilmington, DE",19808,6014,C001,N,Maria Corona,Corona,...,,1.0,3.0,Asbestos,,0.0,,Hot/Warm Air,1957,0


In [10]:
pub_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 58 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Tax_ID                   5576 non-null   object 
 2   Address-truncated        5576 non-null   object 
 3   PropertyCityState        5576 non-null   object 
 4   Zip_Code                 5576 non-null   object 
 5   Zip4                     5576 non-null   object 
 6   CarrierRoute             5572 non-null   object 
 7   PropDoNotMail            5573 non-null   object 
 8   OwnerNames               5246 non-null   object 
 9   OwnerLastName            5206 non-null   object 
 10  OwnerFirstName           4917 non-null   object 
 11  Owner2LastName           2359 non-null   object 
 12  Owner2FirstName          2359 non-null   object 
 13  Owner3LastName           89 non-null     object 
 14  Owner3FirstName         

In [11]:
sales_data.head()

Unnamed: 0,MLSNumber,Address,Status,Sold_Price,Sold_Price_less_Concession,Orig_List_Price,Current_List_Price,Days_on_Market,Previous_Days_on_Market,ListDate,StatusDate,Agreement_of_Sale_Date,SettledDate,Concessions_YN,Concessions_Remarks,SellerConcessionsAmount,FinalFinancing
0,DENC518086,2615 Pecksniff Rd,Closed,335000.0,335000.0,330000.0,330000.0,5.0,5.0,2020-12-11,2021-02-11,2020-12-16,2021-02-11,No,,0.0,FHA
1,DENC518982,4938 S Tupelo Turn,Closed,200000.0,200000.0,215500.0,210000.0,47.0,67.0,2021-01-07,2021-04-08,2021-02-21,2021-04-08,No,,0.0,Conventional
2,DENC512992,15 Kristina Ct,Closed,200000.0,200000.0,200000.0,200000.0,15.0,15.0,2020-12-28,2021-02-26,2021-01-09,2021-02-26,No,,0.0,Conventional
3,DENC512104,3251 Champions Dr,Closed,200000.0,200000.0,219900.0,214900.0,11.0,11.0,2020-10-27,2020-12-14,2020-11-05,2020-12-11,No,,0.0,Conventional
4,DENC503480,3706 Lafayette St,Closed,200000.0,200000.0,190000.0,190000.0,3.0,3.0,2020-06-20,2020-08-02,2020-06-21,2020-07-31,No,,0.0,Conventional


In [12]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   MLSNumber                   5576 non-null   object 
 1   Address                     5576 non-null   object 
 2   Status                      5576 non-null   object 
 3   Sold_Price                  5576 non-null   float64
 4   Sold_Price_less_Concession  5576 non-null   float64
 5   Orig_List_Price             5576 non-null   float64
 6   Current_List_Price          5576 non-null   float64
 7   Days_on_Market              5576 non-null   float64
 8   Previous_Days_on_Market     5576 non-null   float64
 9   ListDate                    5576 non-null   object 
 10  StatusDate                  5576 non-null   object 
 11  Agreement_of_Sale_Date      5576 non-null   object 
 12  SettledDate                 5576 non-null   object 
 13  Concessions_YN              5576 

In [13]:
# Drop Rows from prop_charac
prop_charac = prop_charac[['MLSNumber',
                            'Bedrooms',
                            'Baths',
                            'Basement_YN',
                            'Garage_YN',
                            'AboveGradeSqFt',
                            'BelowGradeSqFt',
                            'Condo/Coop_Assoc_YN',
                            'Central_Air_YN',
                            'HOA_YN',
                            'Age',
                          'Ownership',
                          'Structure_Type',
                          'InteriorSqFt',
                          'NumberofStories']]

prop_charac

Unnamed: 0,MLSNumber,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
0,DENC518086,4.0,2.0,Yes,Yes,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
1,DENC518982,3.0,2.0,Yes,No,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
2,DENC512992,2.0,2.0,Yes,No,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
3,DENC512104,2.0,3.0,Yes,Yes,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
4,DENC503480,3.0,2.0,Yes,No,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,3.0,2.0,Yes,Yes,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
5572,DENC520392,4.0,4.0,Yes,Yes,3175.0,636.0,No,Yes,Yes,1973.0,FeeSimple,Detached,3811.0,"Lower1,Main,Upper1"
5573,DENC2014038,4.0,3.0,Yes,Yes,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
5574,DENC2005484,4.0,3.0,Yes,Yes,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [14]:
# Drop Rows from pub_rec
pub_rec = pub_rec[['MLSNumber',
                    'Zip_Code',
                    'SchoolDistrict',
                    'AnnualTax',
                    'LotAcres',
                    'SubdivisionNeighborhood',
                    'Municipality',
                      'Lot',
                      'OwnerOccupied',
                      'TotalLandAsmt',
                  'TotalBldgAsmt',
                  'PropertyClass',
                  'YearBuilt']]

pub_rec

Unnamed: 0,MLSNumber,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,TotalLandAsmt,TotalBldgAsmt,PropertyClass,YearBuilt
0,DENC518086,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,12600,52600,Residential,1958
1,DENC518982,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,9900,45400,Residential,1976
2,DENC512992,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,9400,38000,Residential,1985
3,DENC512104,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,8600,57300,Residential,1985
4,DENC503480,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,8600,32300,Residential,1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,31300,99400,Residential,1958
5572,DENC520392,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,True,25200,132700,Residential,1973
5573,DENC2014038,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,30000,103600,Residential,1989
5574,DENC2005484,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,28500,117800,Residential,1992


In [15]:
# Drop Rows from sales_data

sales_data = sales_data[['MLSNumber',
                        'Days_on_Market',
                        'Orig_List_Price']]

sales_data

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price
0,DENC518086,5.0,330000.0
1,DENC518982,47.0,215500.0
2,DENC512992,15.0,200000.0
3,DENC512104,11.0,219900.0
4,DENC503480,3.0,190000.0
...,...,...,...
5571,DENC520114,5.0,595000.0
5572,DENC520392,25.0,639000.0
5573,DENC2014038,7.0,529900.0
5574,DENC2005484,25.0,620000.0


In [16]:
# Merge sales_data and pub_rec dataframes
merge_df = pd.merge(sales_data, pub_rec, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,TotalLandAsmt,TotalBldgAsmt,PropertyClass,YearBuilt
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,12600,52600,Residential,1958
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,9900,45400,Residential,1976
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,9400,38000,Residential,1985
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,8600,57300,Residential,1985
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,8600,32300,Residential,1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,31300,99400,Residential,1958
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,True,25200,132700,Residential,1973
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,30000,103600,Residential,1989
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,28500,117800,Residential,1992


In [17]:
# Merge merge_df and prop_charac dataframes
merge_df = pd.merge(merge_df, prop_charac, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,...,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,...,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,...,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,...,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,...,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,...,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,...,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,...,3175.0,636.0,No,Yes,Yes,1973.0,FeeSimple,Detached,3811.0,"Lower1,Main,Upper1"
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,...,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,...,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [18]:
# Info for all columns in new dataframe
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5576 entries, 0 to 5575
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Days_on_Market           5576 non-null   float64
 2   Orig_List_Price          5576 non-null   float64
 3   Zip_Code                 5576 non-null   object 
 4   SchoolDistrict           5576 non-null   object 
 5   AnnualTax                5576 non-null   int64  
 6   LotAcres                 5576 non-null   float64
 7   SubdivisionNeighborhood  5288 non-null   object 
 8   Municipality             5576 non-null   object 
 9   Lot                      5576 non-null   object 
 10  OwnerOccupied            5576 non-null   bool   
 11  TotalLandAsmt            5576 non-null   int64  
 12  TotalBldgAsmt            5576 non-null   int64  
 13  PropertyClass            5576 non-null   object 
 14  YearBuilt               

In [19]:
# Determine the number of unique values in each column.
merge_df.nunique()

MLSNumber                  5576
Days_on_Market              200
Orig_List_Price             618
Zip_Code                     15
SchoolDistrict                4
AnnualTax                  2540
LotAcres                    173
SubdivisionNeighborhood     619
Municipality                 12
Lot                         844
OwnerOccupied                 2
TotalLandAsmt               376
TotalBldgAsmt              1139
PropertyClass                 4
YearBuilt                   154
Bedrooms                      9
Baths                         7
Basement_YN                   2
Garage_YN                     2
AboveGradeSqFt              478
BelowGradeSqFt              611
Condo/Coop_Assoc_YN           2
Central_Air_YN                2
HOA_YN                        2
Age                         166
Ownership                     4
Structure_Type                9
InteriorSqFt               1133
NumberofStories              35
dtype: int64

In [20]:
# Find null rows
merge_df.isna().sum()

MLSNumber                    0
Days_on_Market               0
Orig_List_Price              0
Zip_Code                     0
SchoolDistrict               0
AnnualTax                    0
LotAcres                     0
SubdivisionNeighborhood    288
Municipality                 0
Lot                          0
OwnerOccupied                0
TotalLandAsmt                0
TotalBldgAsmt                0
PropertyClass                0
YearBuilt                    0
Bedrooms                     5
Baths                        5
Basement_YN                  0
Garage_YN                    0
AboveGradeSqFt               0
BelowGradeSqFt             182
Condo/Coop_Assoc_YN          0
Central_Air_YN               0
HOA_YN                       0
Age                          0
Ownership                    0
Structure_Type               0
InteriorSqFt                 0
NumberofStories              5
dtype: int64

In [21]:
# Drop rows with null values
merge_df = merge_df.dropna()
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,...,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,...,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,...,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,...,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,...,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,...,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5570,DENC2013538,5.0,589900.0,19707,Red Clay Consolidated,3503,0.82,Berkeley Ridge,Mill Creek Hundred,8,...,2425.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,2425.0,"Lower1,Main"
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,...,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,...,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,...,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [22]:
# Find max days on market
merge_df['Days_on_Market'].max()

655.0

In [23]:
# Check binning
pd.cut(merge_df['Days_on_Market'], bins=8).value_counts()

(-0.655, 81.875]     4931
(81.875, 163.75]      136
(163.75, 245.625]      20
(245.625, 327.5]       13
(327.5, 409.375]        4
(491.25, 573.125]       3
(573.125, 655.0]        2
(409.375, 491.25]       1
Name: Days_on_Market, dtype: int64

In [25]:
# Drop rows with condition over 120 days on market
merge_df.drop(merge_df[merge_df['Days_on_Market'] >= 120].index, inplace = True)

In [27]:
clean_df = merge_df

In [28]:
# Cut data
cut_labels = ['Less than 1 month', '1-2 months', '2-3 months', 'More than 3 months']
cut_bins = [0, 30, 60, 90, 120]
clean_df['DaysOnMarket_Buckets'] = pd.cut(clean_df['Days_on_Market'], bins=cut_bins, labels=cut_labels)
clean_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,...,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,...,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1",Less than 1 month
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,...,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1",1-2 months
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,...,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1",Less than 1 month
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,...,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1",Less than 1 month
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,...,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main,Less than 1 month
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5570,DENC2013538,5.0,589900.0,19707,Red Clay Consolidated,3503,0.82,Berkeley Ridge,Mill Creek Hundred,8,...,0.0,No,Yes,No,1957.0,FeeSimple,Detached,2425.0,"Lower1,Main",Less than 1 month
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,...,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main",Less than 1 month
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,...,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1",Less than 1 month
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,...,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1",Less than 1 month


In [29]:
# Change index to MLSNumber
clean_df = clean_df.set_index('MLSNumber')
clean_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,...,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1",Less than 1 month
DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,...,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1",1-2 months
DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,...,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1",Less than 1 month
DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,...,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1",Less than 1 month
DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,...,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main,Less than 1 month
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,19707,Red Clay Consolidated,3503,0.82,Berkeley Ridge,Mill Creek Hundred,8,True,...,0.0,No,Yes,No,1957.0,FeeSimple,Detached,2425.0,"Lower1,Main",Less than 1 month
DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,...,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main",Less than 1 month
DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,...,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1",Less than 1 month
DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,...,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1",Less than 1 month


In [30]:
# Change object columns to category
for label, content in clean_df.items():
    if pd.api.types.is_string_dtype(content):
        clean_df[label] = content.astype("category").cat.as_ordered()
        
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5019 entries, DENC518086 to DENC526982
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Days_on_Market           5019 non-null   float64 
 1   Orig_List_Price          5019 non-null   float64 
 2   Zip_Code                 5019 non-null   category
 3   SchoolDistrict           5019 non-null   category
 4   AnnualTax                5019 non-null   int64   
 5   LotAcres                 5019 non-null   float64 
 6   SubdivisionNeighborhood  5019 non-null   category
 7   Municipality             5019 non-null   category
 8   Lot                      5019 non-null   category
 9   OwnerOccupied            5019 non-null   bool    
 10  TotalLandAsmt            5019 non-null   int64   
 11  TotalBldgAsmt            5019 non-null   int64   
 12  PropertyClass            5019 non-null   category
 13  YearBuilt                5019 non-null   category
 14

In [31]:
# Change categories to numbers
for label,content in clean_df.items():
    if not pd.api.types.is_numeric_dtype(content):
        clean_df[label] = pd.Categorical(content).codes+1

In [32]:
# Drop rows with null values
clean_df = clean_df.dropna()
clean_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,13,4,2151,0.26,445,4,242,True,...,0.0,1,2,2,1958.0,3,1,2099.0,27,1
DENC518982,47.0,215500.0,13,4,1824,0.06,385,4,764,False,...,352.0,1,2,1,1976.0,3,4,1927.0,27,2
DENC512992,15.0,200000.0,13,4,1563,0.05,600,4,540,False,...,0.0,1,2,1,1985.0,3,4,1150.0,27,1
DENC512104,11.0,219900.0,13,4,2174,0.05,184,4,554,True,...,0.0,1,2,2,1985.0,3,4,1425.0,27,1
DENC503480,3.0,190000.0,13,4,1349,0.17,526,2,752,True,...,0.0,1,2,1,1957.0,3,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,1,4,3503,0.82,39,4,762,True,...,0.0,1,2,1,1957.0,3,1,2425.0,11,1
DENC520114,5.0,595000.0,1,4,4311,1.03,83,4,637,True,...,0.0,1,2,2,1958.0,3,1,3045.0,11,1
DENC2014038,7.0,529900.0,1,4,4407,0.39,470,4,436,True,...,341.0,1,2,1,1989.0,3,1,3191.0,12,1
DENC2005484,25.0,620000.0,1,4,4825,0.54,404,4,306,True,...,0.0,1,2,1,1992.0,3,1,3425.0,12,1


### Balanced Random Forest Classifier

In [75]:
y = clean_df["DaysOnMarket_Buckets"]
X = clean_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(3764, 27)

In [77]:
#Scaling Training Data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

#Scaliing the data

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [87]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=20)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=20)

In [88]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2590804719579142

In [38]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.07709852447919927, 'AnnualTax'),
 (0.07616671793941332, 'Lot'),
 (0.07450428732082429, 'Orig_List_Price'),
 (0.06780494179211782, 'SubdivisionNeighborhood'),
 (0.06753070320458465, 'TotalBldgAsmt'),
 (0.06625241493547084, 'TotalLandAsmt'),
 (0.06038315537933094, 'InteriorSqFt'),
 (0.06034543059011224, 'AboveGradeSqFt'),
 (0.05960326411524165, 'Age'),
 (0.05939896466588028, 'LotAcres'),
 (0.05846569415088071, 'YearBuilt'),
 (0.03642983448130925, 'NumberofStories'),
 (0.035988831343644216, 'Zip_Code'),
 (0.02802315549497175, 'Municipality'),
 (0.0247180716613994, 'BelowGradeSqFt'),
 (0.023038585921115833, 'Baths'),
 (0.020762904746055966, 'Bedrooms'),
 (0.020632190129483967, 'Structure_Type'),
 (0.01910812345292305, 'SchoolDistrict'),
 (0.01470551567071209, 'Garage_YN'),
 (0.010837720921867817, 'Central_Air_YN'),
 (0.01074289863810569, 'HOA_YN'),
 (0.010497593457806742, 'OwnerOccupied'),
 (0.009567466343795844, 'Basement_YN'),
 (0.00336930293707308, 'Condo/Coop_Assoc_YN'),
 (0.003127

In [91]:
# Drop columns from clean_df
updated_df = clean_df.drop(columns=["PropertyClass", "Condo/Coop_Assoc_YN", "Ownership", "Basement_YN"])
updated_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Central_Air_YN,HOA_YN,Age,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,13,4,2151,0.26,445,4,242,True,...,2,2099.0,0.0,2,2,1958.0,1,2099.0,27,1
DENC518982,47.0,215500.0,13,4,1824,0.06,385,4,764,False,...,1,1575.0,352.0,2,1,1976.0,4,1927.0,27,2
DENC512992,15.0,200000.0,13,4,1563,0.05,600,4,540,False,...,1,1150.0,0.0,2,1,1985.0,4,1150.0,27,1
DENC512104,11.0,219900.0,13,4,2174,0.05,184,4,554,True,...,2,1425.0,0.0,2,2,1985.0,4,1425.0,27,1
DENC503480,3.0,190000.0,13,4,1349,0.17,526,2,752,True,...,1,925.0,0.0,2,1,1957.0,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,1,4,3503,0.82,39,4,762,True,...,2,2425.0,0.0,2,1,1957.0,1,2425.0,11,1
DENC520114,5.0,595000.0,1,4,4311,1.03,83,4,637,True,...,2,3045.0,0.0,2,2,1958.0,1,3045.0,11,1
DENC2014038,7.0,529900.0,1,4,4407,0.39,470,4,436,True,...,2,2850.0,341.0,2,1,1989.0,1,3191.0,12,1
DENC2005484,25.0,620000.0,1,4,4825,0.54,404,4,306,True,...,2,3425.0,0.0,2,1,1992.0,1,3425.0,12,1


In [92]:
y = updated_df["DaysOnMarket_Buckets"]
X = updated_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(3764, 23)

In [94]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=50)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=50)

In [95]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2533392430454854

In [96]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.075096108776525, 'Orig_List_Price'),
 (0.07378429416771168, 'SubdivisionNeighborhood'),
 (0.07279210470271867, 'TotalBldgAsmt'),
 (0.07079100527205313, 'AnnualTax'),
 (0.06996774481315958, 'Lot'),
 (0.06580973745461814, 'TotalLandAsmt'),
 (0.06353623487448899, 'InteriorSqFt'),
 (0.06288743364728451, 'AboveGradeSqFt'),
 (0.06268242538079363, 'Age'),
 (0.061064732010417205, 'YearBuilt'),
 (0.059496695324892386, 'LotAcres'),
 (0.041077805228403715, 'Zip_Code'),
 (0.03711721622262042, 'NumberofStories'),
 (0.030142502480167813, 'Municipality'),
 (0.026794654845916848, 'BelowGradeSqFt'),
 (0.02174853416985237, 'Bedrooms'),
 (0.021081092911223728, 'SchoolDistrict'),
 (0.020933882485502358, 'Baths'),
 (0.019824922403716125, 'Structure_Type'),
 (0.01348608138681864, 'Garage_YN'),
 (0.01148002308541977, 'HOA_YN'),
 (0.010644235842014627, 'OwnerOccupied'),
 (0.007760532513680741, 'Central_Air_YN')]

In [97]:
# Drop columns from clean_df
updated_df2 = updated_df.drop(columns=["Central_Air_YN", "HOA_YN", "OwnerOccupied", "Garage_YN", "Structure_Type", "LotAcres", "Baths", "SchoolDistrict", "Bedrooms", "BelowGradeSqFt", "Municipality", "YearBuilt"])
updated_df2

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,AnnualTax,SubdivisionNeighborhood,Lot,TotalLandAsmt,TotalBldgAsmt,AboveGradeSqFt,Age,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DENC518086,5.0,330000.0,13,2151,445,242,12600,52600,2099.0,1958.0,2099.0,27,1
DENC518982,47.0,215500.0,13,1824,385,764,9900,45400,1575.0,1976.0,1927.0,27,2
DENC512992,15.0,200000.0,13,1563,600,540,9400,38000,1150.0,1985.0,1150.0,27,1
DENC512104,11.0,219900.0,13,2174,184,554,8600,57300,1425.0,1985.0,1425.0,27,1
DENC503480,3.0,190000.0,13,1349,526,752,8600,32300,925.0,1957.0,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,1,3503,39,762,25500,80700,2425.0,1957.0,2425.0,11,1
DENC520114,5.0,595000.0,1,4311,83,637,31300,99400,3045.0,1958.0,3045.0,11,1
DENC2014038,7.0,529900.0,1,4407,470,436,30000,103600,2850.0,1989.0,3191.0,12,1
DENC2005484,25.0,620000.0,1,4825,404,306,28500,117800,3425.0,1992.0,3425.0,12,1


In [98]:
y = updated_df2["DaysOnMarket_Buckets"]
X = updated_df2.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [99]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(3764, 11)

In [100]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=50)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=50)

In [101]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.33843017096457007

In [102]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.10562975525725964, 'SubdivisionNeighborhood'),
 (0.10429489362566118, 'Lot'),
 (0.10403810823800089, 'AnnualTax'),
 (0.10224686608849738, 'Age'),
 (0.1001066158387976, 'TotalBldgAsmt'),
 (0.09950125155069256, 'Orig_List_Price'),
 (0.09723499967073217, 'TotalLandAsmt'),
 (0.08807894765843374, 'InteriorSqFt'),
 (0.0842323466026657, 'AboveGradeSqFt'),
 (0.060163650592504295, 'Zip_Code'),
 (0.05447256487675489, 'NumberofStories')]

In [234]:
# Drop columns from clean_df
updated_df3 = updated_df2.drop(columns=["NumberofStories"])
updated_df3

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,AnnualTax,SubdivisionNeighborhood,Lot,TotalLandAsmt,TotalBldgAsmt,AboveGradeSqFt,Age,InteriorSqFt,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DENC518086,5.0,330000.0,13,2151,445,242,12600,52600,2099.0,1958.0,2099.0,1
DENC518982,47.0,215500.0,13,1824,385,764,9900,45400,1575.0,1976.0,1927.0,2
DENC512992,15.0,200000.0,13,1563,600,540,9400,38000,1150.0,1985.0,1150.0,1
DENC512104,11.0,219900.0,13,2174,184,554,8600,57300,1425.0,1985.0,1425.0,1
DENC503480,3.0,190000.0,13,1349,526,752,8600,32300,925.0,1957.0,925.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,1,3503,39,762,25500,80700,2425.0,1957.0,2425.0,1
DENC520114,5.0,595000.0,1,4311,83,637,31300,99400,3045.0,1958.0,3045.0,1
DENC2014038,7.0,529900.0,1,4407,470,436,30000,103600,2850.0,1989.0,3191.0,1
DENC2005484,25.0,620000.0,1,4825,404,306,28500,117800,3425.0,1992.0,3425.0,1


In [235]:
y = updated_df3["DaysOnMarket_Buckets"]
X = updated_df3.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [236]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(3764, 10)

In [237]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=10)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=10)

In [238]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.19979521363694777

In [233]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.11884002217898801, 'Age'),
 (0.11663004599182866, 'SubdivisionNeighborhood'),
 (0.11597739936168203, 'TotalBldgAsmt'),
 (0.1159106092845214, 'Orig_List_Price'),
 (0.11447086201297309, 'Lot'),
 (0.11412639280654856, 'AnnualTax'),
 (0.10878051918821857, 'TotalLandAsmt'),
 (0.09936989698597726, 'AboveGradeSqFt'),
 (0.09589425218926247, 'InteriorSqFt')]

###  Naive Random Oversampling

In [103]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 3321, 2: 3321, 3: 3321, 4: 3321, 0: 3321})

In [104]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=10000, random_state=1)

In [105]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.23295296095410284

In [47]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  0,   6,   0,   2,   0],
       [374, 302, 180, 111, 156],
       [ 17,  19,  17,  14,  17],
       [  4,   6,   3,   4,   6],
       [  2,   1,   6,   3,   5]], dtype=int64)

### EasyEnsembleClassifier

In [204]:
y = clean_df["DaysOnMarket_Buckets"]
X = clean_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [207]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_jobs=-1, random_state=100)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_jobs=-1, random_state=100)

In [208]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2682323532323533

In [141]:
# Drop columns from clean_df
updated_df = clean_df.drop(columns=["PropertyClass", "Condo/Coop_Assoc_YN", "Ownership", "Basement_YN"])
updated_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Central_Air_YN,HOA_YN,Age,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,13,4,2151,0.26,445,4,242,True,...,2,2099.0,0.0,2,2,1958.0,1,2099.0,27,1
DENC518982,47.0,215500.0,13,4,1824,0.06,385,4,764,False,...,1,1575.0,352.0,2,1,1976.0,4,1927.0,27,2
DENC512992,15.0,200000.0,13,4,1563,0.05,600,4,540,False,...,1,1150.0,0.0,2,1,1985.0,4,1150.0,27,1
DENC512104,11.0,219900.0,13,4,2174,0.05,184,4,554,True,...,2,1425.0,0.0,2,2,1985.0,4,1425.0,27,1
DENC503480,3.0,190000.0,13,4,1349,0.17,526,2,752,True,...,1,925.0,0.0,2,1,1957.0,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC2013538,5.0,589900.0,1,4,3503,0.82,39,4,762,True,...,2,2425.0,0.0,2,1,1957.0,1,2425.0,11,1
DENC520114,5.0,595000.0,1,4,4311,1.03,83,4,637,True,...,2,3045.0,0.0,2,2,1958.0,1,3045.0,11,1
DENC2014038,7.0,529900.0,1,4,4407,0.39,470,4,436,True,...,2,2850.0,341.0,2,1,1989.0,1,3191.0,12,1
DENC2005484,25.0,620000.0,1,4,4825,0.54,404,4,306,True,...,2,3425.0,0.0,2,1,1992.0,1,3425.0,12,1


In [194]:
y = updated_df["DaysOnMarket_Buckets"]
X = updated_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [195]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(3764, 23)

In [202]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_jobs=-1, random_state=50)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_jobs=-1, random_state=50)

In [203]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.24404662904662905