In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pathlib as Path
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Note - You must set up your own config file
from config import db_password

In [4]:
# Name the SQL database your are accessing
database = "RealLeads"

# Make your local connection to the database in PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database=database,
    user="postgres",
    password=db_password)

In [5]:
# Table selection Function
def table_select(table_name):
    table =f"select * from \"{table_name}\""
    dataFrame = pd.read_sql(table, conn)
    return dataFrame

In [6]:
# Names of the tables to be imported
prop_charac = table_select("prop_charac_clean")
pub_rec = table_select("pub_rec_clean")
sales_data = table_select("sales_data_clean")

In [7]:
prop_charac.head()

Unnamed: 0,MLSNumber,Address,BuildingName,Ownership,Senior_Community_YN,Condo/Coop_Assoc_YN,HOA_YN,AssociationFee,AssociationFeeFrequency,Structure_Type,...,Garage_YN,GarageSpaces,GarageFeatures,Parking,ExteriorFeatures,ExteriorMaterial,Main_Roof,Foundation,PorchDeck,SwimmingPoolType
0,DENC518086,2615 Pecksniff Rd,NONE AVAILABLE,FeeSimple,No,No,Yes,$15,Annually,Detached,...,Yes,1.0,,,"ExtensiveHardscape,Sidewalks,StoneRetainingWal...","BrickFront,VinylSiding",ArchitecturalShingle,,"Patios,Porches",
1,DENC518982,4938 S Tupelo Turn,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,"AluminumSiding,Brick,VinylSiding",Asphalt,,,
2,DENC512992,15 Kristina Ct,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,VinylSiding,,,,
3,DENC512104,3251 Champions Dr,,FeeSimple,No,No,Yes,$50,Annually,Interior Row/Townhouse,...,Yes,1.0,,,,Other,Shingle,,Patios,
4,DENC503480,3706 Lafayette St,,FeeSimple,No,No,No,,,Detached,...,No,,,,,Asbestos,,,,


In [8]:
prop_charac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 59 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   MLSNumber                  5576 non-null   object 
 1   Address                    5576 non-null   object 
 2   BuildingName               1139 non-null   object 
 3   Ownership                  5576 non-null   object 
 4   Senior_Community_YN        5576 non-null   object 
 5   Condo/Coop_Assoc_YN        5576 non-null   object 
 6   HOA_YN                     5576 non-null   object 
 7   AssociationFee             1582 non-null   object 
 8   AssociationFeeFrequency    1582 non-null   object 
 9   Structure_Type             5576 non-null   object 
 10  Acres                      5354 non-null   float64
 11  LotDimensions              4847 non-null   object 
 12  LotDescription             1455 non-null   object 
 13  FeeIncludes                661 non-null    objec

In [9]:
pub_rec.head()

Unnamed: 0,MLSNumber,Tax_ID,Address-truncated,PropertyCityState,Zip_Code,Zip4,CarrierRoute,PropDoNotMail,OwnerNames,OwnerLastName,...,BldgSqFtTotal,Stories,Bedrooms,Exterior,BsmtDesc,FireplaceTotal,GrgType,HeatDelivery,YearBuilt,YearRemod
0,DENC518086,08-038.30-119,2615 Pecksniff,"Wilmington, DE",19808,3026,C010,N,James Robinson,Robinson,...,1875.0,1.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,Att/BuiltIn/Bsmt,Hot Water/Steam,1958,0
1,DENC518982,08-036.10-081,4938 S Tupelo,"Wilmington, DE",19808,1026,C009,N,Xiaopeng Deng,Deng,...,1575.0,2.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,,Hot/Warm Air,1976,0
2,DENC512992,08-044.30-363,15 Kristina,"Wilmington, DE",19808,4063,C084,N,Robert F Walls,Walls,...,,2.0,2.0,"Aluminum, Vinyl",,0.0,,Heat Pump,1985,0
3,DENC512104,08-036.40-376,3251 Champions,"Wilmington, DE",19808,2601,C039,N,Michael J Downs,Downs,...,,2.0,2.0,Other,,1.0,Att/BuiltIn/Bsmt,Heat Pump,1985,0
4,DENC503480,07-041.10-071,3706 Lafayette,"Wilmington, DE",19808,6014,C001,N,Maria Corona,Corona,...,,1.0,3.0,Asbestos,,0.0,,Hot/Warm Air,1957,0


In [10]:
pub_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 58 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Tax_ID                   5576 non-null   object 
 2   Address-truncated        5576 non-null   object 
 3   PropertyCityState        5576 non-null   object 
 4   Zip_Code                 5576 non-null   object 
 5   Zip4                     5576 non-null   object 
 6   CarrierRoute             5572 non-null   object 
 7   PropDoNotMail            5573 non-null   object 
 8   OwnerNames               5246 non-null   object 
 9   OwnerLastName            5206 non-null   object 
 10  OwnerFirstName           4917 non-null   object 
 11  Owner2LastName           2359 non-null   object 
 12  Owner2FirstName          2359 non-null   object 
 13  Owner3LastName           89 non-null     object 
 14  Owner3FirstName         

In [11]:
sales_data.head()

Unnamed: 0,MLSNumber,Address,Status,Sold_Price,Sold_Price_less_Concession,Orig_List_Price,Current_List_Price,Days_on_Market,Previous_Days_on_Market,ListDate,StatusDate,Agreement_of_Sale_Date,SettledDate,Concessions_YN,Concessions_Remarks,SellerConcessionsAmount,FinalFinancing
0,DENC518086,2615 Pecksniff Rd,Closed,335000.0,335000.0,330000.0,330000.0,5.0,5.0,2020-12-11,2021-02-11,2020-12-16,2021-02-11,No,,0.0,FHA
1,DENC518982,4938 S Tupelo Turn,Closed,200000.0,200000.0,215500.0,210000.0,47.0,67.0,2021-01-07,2021-04-08,2021-02-21,2021-04-08,No,,0.0,Conventional
2,DENC512992,15 Kristina Ct,Closed,200000.0,200000.0,200000.0,200000.0,15.0,15.0,2020-12-28,2021-02-26,2021-01-09,2021-02-26,No,,0.0,Conventional
3,DENC512104,3251 Champions Dr,Closed,200000.0,200000.0,219900.0,214900.0,11.0,11.0,2020-10-27,2020-12-14,2020-11-05,2020-12-11,No,,0.0,Conventional
4,DENC503480,3706 Lafayette St,Closed,200000.0,200000.0,190000.0,190000.0,3.0,3.0,2020-06-20,2020-08-02,2020-06-21,2020-07-31,No,,0.0,Conventional


In [12]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   MLSNumber                   5576 non-null   object 
 1   Address                     5576 non-null   object 
 2   Status                      5576 non-null   object 
 3   Sold_Price                  5576 non-null   float64
 4   Sold_Price_less_Concession  5576 non-null   float64
 5   Orig_List_Price             5576 non-null   float64
 6   Current_List_Price          5576 non-null   float64
 7   Days_on_Market              5576 non-null   float64
 8   Previous_Days_on_Market     5576 non-null   float64
 9   ListDate                    5576 non-null   object 
 10  StatusDate                  5576 non-null   object 
 11  Agreement_of_Sale_Date      5576 non-null   object 
 12  SettledDate                 5576 non-null   object 
 13  Concessions_YN              5576 

In [13]:
# Drop Rows from prop_charac
prop_charac = prop_charac[['MLSNumber',
                            'Bedrooms',
                            'Baths',
                            'Basement_YN',
                            'Garage_YN',
                            'AboveGradeSqFt',
                            'BelowGradeSqFt',
                            'Condo/Coop_Assoc_YN',
                            'Central_Air_YN',
                            'HOA_YN',
                            'Age',
                          'Ownership',
                          'Structure_Type',
                          'InteriorSqFt',
                          'NumberofStories']]

prop_charac

Unnamed: 0,MLSNumber,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
0,DENC518086,4.0,2.0,Yes,Yes,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
1,DENC518982,3.0,2.0,Yes,No,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
2,DENC512992,2.0,2.0,Yes,No,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
3,DENC512104,2.0,3.0,Yes,Yes,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
4,DENC503480,3.0,2.0,Yes,No,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,3.0,2.0,Yes,Yes,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
5572,DENC520392,4.0,4.0,Yes,Yes,3175.0,636.0,No,Yes,Yes,1973.0,FeeSimple,Detached,3811.0,"Lower1,Main,Upper1"
5573,DENC2014038,4.0,3.0,Yes,Yes,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
5574,DENC2005484,4.0,3.0,Yes,Yes,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [14]:
# Drop Rows from pub_rec
pub_rec = pub_rec[['MLSNumber',
                    'Zip_Code',
                    'SchoolDistrict',
                    'AnnualTax',
                    'LotAcres',
                    'SubdivisionNeighborhood',
                    'Municipality',
                      'Lot',
                      'OwnerOccupied',
                      'TotalLandAsmt',
                  'TotalBldgAsmt',
                  'PropertyClass',
                  'YearBuilt']]

pub_rec

Unnamed: 0,MLSNumber,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,TotalLandAsmt,TotalBldgAsmt,PropertyClass,YearBuilt
0,DENC518086,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,12600,52600,Residential,1958
1,DENC518982,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,9900,45400,Residential,1976
2,DENC512992,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,9400,38000,Residential,1985
3,DENC512104,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,8600,57300,Residential,1985
4,DENC503480,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,8600,32300,Residential,1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,31300,99400,Residential,1958
5572,DENC520392,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,True,25200,132700,Residential,1973
5573,DENC2014038,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,30000,103600,Residential,1989
5574,DENC2005484,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,28500,117800,Residential,1992


In [15]:
# Drop Rows from sales_data

sales_data = sales_data[['MLSNumber',
                        'Days_on_Market',
                        'Orig_List_Price']]

sales_data

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price
0,DENC518086,5.0,330000.0
1,DENC518982,47.0,215500.0
2,DENC512992,15.0,200000.0
3,DENC512104,11.0,219900.0
4,DENC503480,3.0,190000.0
...,...,...,...
5571,DENC520114,5.0,595000.0
5572,DENC520392,25.0,639000.0
5573,DENC2014038,7.0,529900.0
5574,DENC2005484,25.0,620000.0


In [16]:
# Merge sales_data and pub_rec dataframes
merge_df = pd.merge(sales_data, pub_rec, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,TotalLandAsmt,TotalBldgAsmt,PropertyClass,YearBuilt
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,12600,52600,Residential,1958
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,9900,45400,Residential,1976
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,9400,38000,Residential,1985
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,8600,57300,Residential,1985
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,8600,32300,Residential,1957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,31300,99400,Residential,1958
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,True,25200,132700,Residential,1973
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,30000,103600,Residential,1989
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,28500,117800,Residential,1992


In [17]:
# Merge merge_df and prop_charac dataframes
merge_df = pd.merge(merge_df, prop_charac, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,...,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,...,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,...,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,...,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,...,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,...,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,...,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,...,3175.0,636.0,No,Yes,Yes,1973.0,FeeSimple,Detached,3811.0,"Lower1,Main,Upper1"
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,...,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,...,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [18]:
# Info for all columns in new dataframe
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5576 entries, 0 to 5575
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Days_on_Market           5576 non-null   float64
 2   Orig_List_Price          5576 non-null   float64
 3   Zip_Code                 5576 non-null   object 
 4   SchoolDistrict           5576 non-null   object 
 5   AnnualTax                5576 non-null   int64  
 6   LotAcres                 5576 non-null   float64
 7   SubdivisionNeighborhood  5288 non-null   object 
 8   Municipality             5576 non-null   object 
 9   Lot                      5576 non-null   object 
 10  OwnerOccupied            5576 non-null   bool   
 11  TotalLandAsmt            5576 non-null   int64  
 12  TotalBldgAsmt            5576 non-null   int64  
 13  PropertyClass            5576 non-null   object 
 14  YearBuilt               

In [19]:
# Change index to MLSNumber
merge_df = merge_df.set_index('MLSNumber')
merge_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,Mill Creek Hundred,119,True,...,2099.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,2099.0,"Main,Upper1"
DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,Mill Creek Hundred,81,False,...,1575.0,352.0,No,Yes,No,1976.0,FeeSimple,Interior Row/Townhouse,1927.0,"Main,Upper1"
DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,Mill Creek Hundred,363,False,...,1150.0,0.0,No,Yes,No,1985.0,FeeSimple,Interior Row/Townhouse,1150.0,"Main,Upper1"
DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,Mill Creek Hundred,376,True,...,1425.0,0.0,No,Yes,Yes,1985.0,FeeSimple,Interior Row/Townhouse,1425.0,"Main,Upper1"
DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,Christiana Hundred,71,True,...,925.0,0.0,No,Yes,No,1957.0,FeeSimple,Detached,925.0,Main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,Mill Creek Hundred,46,True,...,3045.0,0.0,No,Yes,Yes,1958.0,FeeSimple,Detached,3045.0,"Lower1,Main"
DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,Christiana Hundred,54,True,...,3175.0,636.0,No,Yes,Yes,1973.0,FeeSimple,Detached,3811.0,"Lower1,Main,Upper1"
DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,Mill Creek Hundred,27,True,...,2850.0,341.0,No,Yes,No,1989.0,FeeSimple,Detached,3191.0,"Lower1,Main,Upper1"
DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,Mill Creek Hundred,16,True,...,3425.0,0.0,No,Yes,No,1992.0,FeeSimple,Detached,3425.0,"Lower1,Main,Upper1"


In [20]:
# Change object columns to category
for label, content in merge_df.items():
    if pd.api.types.is_string_dtype(content):
        merge_df[label] = content.astype("category").cat.as_ordered()
        
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5576 entries, DENC518086 to DENC526982
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Days_on_Market           5576 non-null   float64 
 1   Orig_List_Price          5576 non-null   float64 
 2   Zip_Code                 5576 non-null   category
 3   SchoolDistrict           5576 non-null   category
 4   AnnualTax                5576 non-null   int64   
 5   LotAcres                 5576 non-null   float64 
 6   SubdivisionNeighborhood  5288 non-null   category
 7   Municipality             5576 non-null   category
 8   Lot                      5576 non-null   category
 9   OwnerOccupied            5576 non-null   bool    
 10  TotalLandAsmt            5576 non-null   int64   
 11  TotalBldgAsmt            5576 non-null   int64   
 12  PropertyClass            5576 non-null   category
 13  YearBuilt                5576 non-null   category
 14

In [21]:
# Change categories to numbers
for label,content in merge_df.items():
    if not pd.api.types.is_numeric_dtype(content):
        merge_df[label] = pd.Categorical(content).codes+1

In [22]:
# Determine the number of unique values in each column.
merge_df.nunique()

Days_on_Market              200
Orig_List_Price             618
Zip_Code                     15
SchoolDistrict                4
AnnualTax                  2540
LotAcres                    173
SubdivisionNeighborhood     620
Municipality                 12
Lot                         844
OwnerOccupied                 2
TotalLandAsmt               376
TotalBldgAsmt              1139
PropertyClass                 4
YearBuilt                   154
Bedrooms                      9
Baths                         7
Basement_YN                   2
Garage_YN                     2
AboveGradeSqFt              478
BelowGradeSqFt              611
Condo/Coop_Assoc_YN           2
Central_Air_YN                2
HOA_YN                        2
Age                         166
Ownership                     4
Structure_Type                9
InteriorSqFt               1133
NumberofStories              36
dtype: int64

In [23]:
# Find null rows
merge_df.isna().sum()

Days_on_Market               0
Orig_List_Price              0
Zip_Code                     0
SchoolDistrict               0
AnnualTax                    0
LotAcres                     0
SubdivisionNeighborhood      0
Municipality                 0
Lot                          0
OwnerOccupied                0
TotalLandAsmt                0
TotalBldgAsmt                0
PropertyClass                0
YearBuilt                    0
Bedrooms                     5
Baths                        5
Basement_YN                  0
Garage_YN                    0
AboveGradeSqFt               0
BelowGradeSqFt             182
Condo/Coop_Assoc_YN          0
Central_Air_YN               0
HOA_YN                       0
Age                          0
Ownership                    0
Structure_Type               0
InteriorSqFt                 0
NumberofStories              0
dtype: int64

In [24]:
# Drop rows with null values
clean_df = merge_df.dropna()
clean_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,15,4,2151,0.26,449,4,293,True,...,2099.0,0.0,1,2,2,1958.0,3,1,2099.0,27
DENC518982,47.0,215500.0,15,4,1824,0.06,388,4,825,False,...,1575.0,352.0,1,2,1,1976.0,3,4,1927.0,27
DENC512992,15.0,200000.0,15,4,1563,0.05,610,4,592,False,...,1150.0,0.0,1,2,1,1985.0,3,4,1150.0,27
DENC512104,11.0,219900.0,15,4,2174,0.05,185,4,606,True,...,1425.0,0.0,1,2,2,1985.0,3,4,1425.0,27
DENC503480,3.0,190000.0,15,4,1349,0.17,535,2,813,True,...,925.0,0.0,1,2,1,1957.0,3,1,925.0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,1,4,4311,1.03,84,4,690,True,...,3045.0,0.0,1,2,2,1958.0,3,1,3045.0,11
DENC520392,25.0,639000.0,1,4,5208,0.75,0,2,751,True,...,3175.0,636.0,1,2,2,1973.0,3,1,3811.0,12
DENC2014038,7.0,529900.0,1,4,4407,0.39,476,4,488,True,...,2850.0,341.0,1,2,1,1989.0,3,1,3191.0,12
DENC2005484,25.0,620000.0,1,4,4825,0.54,407,4,358,True,...,3425.0,0.0,1,2,1,1992.0,3,1,3425.0,12


In [25]:
# Find max days on market
clean_df['Days_on_Market'].max()

655.0

In [26]:
# Check binning
pd.cut(clean_df['Days_on_Market'], bins=6).value_counts()

(-0.655, 109.167]     5262
(109.167, 218.333]      97
(218.333, 327.5]        20
(327.5, 436.667]         5
(545.833, 655.0]         4
(436.667, 545.833]       1
Name: Days_on_Market, dtype: int64

In [29]:
# Cut data
cut_labels = ['Less than 1 month', '1-2 months', '3-4 months', '5-6 months', 'More than 6 months']
cut_bins = [0, 30, 60, 120, 180, 655]
clean_df['DaysOnMarket_Buckets'] = pd.cut(clean_df['Days_on_Market'], bins=cut_bins, labels=cut_labels)
clean_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,15,4,2151,0.26,449,4,293,True,...,0.0,1,2,2,1958.0,3,1,2099.0,27,Less than 1 month
DENC518982,47.0,215500.0,15,4,1824,0.06,388,4,825,False,...,352.0,1,2,1,1976.0,3,4,1927.0,27,1-2 months
DENC512992,15.0,200000.0,15,4,1563,0.05,610,4,592,False,...,0.0,1,2,1,1985.0,3,4,1150.0,27,Less than 1 month
DENC512104,11.0,219900.0,15,4,2174,0.05,185,4,606,True,...,0.0,1,2,2,1985.0,3,4,1425.0,27,Less than 1 month
DENC503480,3.0,190000.0,15,4,1349,0.17,535,2,813,True,...,0.0,1,2,1,1957.0,3,1,925.0,26,Less than 1 month
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,1,4,4311,1.03,84,4,690,True,...,0.0,1,2,2,1958.0,3,1,3045.0,11,Less than 1 month
DENC520392,25.0,639000.0,1,4,5208,0.75,0,2,751,True,...,636.0,1,2,2,1973.0,3,1,3811.0,12,Less than 1 month
DENC2014038,7.0,529900.0,1,4,4407,0.39,476,4,488,True,...,341.0,1,2,1,1989.0,3,1,3191.0,12,Less than 1 month
DENC2005484,25.0,620000.0,1,4,4825,0.54,407,4,358,True,...,0.0,1,2,1,1992.0,3,1,3425.0,12,Less than 1 month


In [30]:
# Change object columns to category
for label, content in clean_df.items():
    if pd.api.types.is_string_dtype(content):
        clean_df[label] = content.astype("category").cat.as_ordered()
        
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5389 entries, DENC518086 to DENC526982
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Days_on_Market           5389 non-null   float64 
 1   Orig_List_Price          5389 non-null   float64 
 2   Zip_Code                 5389 non-null   int8    
 3   SchoolDistrict           5389 non-null   int8    
 4   AnnualTax                5389 non-null   int64   
 5   LotAcres                 5389 non-null   float64 
 6   SubdivisionNeighborhood  5389 non-null   int16   
 7   Municipality             5389 non-null   int8    
 8   Lot                      5389 non-null   int16   
 9   OwnerOccupied            5389 non-null   bool    
 10  TotalLandAsmt            5389 non-null   int64   
 11  TotalBldgAsmt            5389 non-null   int64   
 12  PropertyClass            5389 non-null   int8    
 13  YearBuilt                5389 non-null   int16   
 14

In [31]:
# Change categories to numbers
for label,content in clean_df.items():
    if not pd.api.types.is_numeric_dtype(content):
        clean_df[label] = pd.Categorical(content).codes+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [32]:
# Drop rows with null values
clean_df = clean_df.dropna()
clean_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Municipality,Lot,OwnerOccupied,...,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN,HOA_YN,Age,Ownership,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DENC518086,5.0,330000.0,15,4,2151,0.26,449,4,293,True,...,0.0,1,2,2,1958.0,3,1,2099.0,27,1
DENC518982,47.0,215500.0,15,4,1824,0.06,388,4,825,False,...,352.0,1,2,1,1976.0,3,4,1927.0,27,2
DENC512992,15.0,200000.0,15,4,1563,0.05,610,4,592,False,...,0.0,1,2,1,1985.0,3,4,1150.0,27,1
DENC512104,11.0,219900.0,15,4,2174,0.05,185,4,606,True,...,0.0,1,2,2,1985.0,3,4,1425.0,27,1
DENC503480,3.0,190000.0,15,4,1349,0.17,535,2,813,True,...,0.0,1,2,1,1957.0,3,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,1,4,4311,1.03,84,4,690,True,...,0.0,1,2,2,1958.0,3,1,3045.0,11,1
DENC520392,25.0,639000.0,1,4,5208,0.75,0,2,751,True,...,636.0,1,2,2,1973.0,3,1,3811.0,12,1
DENC2014038,7.0,529900.0,1,4,4407,0.39,476,4,488,True,...,341.0,1,2,1,1989.0,3,1,3191.0,12,1
DENC2005484,25.0,620000.0,1,4,4825,0.54,407,4,358,True,...,0.0,1,2,1,1992.0,3,1,3425.0,12,1


### Scale the data

In [33]:
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [34]:
data_scaled = data_scaler.fit_transform(clean_df)
data_scaled

array([[-0.33963144,  0.06957091,  1.32906584, ...,  0.10825249,
         0.82318884, -0.31566134],
       [ 0.86227452, -0.94939156,  1.32906584, ..., -0.08726183,
         0.82318884,  1.24212158],
       [-0.05346336, -1.08732971,  1.32906584, ..., -0.97048636,
         0.82318884, -0.31566134],
       ...,
       [-0.28239783,  1.84852809, -1.50686101, ...,  1.34954102,
        -1.16001594, -0.31566134],
       [ 0.23270473,  2.65034921, -1.50686101, ...,  1.61553142,
        -1.16001594, -0.31566134],
       [ 0.97674175,  0.24755562, -1.50686101, ..., -0.06111747,
         0.82318884,  1.24212158]])

### Balanced Random Forest Classifier

In [35]:
y = clean_df["DaysOnMarket_Buckets"]
X = clean_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(4041, 27)

In [37]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [38]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.21571088651241324

In [39]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  4,   4,   0,   3,   0,   0],
       [505, 349,  51, 116,  87,  71],
       [ 27,  15,  12,  13,   7,   6],
       [  7,  15,   3,   7,  11,   9],
       [  2,   1,   1,   2,   4,   6],
       [  3,   2,   0,   2,   2,   1]], dtype=int64)

In [40]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.36      0.59      0.01      0.46      0.21        11
          1       0.90      0.30      0.78      0.45      0.48      0.22      1179
          2       0.18      0.15      0.96      0.16      0.38      0.13        80
          3       0.05      0.13      0.90      0.07      0.35      0.11        52
          4       0.04      0.25      0.92      0.06      0.48      0.21        16
          5       0.01      0.10      0.93      0.02      0.31      0.09        10

avg / total       0.80      0.28      0.80      0.40      0.47      0.21      1348



In [41]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.08292194265523947, 'AnnualTax'),
 (0.07539964046700023, 'TotalBldgAsmt'),
 (0.07512253391782221, 'Orig_List_Price'),
 (0.06438101574594304, 'TotalLandAsmt'),
 (0.06434206258826894, 'SubdivisionNeighborhood'),
 (0.06159303120032891, 'YearBuilt'),
 (0.061368601532566944, 'AboveGradeSqFt'),
 (0.06090712442547736, 'Lot'),
 (0.05919369702691247, 'InteriorSqFt'),
 (0.056714456594656903, 'LotAcres'),
 (0.055872988372448924, 'Age'),
 (0.03842438141995549, 'Zip_Code'),
 (0.03625901291280783, 'NumberofStories'),
 (0.026724158800691938, 'Structure_Type'),
 (0.0265417187849939, 'Municipality'),
 (0.026035209420740717, 'Baths'),
 (0.024141331917035967, 'Bedrooms'),
 (0.02023002037641753, 'SchoolDistrict'),
 (0.01838574651708037, 'BelowGradeSqFt'),
 (0.017327866550684863, 'Garage_YN'),
 (0.012472620302858142, 'OwnerOccupied'),
 (0.00973947359673613, 'HOA_YN'),
 (0.008547155974961564, 'Basement_YN'),
 (0.007128417018631313, 'Central_Air_YN'),
 (0.00506990262188867, 'Ownership'),
 (0.0039167176484

In [47]:
# Drop columns from clean_df
updated_df = clean_df.drop(columns=["Central_Air_YN", "PropertyClass", "Condo/Coop_Assoc_YN", "OwnerOccupied", "Basement_YN", "Garage_YN", "HOA_YN", "Ownership", "YearBuilt", "LotAcres", "BelowGradeSqFt", "Municipality", "SchoolDistrict", "Zip_Code"])
updated_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,AnnualTax,SubdivisionNeighborhood,Lot,TotalLandAsmt,TotalBldgAsmt,Bedrooms,Baths,AboveGradeSqFt,Age,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DENC518086,5.0,330000.0,2151,449,293,12600,52600,4.0,2.0,2099.0,1958.0,1,2099.0,27,1
DENC518982,47.0,215500.0,1824,388,825,9900,45400,3.0,2.0,1575.0,1976.0,4,1927.0,27,2
DENC512992,15.0,200000.0,1563,610,592,9400,38000,2.0,2.0,1150.0,1985.0,4,1150.0,27,1
DENC512104,11.0,219900.0,2174,185,606,8600,57300,2.0,3.0,1425.0,1985.0,4,1425.0,27,1
DENC503480,3.0,190000.0,1349,535,813,8600,32300,3.0,2.0,925.0,1957.0,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,4311,84,690,31300,99400,3.0,2.0,3045.0,1958.0,1,3045.0,11,1
DENC520392,25.0,639000.0,5208,0,751,25200,132700,4.0,4.0,3175.0,1973.0,1,3811.0,12,1
DENC2014038,7.0,529900.0,4407,476,488,30000,103600,4.0,3.0,2850.0,1989.0,1,3191.0,12,1
DENC2005484,25.0,620000.0,4825,407,358,28500,117800,4.0,3.0,3425.0,1992.0,1,3425.0,12,1


In [48]:
y = updated_df["DaysOnMarket_Buckets"]
X = updated_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(4041, 13)

In [50]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=50)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=50)

In [51]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2827403054083994

###  Naive Random Oversampling

In [52]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 3504, 3: 3504, 2: 3504, 5: 3504, 0: 3504, 4: 3504})

In [53]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [54]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.24151438197792882

In [55]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  3,   0,   3,   1,   1,   3],
       [517,  59, 298,  75, 104, 115],
       [ 32,   0,  19,   5,  18,  25],
       [ 12,   2,   9,   5,   4,  11],
       [  4,   0,   6,   1,   2,   4],
       [  1,   0,   0,   2,   0,   7]], dtype=int64)

### EasyEnsembleClassifier

In [60]:
y = clean_df["DaysOnMarket_Buckets"]
X = clean_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

In [61]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [62]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.30104083833165785

In [63]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  7,   1,   0,   0,   1,   2],
       [768,  56,  60,  35, 156,  93],
       [ 46,   3,   4,   6,  21,  19],
       [ 17,   1,   2,   3,  10,  10],
       [  5,   2,   1,   0,   7,   2],
       [  1,   1,   1,   0,   1,   6]], dtype=int64)

In [64]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.64      0.37      0.02      0.49      0.24        11
          1       0.88      0.05      0.96      0.09      0.21      0.04      1168
          2       0.06      0.04      0.95      0.05      0.20      0.03        99
          3       0.07      0.07      0.97      0.07      0.26      0.06        43
          4       0.04      0.41      0.86      0.07      0.59      0.34        17
          5       0.05      0.60      0.91      0.08      0.74      0.53        10

avg / total       0.77      0.06      0.95      0.09      0.23      0.05      1348



In [65]:
# Drop columns from clean_df
updated_df = clean_df.drop(columns=["Central_Air_YN", "PropertyClass", "Condo/Coop_Assoc_YN", "OwnerOccupied", "Basement_YN", "Garage_YN", "HOA_YN", "Ownership", "YearBuilt", "LotAcres", "BelowGradeSqFt", "Municipality", "SchoolDistrict", "Zip_Code"])
updated_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,AnnualTax,SubdivisionNeighborhood,Lot,TotalLandAsmt,TotalBldgAsmt,Bedrooms,Baths,AboveGradeSqFt,Age,Structure_Type,InteriorSqFt,NumberofStories,DaysOnMarket_Buckets
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DENC518086,5.0,330000.0,2151,449,293,12600,52600,4.0,2.0,2099.0,1958.0,1,2099.0,27,1
DENC518982,47.0,215500.0,1824,388,825,9900,45400,3.0,2.0,1575.0,1976.0,4,1927.0,27,2
DENC512992,15.0,200000.0,1563,610,592,9400,38000,2.0,2.0,1150.0,1985.0,4,1150.0,27,1
DENC512104,11.0,219900.0,2174,185,606,8600,57300,2.0,3.0,1425.0,1985.0,4,1425.0,27,1
DENC503480,3.0,190000.0,1349,535,813,8600,32300,3.0,2.0,925.0,1957.0,1,925.0,26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,4311,84,690,31300,99400,3.0,2.0,3045.0,1958.0,1,3045.0,11,1
DENC520392,25.0,639000.0,5208,0,751,25200,132700,4.0,4.0,3175.0,1973.0,1,3811.0,12,1
DENC2014038,7.0,529900.0,4407,476,488,30000,103600,4.0,3.0,2850.0,1989.0,1,3191.0,12,1
DENC2005484,25.0,620000.0,4825,407,358,28500,117800,4.0,3.0,3425.0,1992.0,1,3425.0,12,1


In [66]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape

(4041, 27)

In [67]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [68]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2149821953705946