In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pathlib as Path
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Note - You must set up your own config file
from config import db_password

In [4]:
# Name the SQL database your are accessing
database = "RealLeads"

# Make your local connection to the database in PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database=database,
    user="postgres",
    password=db_password)

In [5]:
# Table selection Function
def table_select(table_name):
    table =f"select * from \"{table_name}\""
    dataFrame = pd.read_sql(table, conn)
    return dataFrame

In [6]:
# Names of the tables to be imported
prop_charac = table_select("prop_charac_clean")
pub_rec = table_select("pub_rec_clean")
sales_data = table_select("sales_data_clean")

In [7]:
prop_charac.head()

Unnamed: 0,MLSNumber,Address,BuildingName,Ownership,Senior_Community_YN,Condo/Coop_Assoc_YN,HOA_YN,AssociationFee,AssociationFeeFrequency,Structure_Type,...,Garage_YN,GarageSpaces,GarageFeatures,Parking,ExteriorFeatures,ExteriorMaterial,Main_Roof,Foundation,PorchDeck,SwimmingPoolType
0,DENC518086,2615 Pecksniff Rd,NONE AVAILABLE,FeeSimple,No,No,Yes,$15,Annually,Detached,...,Yes,1.0,,,"ExtensiveHardscape,Sidewalks,StoneRetainingWal...","BrickFront,VinylSiding",ArchitecturalShingle,,"Patios,Porches",
1,DENC518982,4938 S Tupelo Turn,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,"AluminumSiding,Brick,VinylSiding",Asphalt,,,
2,DENC512992,15 Kristina Ct,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,VinylSiding,,,,
3,DENC512104,3251 Champions Dr,,FeeSimple,No,No,Yes,$50,Annually,Interior Row/Townhouse,...,Yes,1.0,,,,Other,Shingle,,Patios,
4,DENC503480,3706 Lafayette St,,FeeSimple,No,No,No,,,Detached,...,No,,,,,Asbestos,,,,


In [8]:
prop_charac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 59 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   MLSNumber                  5576 non-null   object 
 1   Address                    5576 non-null   object 
 2   BuildingName               1139 non-null   object 
 3   Ownership                  5576 non-null   object 
 4   Senior_Community_YN        5576 non-null   object 
 5   Condo/Coop_Assoc_YN        5576 non-null   object 
 6   HOA_YN                     5576 non-null   object 
 7   AssociationFee             1582 non-null   object 
 8   AssociationFeeFrequency    1582 non-null   object 
 9   Structure_Type             5576 non-null   object 
 10  Acres                      5354 non-null   float64
 11  LotDimensions              4847 non-null   object 
 12  LotDescription             1455 non-null   object 
 13  FeeIncludes                661 non-null    objec

In [9]:
pub_rec.head()

Unnamed: 0,MLSNumber,Tax_ID,Address-truncated,PropertyCityState,Zip_Code,Zip4,CarrierRoute,PropDoNotMail,OwnerNames,OwnerLastName,...,BldgSqFtTotal,Stories,Bedrooms,Exterior,BsmtDesc,FireplaceTotal,GrgType,HeatDelivery,YearBuilt,YearRemod
0,DENC518086,08-038.30-119,2615 Pecksniff,"Wilmington, DE",19808,3026,C010,N,James Robinson,Robinson,...,1875.0,1.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,Att/BuiltIn/Bsmt,Hot Water/Steam,1958,0
1,DENC518982,08-036.10-081,4938 S Tupelo,"Wilmington, DE",19808,1026,C009,N,Xiaopeng Deng,Deng,...,1575.0,2.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,,Hot/Warm Air,1976,0
2,DENC512992,08-044.30-363,15 Kristina,"Wilmington, DE",19808,4063,C084,N,Robert F Walls,Walls,...,,2.0,2.0,"Aluminum, Vinyl",,0.0,,Heat Pump,1985,0
3,DENC512104,08-036.40-376,3251 Champions,"Wilmington, DE",19808,2601,C039,N,Michael J Downs,Downs,...,,2.0,2.0,Other,,1.0,Att/BuiltIn/Bsmt,Heat Pump,1985,0
4,DENC503480,07-041.10-071,3706 Lafayette,"Wilmington, DE",19808,6014,C001,N,Maria Corona,Corona,...,,1.0,3.0,Asbestos,,0.0,,Hot/Warm Air,1957,0


In [10]:
pub_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 58 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Tax_ID                   5576 non-null   object 
 2   Address-truncated        5576 non-null   object 
 3   PropertyCityState        5576 non-null   object 
 4   Zip_Code                 5576 non-null   object 
 5   Zip4                     5576 non-null   object 
 6   CarrierRoute             5572 non-null   object 
 7   PropDoNotMail            5573 non-null   object 
 8   OwnerNames               5246 non-null   object 
 9   OwnerLastName            5206 non-null   object 
 10  OwnerFirstName           4917 non-null   object 
 11  Owner2LastName           2359 non-null   object 
 12  Owner2FirstName          2359 non-null   object 
 13  Owner3LastName           89 non-null     object 
 14  Owner3FirstName         

In [11]:
sales_data.head()

Unnamed: 0,MLSNumber,Address,Status,Sold_Price,Sold_Price_less_Concession,Orig_List_Price,Current_List_Price,Days_on_Market,Previous_Days_on_Market,ListDate,StatusDate,Agreement_of_Sale_Date,SettledDate,Concessions_YN,Concessions_Remarks,SellerConcessionsAmount,FinalFinancing
0,DENC518086,2615 Pecksniff Rd,Closed,335000.0,335000.0,330000.0,330000.0,5.0,5.0,2020-12-11,2021-02-11,2020-12-16,2021-02-11,No,,0.0,FHA
1,DENC518982,4938 S Tupelo Turn,Closed,200000.0,200000.0,215500.0,210000.0,47.0,67.0,2021-01-07,2021-04-08,2021-02-21,2021-04-08,No,,0.0,Conventional
2,DENC512992,15 Kristina Ct,Closed,200000.0,200000.0,200000.0,200000.0,15.0,15.0,2020-12-28,2021-02-26,2021-01-09,2021-02-26,No,,0.0,Conventional
3,DENC512104,3251 Champions Dr,Closed,200000.0,200000.0,219900.0,214900.0,11.0,11.0,2020-10-27,2020-12-14,2020-11-05,2020-12-11,No,,0.0,Conventional
4,DENC503480,3706 Lafayette St,Closed,200000.0,200000.0,190000.0,190000.0,3.0,3.0,2020-06-20,2020-08-02,2020-06-21,2020-07-31,No,,0.0,Conventional


In [12]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   MLSNumber                   5576 non-null   object 
 1   Address                     5576 non-null   object 
 2   Status                      5576 non-null   object 
 3   Sold_Price                  5576 non-null   float64
 4   Sold_Price_less_Concession  5576 non-null   float64
 5   Orig_List_Price             5576 non-null   float64
 6   Current_List_Price          5576 non-null   float64
 7   Days_on_Market              5576 non-null   float64
 8   Previous_Days_on_Market     5576 non-null   float64
 9   ListDate                    5576 non-null   object 
 10  StatusDate                  5576 non-null   object 
 11  Agreement_of_Sale_Date      5576 non-null   object 
 12  SettledDate                 5576 non-null   object 
 13  Concessions_YN              5576 

In [13]:
# Drop Rows from prop_charac
prop_charac = prop_charac[['MLSNumber',
                            'Bedrooms',
                            'Baths',
                            'Basement_YN',
                            'Garage_YN',
                            'AboveGradeSqFt',
                            'BelowGradeSqFt',
                            'Condo/Coop_Assoc_YN',
                            'Central_Air_YN']]

prop_charac

Unnamed: 0,MLSNumber,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN
0,DENC518086,4.0,2.0,Yes,Yes,2099.0,0.0,No,Yes
1,DENC518982,3.0,2.0,Yes,No,1575.0,352.0,No,Yes
2,DENC512992,2.0,2.0,Yes,No,1150.0,0.0,No,Yes
3,DENC512104,2.0,3.0,Yes,Yes,1425.0,0.0,No,Yes
4,DENC503480,3.0,2.0,Yes,No,925.0,0.0,No,Yes
...,...,...,...,...,...,...,...,...,...
5571,DENC520114,3.0,2.0,Yes,Yes,3045.0,0.0,No,Yes
5572,DENC520392,4.0,4.0,Yes,Yes,3175.0,636.0,No,Yes
5573,DENC2014038,4.0,3.0,Yes,Yes,2850.0,341.0,No,Yes
5574,DENC2005484,4.0,3.0,Yes,Yes,3425.0,0.0,No,Yes


In [14]:
# Drop Rows from pub_rec
pub_rec = pub_rec[['MLSNumber',
                    'Zip_Code',
                    'SchoolDistrict',
                    'AnnualTax',
                    'LotAcres',
                    'SubdivisionNeighborhood']]

pub_rec

Unnamed: 0,MLSNumber,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood
0,DENC518086,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii
1,DENC518982,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge
2,DENC512992,19808,Red Clay Consolidated,1563,0.05,Woodmill
3,DENC512104,19808,Red Clay Consolidated,2174,0.05,Fairway Falls
4,DENC503480,19808,Red Clay Consolidated,1349,0.17,Washington Hgts
...,...,...,...,...,...,...
5571,DENC520114,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills
5572,DENC520392,19707,Red Clay Consolidated,5208,0.75,
5573,DENC2014038,19707,Red Clay Consolidated,4407,0.39,Stenning Woods
5574,DENC2005484,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas


In [15]:
# Drop Rows from sales_data

sales_data = sales_data[['MLSNumber',
                        'Days_on_Market',
                        'Orig_List_Price']]

sales_data

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price
0,DENC518086,5.0,330000.0
1,DENC518982,47.0,215500.0
2,DENC512992,15.0,200000.0
3,DENC512104,11.0,219900.0
4,DENC503480,3.0,190000.0
...,...,...,...
5571,DENC520114,5.0,595000.0
5572,DENC520392,25.0,639000.0
5573,DENC2014038,7.0,529900.0
5574,DENC2005484,25.0,620000.0


In [16]:
# Merge sales_data and pub_rec dataframes
merge_df = pd.merge(sales_data, pub_rec, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts
...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas


In [17]:
# Merge merge_df and prop_charac dataframes
merge_df = pd.merge(merge_df, prop_charac, on='MLSNumber')
merge_df

Unnamed: 0,MLSNumber,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN
0,DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,4.0,2.0,Yes,Yes,2099.0,0.0,No,Yes
1,DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,3.0,2.0,Yes,No,1575.0,352.0,No,Yes
2,DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,2.0,2.0,Yes,No,1150.0,0.0,No,Yes
3,DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,2.0,3.0,Yes,Yes,1425.0,0.0,No,Yes
4,DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,3.0,2.0,Yes,No,925.0,0.0,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,3.0,2.0,Yes,Yes,3045.0,0.0,No,Yes
5572,DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,4.0,4.0,Yes,Yes,3175.0,636.0,No,Yes
5573,DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,4.0,3.0,Yes,Yes,2850.0,341.0,No,Yes
5574,DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,4.0,3.0,Yes,Yes,3425.0,0.0,No,Yes


In [18]:
# Info for all columns in new dataframe
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5576 entries, 0 to 5575
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   MLSNumber                5576 non-null   object 
 1   Days_on_Market           5576 non-null   float64
 2   Orig_List_Price          5576 non-null   float64
 3   Zip_Code                 5576 non-null   object 
 4   SchoolDistrict           5576 non-null   object 
 5   AnnualTax                5576 non-null   int64  
 6   LotAcres                 5576 non-null   float64
 7   SubdivisionNeighborhood  5288 non-null   object 
 8   Bedrooms                 5571 non-null   float64
 9   Baths                    5571 non-null   float64
 10  Basement_YN              5576 non-null   object 
 11  Garage_YN                5576 non-null   object 
 12  AboveGradeSqFt           5576 non-null   float64
 13  BelowGradeSqFt           5394 non-null   float64
 14  Condo/Coop_Assoc_YN     

In [19]:
# Change index to MLSNumber
merge_df = merge_df.set_index('MLSNumber')
merge_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DENC518086,5.0,330000.0,19808,Red Clay Consolidated,2151,0.26,Sherwood Park Ii,4.0,2.0,Yes,Yes,2099.0,0.0,No,Yes
DENC518982,47.0,215500.0,19808,Red Clay Consolidated,1824,0.06,Pepper Ridge,3.0,2.0,Yes,No,1575.0,352.0,No,Yes
DENC512992,15.0,200000.0,19808,Red Clay Consolidated,1563,0.05,Woodmill,2.0,2.0,Yes,No,1150.0,0.0,No,Yes
DENC512104,11.0,219900.0,19808,Red Clay Consolidated,2174,0.05,Fairway Falls,2.0,3.0,Yes,Yes,1425.0,0.0,No,Yes
DENC503480,3.0,190000.0,19808,Red Clay Consolidated,1349,0.17,Washington Hgts,3.0,2.0,Yes,No,925.0,0.0,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,19707,Red Clay Consolidated,4311,1.03,Canterbury Hills,3.0,2.0,Yes,Yes,3045.0,0.0,No,Yes
DENC520392,25.0,639000.0,19707,Red Clay Consolidated,5208,0.75,,4.0,4.0,Yes,Yes,3175.0,636.0,No,Yes
DENC2014038,7.0,529900.0,19707,Red Clay Consolidated,4407,0.39,Stenning Woods,4.0,3.0,Yes,Yes,2850.0,341.0,No,Yes
DENC2005484,25.0,620000.0,19707,Red Clay Consolidated,4825,0.54,Quaker Lea Villas,4.0,3.0,Yes,Yes,3425.0,0.0,No,Yes


In [20]:
# Change object columns to category
for label, content in merge_df.items():
    if pd.api.types.is_string_dtype(content):
        merge_df[label] = content.astype("category").cat.as_ordered()
        
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5576 entries, DENC518086 to DENC526982
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Days_on_Market           5576 non-null   float64 
 1   Orig_List_Price          5576 non-null   float64 
 2   Zip_Code                 5576 non-null   category
 3   SchoolDistrict           5576 non-null   category
 4   AnnualTax                5576 non-null   int64   
 5   LotAcres                 5576 non-null   float64 
 6   SubdivisionNeighborhood  5288 non-null   category
 7   Bedrooms                 5571 non-null   float64 
 8   Baths                    5571 non-null   float64 
 9   Basement_YN              5576 non-null   category
 10  Garage_YN                5576 non-null   category
 11  AboveGradeSqFt           5576 non-null   float64 
 12  BelowGradeSqFt           5394 non-null   float64 
 13  Condo/Coop_Assoc_YN      5576 non-null   category
 14

In [21]:
# Change categories to numbers
for label,content in merge_df.items():
    if not pd.api.types.is_numeric_dtype(content):
        merge_df[label] = pd.Categorical(content).codes+1

In [23]:
# Determine the number of unique values in each column.
merge_df.nunique()

Days_on_Market              200
Orig_List_Price             618
Zip_Code                     15
SchoolDistrict                4
AnnualTax                  2540
LotAcres                    173
SubdivisionNeighborhood     620
Bedrooms                      9
Baths                         7
Basement_YN                   2
Garage_YN                     2
AboveGradeSqFt              478
BelowGradeSqFt              611
Condo/Coop_Assoc_YN           2
Central_Air_YN                2
dtype: int64

In [24]:
# Find null rows
merge_df.isna().sum()

Days_on_Market               0
Orig_List_Price              0
Zip_Code                     0
SchoolDistrict               0
AnnualTax                    0
LotAcres                     0
SubdivisionNeighborhood      0
Bedrooms                     5
Baths                        5
Basement_YN                  0
Garage_YN                    0
AboveGradeSqFt               0
BelowGradeSqFt             182
Condo/Coop_Assoc_YN          0
Central_Air_YN               0
dtype: int64

In [25]:
# Drop rows with null values
clean_df = merge_df.dropna()
clean_df

Unnamed: 0_level_0,Days_on_Market,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN
MLSNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DENC518086,5.0,330000.0,15,4,2151,0.26,449,4.0,2.0,2,2,2099.0,0.0,1,2
DENC518982,47.0,215500.0,15,4,1824,0.06,388,3.0,2.0,2,1,1575.0,352.0,1,2
DENC512992,15.0,200000.0,15,4,1563,0.05,610,2.0,2.0,2,1,1150.0,0.0,1,2
DENC512104,11.0,219900.0,15,4,2174,0.05,185,2.0,3.0,2,2,1425.0,0.0,1,2
DENC503480,3.0,190000.0,15,4,1349,0.17,535,3.0,2.0,2,1,925.0,0.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DENC520114,5.0,595000.0,1,4,4311,1.03,84,3.0,2.0,2,2,3045.0,0.0,1,2
DENC520392,25.0,639000.0,1,4,5208,0.75,0,4.0,4.0,2,2,3175.0,636.0,1,2
DENC2014038,7.0,529900.0,1,4,4407,0.39,476,4.0,3.0,2,2,2850.0,341.0,1,2
DENC2005484,25.0,620000.0,1,4,4825,0.54,407,4.0,3.0,2,2,3425.0,0.0,1,2


In [27]:
# Create our features
X = clean_df.drop(columns='Days_on_Market')

# Create our target
y = clean_df.loc[:, ['Days_on_Market']].copy()

In [28]:
X.describe()

Unnamed: 0,Orig_List_Price,Zip_Code,SchoolDistrict,AnnualTax,LotAcres,SubdivisionNeighborhood,Bedrooms,Baths,Basement_YN,Garage_YN,AboveGradeSqFt,BelowGradeSqFt,Condo/Coop_Assoc_YN,Central_Air_YN
count,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0,5389.0
mean,322182.4,8.438857,3.073669,2774.678419,0.228662,318.569122,3.31954,2.452774,1.843199,1.596771,1800.611431,203.155502,1.050288,1.908517
std,112379.6,4.937116,1.102551,1347.610071,0.305603,197.903877,0.767248,0.815044,0.363647,0.490591,727.910503,354.445399,0.218558,0.288321
min,139900.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
25%,240000.0,4.0,2.0,1834.0,0.08,144.0,3.0,2.0,2.0,1.0,1375.0,0.0,1.0,2.0
50%,295000.0,10.0,4.0,2421.0,0.17,310.0,3.0,2.0,2.0,2.0,1725.0,0.0,1.0,2.0
75%,375000.0,12.0,4.0,3400.0,0.26,526.0,4.0,3.0,2.0,2.0,2175.0,372.0,1.0,2.0
max,3899000.0,15.0,4.0,12291.0,9.2,619.0,8.0,6.0,2.0,2.0,6425.0,2585.0,2.0,2.0


In [29]:
# Check the balance of our target values
y['Days_on_Market'].value_counts()

5.0      681
4.0      595
6.0      550
3.0      426
7.0      356
        ... 
262.0      1
244.0      1
230.0      1
259.0      1
139.0      1
Name: Days_on_Market, Length: 193, dtype: int64

In [30]:
### Balanced Random Forest Classifier

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

  after removing the cwd from sys.path.


BalancedRandomForestClassifier(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)



5.211590577444236e-05

In [34]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.00      0.00      1.00      0.00      0.00      0.00         6
        1.0       0.00      0.00      1.00      0.00      0.00      0.00        71
        2.0       0.00      0.00      1.00      0.00      0.00      0.00        61
        3.0       0.00      0.00      0.99      0.00      0.00      0.00       110
        4.0       0.00      0.00      1.00      0.00      0.00      0.00       164
        5.0       0.12      0.01      0.99      0.01      0.08      0.01       156
        6.0       0.00      0.00      1.00      0.00      0.00      0.00       134
        7.0       0.00      0.00      1.00      0.00      0.00      0.00        79
        8.0       0.00      0.00      1.00      0.00      0.00      0.00        66
        9.0       0.00      0.00      1.00      0.00      0.00      0.00        45
       10.0       0.00      0.00      1.00      0.00      0.00      0.00        42
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.00      0.00      1.00      0.00      0.00      0.00         6
        1.0       0.00      0.00      1.00      0.00      0.00      0.00        71
        2.0       0.00      0.00      1.00      0.00      0.00      0.00        61
        3.0       0.00      0.00      0.99      0.00      0.00      0.00       110
        4.0       0.00      0.00      1.00      0.00      0.00      0.00       164
        5.0       0.12      0.01      0.99      0.01      0.08      0.01       156
        6.0       0.00      0.00      1.00      0.00      0.00      0.00       134
        7.0       0.00      0.00      1.00      0.00      0.00      0.00        79
        8.0       0.00      0.00      1.00      0.00      0.00      0.00        66
        9.0       0.00      0.00      1.00      0.00      0.00      0.00        45
       10.0       0.00      0.00      1.00      0.00      0.00      0.00        42
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# List the features sorted in descending order by feature importance
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.1360869266356385, 'AnnualTax'),
 (0.13245696562003617, 'Orig_List_Price'),
 (0.12169844686993335, 'SubdivisionNeighborhood'),
 (0.11617931932846068, 'AboveGradeSqFt'),
 (0.10488823851889802, 'LotAcres'),
 (0.08053210856420664, 'Zip_Code'),
 (0.06259482915661839, 'Bedrooms'),
 (0.05948181649081227, 'Baths'),
 (0.05624097513924892, 'BelowGradeSqFt'),
 (0.04328115839029497, 'SchoolDistrict'),
 (0.033877072883979543, 'Garage_YN'),
 (0.019261272491541227, 'Basement_YN'),
 (0.01877914253311439, 'Central_Air_YN'),
 (0.014641727377216892, 'Condo/Coop_Assoc_YN')]

In [38]:
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df

Unnamed: 0,feature,importance
0,Orig_List_Price,0.132457
1,Zip_Code,0.080532
2,SchoolDistrict,0.043281
3,AnnualTax,0.136087
4,LotAcres,0.104888
5,SubdivisionNeighborhood,0.121698
6,Bedrooms,0.062595
7,Baths,0.059482
8,Basement_YN,0.019261
9,Garage_YN,0.033877


In [39]:
### Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [41]:
model.fit(X,y)

LinearRegression()

In [42]:
y_pred = model.predict(X)
print(y_pred.shape)

(5389, 1)


In [43]:
y_pred

array([[11.56945652],
       [10.49326017],
       [11.56447794],
       ...,
       [24.23315983],
       [28.74391442],
       [20.10156775]])

In [44]:
model.score(X,y)

0.07679234986426664