In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pathlib as Path
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Note - You must set up your own config file
from config import db_password

ModuleNotFoundError: No module named 'config'

In [None]:
# Name of the SQL database you are accessing
database = "RealLeads"

# Open a local PostgreSQL connection. The password comes from the
# `db_password` value imported from config.py, so it is not stored in the notebook.
# `conn` is reused by every query below and is never closed in this notebook.
conn = psycopg2.connect(
    host="localhost",
    database=database,
    user="postgres",
    password=db_password)

In [None]:
# Table selection Function
def table_select(table_name, connection=None):
    """Load every row of one database table into a pandas DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is emitted as a double-quoted SQL
        identifier.
    connection : optional
        Connection object handed to ``pd.read_sql``. When omitted, the
        notebook-level ``conn`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Result of ``select * from "<table_name>"``.
    """
    # Double any embedded double quote so the name cannot close the quoted
    # identifier early and smuggle extra SQL into the statement.
    quoted_name = str(table_name).replace('"', '""')
    query = f'select * from "{quoted_name}"'
    return pd.read_sql(query, conn if connection is None else connection)

In [None]:
# Load the three cleaned tables, in this order, into their own DataFrames
prop_charac, pub_rec, sales_data = (
    table_select(name)
    for name in ("prop_charac_clean", "pub_rec_clean", "sales_data_clean")
)

In [None]:
# Preview the first rows of prop_charac
prop_charac.head()

In [None]:
# Column dtypes and non-null counts for prop_charac
prop_charac.info()

In [None]:
# Preview the first rows of pub_rec
pub_rec.head()

In [None]:
# Column dtypes and non-null counts for pub_rec
pub_rec.info()

In [None]:
# Preview the first rows of sales_data
sales_data.head()

In [None]:
# Frequency of each distinct FinalFinancing value
sales_data['FinalFinancing'].value_counts()

In [None]:
# Look at FinalFinancing value counts for binning
loan_counts = sales_data.FinalFinancing.value_counts()
# Which FinalFinancing values occur fewer than 20 times?
loan_counts[loan_counts<20]

In [None]:
# Column dtypes and non-null counts for sales_data (before the ListDate conversion)
sales_data.info()

In [None]:
# Inspect the raw ListDate values before converting them
sales_data['ListDate'].head()

In [None]:
# Parse ListDate into pandas datetimes so month/year can be extracted from it
sales_data['ListDate'] = pd.to_datetime(sales_data['ListDate'])

In [None]:
# Re-check dtypes to confirm the ListDate conversion
sales_data.info()

In [None]:
# Listing month (1-12) taken from ListDate
sales_data['Month'] = pd.to_datetime(sales_data['ListDate']).dt.month

In [None]:
# Listing year taken from ListDate
sales_data['Year'] = pd.to_datetime(sales_data['ListDate']).dt.year

In [None]:
# Keep only the prop_charac columns listed below (this selects columns; no rows are removed)
prop_charac_columns = [
    'MLSNumber',
    'Bedrooms',
    'Baths',
    'Basement_YN',
    'Garage_YN',
    'AboveGradeSqFt',
    'BelowGradeSqFt',
    'Condo/Coop_Assoc_YN',
    'Central_Air_YN',
    'HOA_YN',
    'Age',
    'Ownership',
    'Structure_Type',
    'InteriorSqFt',
    'NumberofStories',
]
prop_charac = prop_charac[prop_charac_columns]

prop_charac

In [None]:
# Keep only the pub_rec columns listed below (this selects columns; no rows are removed)
pub_rec_columns = [
    'MLSNumber',
    'Zip_Code',
    'SchoolDistrict',
    'AnnualTax',
    'LotAcres',
    'SubdivisionNeighborhood',
    'Municipality',
    'Lot',
    'OwnerOccupied',
    'TotalLandAsmt',
    'TotalBldgAsmt',
    'PropertyClass',
    'YearBuilt',
]
pub_rec = pub_rec[pub_rec_columns]

pub_rec

In [None]:
# Keep only the sales_data columns listed below (this selects columns; no rows are removed)
sales_data_columns = ['MLSNumber', 'Days_on_Market', 'Orig_List_Price', 'Month', 'Year']
sales_data = sales_data[sales_data_columns]

sales_data

In [None]:
# Inner-join sales_data with pub_rec on the shared MLSNumber key
merge_df = sales_data.merge(pub_rec, on='MLSNumber')
merge_df

In [None]:
# Inner-join the combined frame with prop_charac on the shared MLSNumber key
merge_df = merge_df.merge(prop_charac, on='MLSNumber')
merge_df

In [None]:
# Column dtypes and non-null counts for the merged dataframe
merge_df.info()

In [None]:
# Count the distinct values in each column of merge_df
merge_df.nunique()

In [None]:
# Count the missing values in each column of merge_df
merge_df.isna().sum()

In [None]:
# Remove every row that has at least one missing value
merge_df = merge_df.dropna(axis=0, how='any')
merge_df

In [None]:
# Largest Days_on_Market value in merge_df
merge_df['Days_on_Market'].max()

In [None]:
# Check binning: split Days_on_Market into 8 equal-width bins and count rows per bin
pd.cut(merge_df['Days_on_Market'], bins=8).value_counts()

In [None]:
# Keep only listings with fewer than 120 days on market (120 and above are removed).
# The frame is filtered and rebound rather than mutated with drop(..., inplace=True).
# Missing values were already removed from merge_df above, so `< 120` keeps
# exactly the rows that `>= 120` did not select.
merge_df = merge_df[merge_df['Days_on_Market'] < 120]

In [None]:
# Work on an independent copy so later edits to clean_df do not touch merge_df
clean_df = merge_df.copy()

In [None]:
# Bucket Days_on_Market into the two target classes.
# pd.cut intervals are right-closed and, by default, open on the left of the
# first edge, so bins [0, 60, 120] alone would leave a listing with 0 days on
# market unbucketed (NaN). include_lowest=True closes the first interval on the
# left, giving [0, 60] and (60, 120].
cut_labels = ['Less than 2 months', 'More than 2 months']
cut_bins = [0, 60, 120]
clean_df['DaysOnMarket_Buckets'] = pd.cut(
    clean_df['Days_on_Market'],
    bins=cut_bins,
    labels=cut_labels,
    include_lowest=True,
)
clean_df

In [None]:
# Change index to MLSNumber (the column is moved into the index, so it is no
# longer a regular column afterwards)
# NOTE(review): re-running this cell on the same kernel presumably fails because
# 'MLSNumber' is then no longer a column — confirm before re-executing.
clean_df = clean_df.set_index('MLSNumber')
clean_df

In [None]:
# Turn every string-typed column into an ordered categorical
string_columns = [
    name for name in clean_df.columns
    if pd.api.types.is_string_dtype(clean_df[name])
]
for name in string_columns:
    clean_df[name] = clean_df[name].astype("category").cat.as_ordered()

clean_df.info()

In [None]:
# Preview clean_df after the categorical conversion
clean_df.head()

In [None]:
# Replace every non-numeric column with integer category codes shifted up by 1.
# pandas uses code -1 for a missing value, so missing values become 0 here.
non_numeric_columns = [
    name for name in clean_df.columns
    if not pd.api.types.is_numeric_dtype(clean_df[name])
]
for name in non_numeric_columns:
    clean_df[name] = pd.Categorical(clean_df[name]).codes + 1

In [None]:
# Preview clean_df after the integer encoding
clean_df.head()

In [None]:
# Remove every row that still has at least one missing value
clean_df = clean_df.dropna(axis=0, how='any')
clean_df

In [None]:
# List the columns that remain in clean_df
clean_df.columns

In [None]:
import seaborn as sns

# Pairwise correlations between all clean_df columns, drawn as a diverging heatmap
corr = clean_df.corr()
heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=heatmap_palette, square=True)
# Rotate the x tick labels; the trailing semicolon hides the returned label list
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

### Balanced Random Forest Classifier

In [None]:
# Target is the bucket column; features are everything except the target and
# the raw Days_on_Market it was derived from
target_column = "DaysOnMarket_Buckets"
y = clean_df[target_column]
X = clean_df.drop(columns=["Days_on_Market", target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition, and every score computed from
# it, is the same on each run; without random_state a new split is drawn
# every time the cell executes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

In [None]:
# Fit a BalancedRandomForestClassifier on the training split
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(
    random_state=20,
    n_estimators=1000,
)
brf.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# List (importance, feature name) pairs from the fitted forest, highest importance first
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

In [None]:
# Build updated_df from clean_df without the columns listed here
columns_to_drop = [
    "PropertyClass",
    "Condo/Coop_Assoc_YN",
    "Ownership",
    "OwnerOccupied",
    "HOA_YN",
    "Basement_YN",
    "Central_Air_YN",
]
updated_df = clean_df.drop(columns=columns_to_drop)
updated_df

In [None]:
# Target is the bucket column; features are everything except the target and
# the raw Days_on_Market it was derived from
target_column = "DaysOnMarket_Buckets"
y = updated_df[target_column]
X = updated_df.drop(columns=["Days_on_Market", target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition, and every score computed from
# it, is the same on each run; without random_state a new split is drawn
# every time the cell executes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

In [None]:
# Fit a BalancedRandomForestClassifier on the training split
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(
    random_state=20,
    n_estimators=1000,
)
brf.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# List (importance, feature name) pairs from the fitted forest, highest importance first
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

In [None]:
# Column dtypes and non-null counts for updated_df
updated_df.info()

In [None]:
# Build updated_df2 from updated_df without the columns listed here
columns_to_drop = [
    "Garage_YN",
    "Structure_Type",
    "Lot",
    "Baths",
    "SchoolDistrict",
    "Bedrooms",
    "BelowGradeSqFt",
    "Municipality",
    "YearBuilt",
    "Zip_Code",
]
updated_df2 = updated_df.drop(columns=columns_to_drop)
updated_df2

In [None]:
# Target is the bucket column; features are everything except the target and
# the raw Days_on_Market it was derived from
target_column = "DaysOnMarket_Buckets"
y = updated_df2[target_column]
X = updated_df2.drop(columns=["Days_on_Market", target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition, and every score computed from
# it, is the same on each run; without random_state a new split is drawn
# every time the cell executes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

In [None]:
# Fit a BalancedRandomForestClassifier on the training split
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(
    random_state=20,
    n_estimators=1000,
)
brf.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# List (importance, feature name) pairs from the fitted forest, highest importance first
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

In [None]:
# Confusion matrix of the test-split labels against the current predictions
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

### EasyEnsembleClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the target and features from the full clean_df for this section.
y = clean_df["DaysOnMarket_Buckets"]
X = clean_df.drop(columns=["Days_on_Market", "DaysOnMarket_Buckets"])

# Re-split right away: without this, the fit and score that follow would
# silently reuse the X_train / X_test left over from the reduced-feature
# Balanced Random Forest section instead of the X and y defined here.
# random_state makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Fit an EasyEnsembleClassifier on the training split, using all available cores
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(
    random_state=100,
    n_jobs=-1,
)
eec.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rebuild updated_df from clean_df without the columns listed here
# (this replaces the updated_df created earlier in the notebook)
columns_to_drop = [
    "PropertyClass",
    "Condo/Coop_Assoc_YN",
    "Ownership",
    "Basement_YN",
]
updated_df = clean_df.drop(columns=columns_to_drop)
updated_df

In [None]:
# Target is the bucket column; features are everything except the target and
# the raw Days_on_Market it was derived from
target_column = "DaysOnMarket_Buckets"
y = updated_df[target_column]
X = updated_df.drop(columns=["Days_on_Market", target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition, and every score computed from
# it, is the same on each run; without random_state a new split is drawn
# every time the cell executes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

In [None]:
# Fit an EasyEnsembleClassifier on the training split, using all available cores
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(
    random_state=50,
    n_jobs=-1,
)
eec.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)