In [2]:
import pandas as pd
from pandas import DataFrame as DF
import numpy as np
from numpy.testing import assert_almost_equal
import re
import sys
import time
import threading
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the third-party export (a CSV file in the working directory).
pthurl = '3rdpartyExport.csv'
# Raw, unprocessed load of that export.
features1 = pd.read_csv(filepath_or_buffer=pthurl)

In [4]:
def clean(dat: pd.DataFrame) -> pd.DataFrame:
    """Return a normalised copy of ``dat``; the input frame is not modified.

    Two things are normalised:

    * Values: every value in a non-numeric column that has a ``.lower()``
      method is lower-cased; anything else (e.g. NaN) passes through unchanged.
    * Column labels: the three price-per-square-foot headers are mapped to
      fixed snake_case names, then every label has its spaces replaced by
      underscores and is lower-cased.

    Parameters
    ----------
    dat : pd.DataFrame
        Frame to normalise. Column labels must be strings, because the final
        rename calls ``str.replace`` / ``str.lower`` on each label.

    Returns
    -------
    pd.DataFrame
        New frame with the same rows and the same column order.
    """
    # Headers that need an explicit name because a plain space -> underscore
    # substitution would leave '$' and '/' characters in them.
    price_headers = {
        ' $/ sq ft ': 'price_per_sq_ft',
        '$ / sq ft for keywords': 'price_per_sq_ft_by_keyword',
        ' $/ sq ft .1': 'price_per_sq_ft_.1',
    }

    def lower_or_passthrough(value):
        # Non-string cells (NaN, numbers stored in object columns, ...) have
        # no .lower(); hand them back as they are.
        try:
            lowered = value.lower()
        except AttributeError:
            return value
        return lowered

    def to_snake(label):
        return label.replace(' ', '_').lower()

    # Lower-case the values of every non-numeric column.
    text_columns = dat.select_dtypes(exclude=[np.number]).columns
    replacements = {}
    for column in text_columns:
        replacements[column] = dat[column].apply(lower_or_passthrough)

    normalised = dat.assign(**replacements)
    normalised = normalised.rename(columns=price_headers)
    normalised = normalised.rename(columns=to_snake)
    return normalised

In [5]:
# Read the export from disk and normalise it with clean() in one step
# (lower-cased text values, snake_case column labels).
features1 = pd.read_csv(pthurl).pipe(clean)

# Show the normalised column labels.
features1.columns

Index(['price_per_sq_ft', 'sale_price', 'list_price',
       '>90_day_before_sale_zestimate', 'price_per_sq_ft_by_keyword',
       'close_date', 'street_number', 'street_name', 'street_suffix', 'city',
       'county', 'zip5', 'unnamed:_12', 'beds_total', 'baths.lavs',
       'est_fin_abv_grd_sqft', 'acreage', 'architecture_level',
       'architecture_style', 'price_per_sq_ft_.1', 'basement_type',
       'unnamed:_21', 'sale_price.1', 'exterior', 'cooling',
       'exterior_features', 'garage_dimensions', 'garage_features',
       'garage_size', 'heating', 'original_list_price', 'photo_count',
       'pool_yn', 'public_remarks', 'avg', 'count', 'without',
       'sqft-est_tot_fin', 'sqft-est_fin_lower_floor', 'sqft-est_fin_abv_grd',
       'sqft-est_tot_bsmt', 'year_built', 'year_remodeled', 'porch_type'],
      dtype='object')

In [6]:
# Column selection: narrow the cleaned frame to the 22 columns listed below.
# A missing label raises KeyError.
features1 = features1.loc[:, [
    'price_per_sq_ft',
    'sale_price',
    'list_price',
    'close_date',
    'zip5',
    'beds_total',
    'baths.lavs',
    'est_fin_abv_grd_sqft',
    'acreage',
    'architecture_level',
    'architecture_style',
    'basement_type',
    'exterior',
    'cooling',
    'exterior_features',
    'garage_features',
    'heating',
    'sqft-est_tot_fin',
    'sqft-est_fin_lower_floor',
    'sqft-est_fin_abv_grd',
    'sqft-est_tot_bsmt',
    'year_built',
]]

features1.shape

(955, 22)

In [7]:
# Frame dimensions first, then the number of missing values in each column.
print(features1.shape)
features1.isna().sum()

(955, 22)


price_per_sq_ft               0
sale_price                    0
list_price                    0
close_date                    0
zip5                          0
beds_total                    0
baths.lavs                    0
est_fin_abv_grd_sqft          0
acreage                       0
architecture_level            2
architecture_style            5
basement_type                94
exterior                      2
cooling                      39
exterior_features           205
garage_features             151
heating                       2
sqft-est_tot_fin              0
sqft-est_fin_lower_floor    337
sqft-est_fin_abv_grd          0
sqft-est_tot_bsmt            64
year_built                    0
dtype: int64

In [8]:
# Work on a copy so features1 stays intact, and keep only the rows that have
# a value in every column.
No_NaNs = features1.copy().dropna()

# Confirm that no missing values remain in any column.
No_NaNs.isna().sum()

price_per_sq_ft             0
sale_price                  0
list_price                  0
close_date                  0
zip5                        0
beds_total                  0
baths.lavs                  0
est_fin_abv_grd_sqft        0
acreage                     0
architecture_level          0
architecture_style          0
basement_type               0
exterior                    0
cooling                     0
exterior_features           0
garage_features             0
heating                     0
sqft-est_tot_fin            0
sqft-est_fin_lower_floor    0
sqft-est_fin_abv_grd        0
sqft-est_tot_bsmt           0
year_built                  0
dtype: int64

In [9]:
# Separate the target column (sale_price) from the predictor columns.
yn = No_NaNs['sale_price']
XN = No_NaNs.drop(columns='sale_price')

# Ordinal-encode the predictors with category_encoders' default settings.
# NOTE(review): the encoder is fitted on every row before the train/test
# split in the next cell — confirm that is intended.
encoder = ce.OrdinalEncoder()
XN = encoder.fit_transform(XN)
XN.shape

(391, 21)

In [10]:
# Split the encoded predictors and the target into train and test parts.
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# split reproducible across runs.
XN_train, XN_test, yn_train, yn_test = train_test_split(
    XN, yn, test_size=0.2, random_state=42)

# Confirm that predictors and target ended up with matching row counts.
print("XN_train shape is: ", XN_train.shape)
print("XN_test shape is: ", XN_test.shape)
print("yn_train shape is: ", yn_train.shape)
print("yn_test shape is: ", yn_test.shape)

XN_train shape is:  (312, 21)
XN_test shape is:  (79, 21)
yn_train shape is:  (312,)
yn_trest shape is:  (79,)


In [11]:
# Baseline: a LogisticRegression with default settings, fitted on the
# training split.
logreg = LogisticRegression()
logreg.fit(X=XN_train, y=yn_train)

# NOTE(review): predictions and accuracy are computed on the same rows the
# model was fitted on; XN_test / yn_test are not used in this cell — confirm
# that a training-set score is what is wanted for the baseline.
yn_pred = logreg.predict(X=XN_train)
accuracy_score(y_true=yn_train, y_pred=yn_pred)

0.6955128205128205