In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import mplcyberpunk as mplnk
import seaborn as sns
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from statsmodels.graphics.tsaplots import plot_acf
import joblib

In [130]:
df = pd.read_csv('../data/processed/median_price_all_homes.csv', index_col=0)

In [131]:
df.tail()

Unnamed: 0,RegionName,State,Metro,CountyName,timestamp,median_price
203875,Hillsboro Beach,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2019-12,3395.0
203876,Amagansett,NY,New York-Newark-Jersey City,Suffolk County,2019-12,35000.0
203877,Quogue,NY,New York-Newark-Jersey City,Suffolk County,2019-12,31666.0
203878,Ogden,KS,Manhattan,Riley County,2019-12,909.0
203879,Wainscott,NY,New York-Newark-Jersey City,Suffolk County,2019-12,28333.0


In [132]:
df.shape

(203880, 6)

In [133]:
df.describe()

Unnamed: 0,median_price
count,109832.0
mean,1693.666181
std,1430.900602
min,500.0
25%,1200.0
50%,1500.0
75%,1900.0
max,50000.0


In [134]:
df.isnull().sum()

RegionName          0
State               0
Metro             120
CountyName          0
timestamp           0
median_price    94048
dtype: int64

In [135]:
df.rename(columns={
    'RegionName': 'region',
    'State':'state', 
    'Metro': 'metro',
    'CountyName': 'county'
    }, inplace=True)

In [136]:
df.tail()

Unnamed: 0,region,state,metro,county,timestamp,median_price
203875,Hillsboro Beach,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2019-12,3395.0
203876,Amagansett,NY,New York-Newark-Jersey City,Suffolk County,2019-12,35000.0
203877,Quogue,NY,New York-Newark-Jersey City,Suffolk County,2019-12,31666.0
203878,Ogden,KS,Manhattan,Riley County,2019-12,909.0
203879,Wainscott,NY,New York-Newark-Jersey City,Suffolk County,2019-12,28333.0


In [137]:
df.dtypes

region           object
state            object
metro            object
county           object
timestamp        object
median_price    float64
dtype: object

In [138]:
df = df.astype({
    'region': str,
    'state': str,
    'metro': str,
    'county': str,
    'timestamp': 'datetime64[ns]'
})

In [139]:
df['timestamp'].head()

0   2010-01-01
1   2010-01-01
2   2010-01-01
3   2010-01-01
4   2010-01-01
Name: timestamp, dtype: datetime64[ns]

In [140]:
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month

In [141]:
df.tail()

Unnamed: 0,region,state,metro,county,timestamp,median_price,year,month
203875,Hillsboro Beach,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2019-12-01,3395.0,2019,12
203876,Amagansett,NY,New York-Newark-Jersey City,Suffolk County,2019-12-01,35000.0,2019,12
203877,Quogue,NY,New York-Newark-Jersey City,Suffolk County,2019-12-01,31666.0,2019,12
203878,Ogden,KS,Manhattan,Riley County,2019-12-01,909.0,2019,12
203879,Wainscott,NY,New York-Newark-Jersey City,Suffolk County,2019-12-01,28333.0,2019,12


In [142]:
df = df[['region', 'state', 'metro', 'county', 'year', 'month', 'timestamp', 'median_price']]

In [143]:
df.tail()

Unnamed: 0,region,state,metro,county,year,month,timestamp,median_price
203875,Hillsboro Beach,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2019,12,2019-12-01,3395.0
203876,Amagansett,NY,New York-Newark-Jersey City,Suffolk County,2019,12,2019-12-01,35000.0
203877,Quogue,NY,New York-Newark-Jersey City,Suffolk County,2019,12,2019-12-01,31666.0
203878,Ogden,KS,Manhattan,Riley County,2019,12,2019-12-01,909.0
203879,Wainscott,NY,New York-Newark-Jersey City,Suffolk County,2019,12,2019-12-01,28333.0


In [144]:
df = df.astype({
    'year': int,
    'month': int
})

In [145]:
df.nunique()

region          1549
state             50
metro            343
county           492
year              10
month             12
timestamp        120
median_price    2716
dtype: int64

In [151]:
class Exputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, include=True):
        super(Exputer, self).__init__()
        self.include = include
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        combined_data = np.column_stack((X, y))
        null_rows = np.isnan(combined_data).any(axis=1)
        

In [152]:
df.dropna(inplace=True)

In [153]:
df.shape

(109832, 8)