In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

%matplotlib inline

In [2]:
# Load the raw Zillow home-value table from the project's data folder.
raw_df = pd.read_csv('data/zillow_data.csv')

Keep only the Texas rows.
<br>Drop Metro because it has many nulls and is redundant with the other location columns.
<br>Drop RegionID and SizeRank because their values are specific to Zillow.

In [3]:
# Keep only the Texas rows. The explicit copy makes the result an independent
# frame, so the column clean-up below does not raise SettingWithCopyWarning the
# way an in-place drop on a filtered slice does.
texas_df = raw_df[raw_df['State'] == 'TX'].copy()

# Columns are dropped by position: 0, 3, 4 and 6 through 99.
# NOTE(review): these are presumably RegionID, State, Metro, SizeRank and the
# monthly columns before 2004-01 -- confirm against the CSV header.
dropped_columns = texas_df.columns[np.r_[0, 3, 4, 6:100]]

raw_df = (
    texas_df
    .drop(columns=dropped_columns)
    .rename(columns={'RegionName': 'zipcode'})
)


In [4]:
# Share of zip codes, ranked by coefficient of variation, that is considered
# acceptable for the firm's risk profile (0.6 keeps the 60% least volatile).
CV_QUANTILE = .6

# Monthly value columns only; the label slice is inclusive on both ends and
# excludes the summary columns that are appended after '2018-04'.
monthly_values = raw_df.loc[:, '2004-01':'2018-04']

# Historical return on investment: growth from the first to the last month.
# Zip codes with no value in '2004-01' get a NaN ROI.
raw_df['ROI'] = (raw_df['2018-04'] / raw_df['2004-01']) - 1

# Standard deviation and mean of the monthly values (missing months skipped).
raw_df['std'] = monthly_values.std(skipna=True, axis=1)
raw_df['mean'] = monthly_values.mean(skipna=True, axis=1)

# Coefficient of variation: volatility relative to the price level.
raw_df['CV'] = raw_df['std'] / raw_df['mean']

# Show the calculated values. display() is required here because a bare
# expression in the middle of a cell is evaluated and then discarded.
display(raw_df[['zipcode', 'std', 'mean', 'ROI', 'CV']].head())

# Descriptive statistics of the coefficients of variation.
print(raw_df.CV.describe())

# Upper limit of CV according to the risk profile.
upper_cv = raw_df.CV.quantile(CV_QUANTILE)
print(f'\nCV upper limit: {upper_cv}')

# The 5 zip codes with the highest ROI within the firm's risk profile.
# sort_values places NaN ROIs last, so they are only picked if fewer than
# five zip codes have a computable ROI.
within_risk = raw_df[raw_df['CV'] < upper_cv]
zc_best5 = within_risk.sort_values('ROI', axis=0, ascending=False)[:5]
print('\n Best 5 Zipcodes:')
zc_best5[['zipcode', 'ROI', 'CV']]

count    989.000000
mean       0.121449
std        0.036183
min        0.019827
25%        0.098176
50%        0.120857
75%        0.142168
max        0.293064
Name: CV, dtype: float64

CV upper limit: 0.12848011113068844

 Best 5 Zipcodes:


Unnamed: 0,zipcode,ROI,CV
9851,76050,0.94347,0.117471
7776,78204,0.848754,0.119656
11091,77514,0.845737,0.126108
2901,77550,0.843943,0.120857
4807,79602,0.830396,0.124068


In [5]:
# Inspect the wide frame after the ROI, std, mean and CV columns were added.
raw_df

Unnamed: 0,zipcode,City,CountyName,2004-01,2004-02,2004-03,2004-04,2004-05,2004-06,2004-07,...,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04,ROI,std,mean,CV
1,75070,McKinney,Collin,187700.0,188800.0,190300.0,191800.0,193000.0,193900.0,194500.0,...,315000,316600,318100,319600,321100,321800,0.714438,38257.775512,227040.116279,0.168507
2,77494,Katy,Harris,231800.0,233100.0,233500.0,233000.0,232100.0,231300.0,230700.0,...,320800,321200,321200,323000,326900,329900,0.423210,33208.367605,266633.139535,0.124547
4,79936,El Paso,El Paso,84000.0,84700.0,85500.0,86400.0,87200.0,88000.0,88900.0,...,120300,120300,120300,120500,121000,121500,0.446429,10248.462666,113729.651163,0.090112
5,77084,Houston,Harris,119400.0,120400.0,121200.0,121900.0,122400.0,122700.0,123000.0,...,162800,162800,162800,162900,163500,164300,0.376047,14170.210695,130143.604651,0.108881
8,77449,Katy,Harris,123900.0,125300.0,126600.0,127500.0,128100.0,128500.0,128800.0,...,170900,172300,173300,174200,175400,176200,0.422115,16508.811448,134945.348837,0.122337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14372,76941,Mertzon,Irion,,,,,,,,...,123500,124700,124300,122600,121600,121600,,13734.029878,99318.367347,0.138283
14472,79313,Anton,Hockley,,,,,,,,...,58900,61500,63000,63600,63500,63300,,5286.674724,54424.675325,0.097137
14492,79355,Plains,Yoakum,,,,,,,,...,99700,97700,95800,94600,94000,93500,,12929.465872,78967.346939,0.163732
14599,79366,Ransom Canyon,Lubbock,161900.0,162300.0,162700.0,163100.0,163400.0,163600.0,163900.0,...,251300,251500,251700,252500,255000,257500,0.590488,25909.956515,189098.837209,0.137018


In [6]:
def melt_data(raw_df):
    """Reshape the wide frame (one column per month) into long format.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Wide frame containing the identifier columns 'zipcode', 'City',
        'CountyName', 'ROI', 'std', 'mean' and 'CV'. Every other column is
        treated as a month label in 'YYYY-MM' form.

    Returns
    -------
    pandas.DataFrame
        One row per identifier combination and month, with the added columns
        'date' (datetime), 'value' and 'year'. Rows whose 'value' is missing
        are dropped.

    Raises
    ------
    ValueError
        If a non-identifier column label does not match 'YYYY-MM'.
    """
    id_columns = ['zipcode', 'City', 'CountyName', 'ROI', 'std', 'mean', 'CV']
    melted = pd.melt(raw_df, id_vars=id_columns, var_name='date')

    # An explicit format replaces the deprecated infer_datetime_format flag and
    # fails loudly on a malformed column label instead of guessing.
    melted['date'] = pd.to_datetime(melted['date'], format='%Y-%m')

    # Vectorised accessor instead of a Python-level loop over every timestamp.
    melted['year'] = melted['date'].dt.year

    return melted.dropna(subset=['value'])

In [7]:
# Reshape to long format (one row per zip code and month), then summarise
# the resulting columns, dtypes and non-null counts.
df = melt_data(raw_df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167292 entries, 0 to 170107
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   zipcode     167292 non-null  int64         
 1   City        167292 non-null  object        
 2   CountyName  167292 non-null  object        
 3   ROI         162884 non-null  float64       
 4   std         167292 non-null  float64       
 5   mean        167292 non-null  float64       
 6   CV          167292 non-null  float64       
 7   date        167292 non-null  datetime64[ns]
 8   value       167292 non-null  float64       
 9   year        167292 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 14.0+ MB


In [8]:
# Promote the 'date' column to the index; set_index removes it from the
# columns in the same step.
df = df.set_index('date')
df

Unnamed: 0_level_0,zipcode,City,CountyName,ROI,std,mean,CV,value,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01,75070,McKinney,Collin,0.714438,38257.775512,227040.116279,0.168507,187700.0,2004
2004-01-01,77494,Katy,Harris,0.423210,33208.367605,266633.139535,0.124547,231800.0,2004
2004-01-01,79936,El Paso,El Paso,0.446429,10248.462666,113729.651163,0.090112,84000.0,2004
2004-01-01,77084,Houston,Harris,0.376047,14170.210695,130143.604651,0.108881,119400.0,2004
2004-01-01,77449,Katy,Harris,0.422115,16508.811448,134945.348837,0.122337,123900.0,2004
...,...,...,...,...,...,...,...,...,...
2018-04-01,76941,Mertzon,Irion,,13734.029878,99318.367347,0.138283,121600.0,2018
2018-04-01,79313,Anton,Hockley,,5286.674724,54424.675325,0.097137,63300.0,2018
2018-04-01,79355,Plains,Yoakum,,12929.465872,78967.346939,0.163732,93500.0,2018
2018-04-01,79366,Ransom Canyon,Lubbock,0.590488,25909.956515,189098.837209,0.137018,257500.0,2018


In [9]:
# First and third quartiles of every monthly observation across all zip codes.
q1, q3 = df['value'].quantile([.25, .75])

# Keep the observations that lie inside the interquartile range (bounds included).
# NOTE(review): the filter is applied to individual rows, not to zip codes, so a
# zip code whose value crosses q1 or q3 over time keeps only part of its history.
in_middle_half = df['value'].between(q1, q3)
filtered = df[in_middle_half]
filtered

Unnamed: 0_level_0,zipcode,City,CountyName,ROI,std,mean,CV,value,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01,77084,Houston,Harris,0.376047,14170.210695,130143.604651,0.108881,119400.0,2004
2004-01-01,77449,Katy,Harris,0.422115,16508.811448,134945.348837,0.122337,123900.0,2004
2004-01-01,78660,Pflugerville,Travis,0.578674,23031.089035,180093.023256,0.127884,153800.0,2004
2004-01-01,77573,League City,Galveston,0.568086,23867.673325,199441.279070,0.119673,166700.0,2004
2004-01-01,79912,El Paso,El Paso,0.476452,13149.347629,171614.534884,0.076621,127400.0,2004
...,...,...,...,...,...,...,...,...,...
2018-04-01,76064,Maypearl,Ellis,0.659048,15308.985812,136593.604651,0.112077,174200.0,2018
2018-04-01,77577,Liverpool,Brazoria,0.761511,17597.138719,108870.348837,0.161634,149200.0,2018
2018-04-01,77663,Kountze,Hardin,0.312721,11184.253614,131325.581395,0.085164,148600.0,2018
2018-04-01,76941,Mertzon,Irion,,13734.029878,99318.367347,0.138283,121600.0,2018


In [10]:
# Continue with the filtered observations as the working frame. An explicit
# copy states the intent (an independent frame rather than a slice of the
# melted data) more clearly than re-wrapping the slice in pd.DataFrame().
df = filtered.copy()
df

Unnamed: 0_level_0,zipcode,City,CountyName,ROI,std,mean,CV,value,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01,77084,Houston,Harris,0.376047,14170.210695,130143.604651,0.108881,119400.0,2004
2004-01-01,77449,Katy,Harris,0.422115,16508.811448,134945.348837,0.122337,123900.0,2004
2004-01-01,78660,Pflugerville,Travis,0.578674,23031.089035,180093.023256,0.127884,153800.0,2004
2004-01-01,77573,League City,Galveston,0.568086,23867.673325,199441.279070,0.119673,166700.0,2004
2004-01-01,79912,El Paso,El Paso,0.476452,13149.347629,171614.534884,0.076621,127400.0,2004
...,...,...,...,...,...,...,...,...,...
2018-04-01,76064,Maypearl,Ellis,0.659048,15308.985812,136593.604651,0.112077,174200.0,2018
2018-04-01,77577,Liverpool,Brazoria,0.761511,17597.138719,108870.348837,0.161634,149200.0,2018
2018-04-01,77663,Kountze,Hardin,0.312721,11184.253614,131325.581395,0.085164,148600.0,2018
2018-04-01,76941,Mertzon,Irion,,13734.029878,99318.367347,0.138283,121600.0,2018


In [11]:
# Load the 2019 median-income table for zip codes.
df_income = pd.read_csv('data/zip_codes_2019_median_inc.csv')

# The state abbreviation is the last two characters of 'Preferred name'
# (for example "Austin, TX").
df_income['state'] = df_income['Preferred name'].str[-2:]

# Keep Texas entries that are not PO boxes.
is_texas = df_income['state'] == 'TX'
not_po_box = df_income['Type'] != 'PO box'
df_income = df_income[is_texas & not_po_box]
df_income

Unnamed: 0,ZIP Code,Type,State FIPS,Preferred name,Alternate names,Population (2019),Housing units (2019),Median family income (2019),MFI percentile (2019),Latitude,Longitude,Land area,Water area,state
31142,73301,unique,48,"Austin, TX",Irs Service Center,,,,,,,,,TX
31143,73344,unique,48,"Austin, TX",Irs Service Center,,,,,,,,,TX
31744,75001,standard,48,"Addison, TX",,14992,9298,"$79,551",68.0,32.959999,-96.838997,3.834,0.003,TX
31745,75002,standard,48,"Allen, TX","Lucas, Parker",71253,23314,"$105,656",88.0,33.090000,-96.609001,37.179,2.056,TX
31746,75006,standard,48,"Carrollton, TX",,51642,19178,"$72,325",57.0,32.962002,-96.899002,16.875,0.254,TX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34257,79961,unique,48,"El Paso, TX",El Paso Water Utilities,,,,,,,,,TX
34258,79968,unique,48,"El Paso, TX",Univ Of Tx Elp,,,,,,,,,TX
34259,79976,unique,48,"El Paso, TX",Southern Union Gas Co,,,,,,,,,TX
34260,79978,unique,48,"El Paso, TX",El Paso Natural Gas,,,,,,,,,TX


Clean up column names

In [12]:
# Short snake_case names; 'zipcode' matches the key used in the price data.
income_column_names = {
    'ZIP Code': 'zipcode',
    'Population (2019)': 'population',
    'Housing units (2019)': 'housing_units',
}
df_income = df_income.rename(columns=income_column_names)

In [13]:
# df_income is a filtered slice of the CSV frame. Work on explicit copies so
# the column assignments below do not raise SettingWithCopyWarning.
df_income = df_income.copy()

# Turn strings such as "$79,551" into plain digit strings.
df_income['Median family income (2019)'] = (
    df_income['Median family income (2019)']
    .str.strip('$\n\t')
    .str.replace(',', '', regex=False)
)

# Rows without an income figure cannot be converted to int, so drop them first.
df_income = df_income[df_income['Median family income (2019)'].notna()].copy()
df_income['median_family_income'] = df_income['Median family income (2019)'].astype(int)
 

In [14]:
# Check column names, dtypes and non-null counts after parsing the income column.
df_income.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1623 entries, 31744 to 34238
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   zipcode                      1623 non-null   int64  
 1   Type                         1623 non-null   object 
 2   State FIPS                   1623 non-null   int64  
 3   Preferred name               1623 non-null   object 
 4   Alternate names              689 non-null    object 
 5   population                   1623 non-null   object 
 6   housing_units                1623 non-null   object 
 7   Median family income (2019)  1623 non-null   object 
 8   MFI percentile (2019)        1623 non-null   float64
 9   Latitude                     1623 non-null   float64
 10  Longitude                    1623 non-null   float64
 11  Land area                    1623 non-null   float64
 12  Water area                   1623 non-null   float64
 13  state        

In [15]:
# Drop the columns that are not used downstream. They are listed by name
# rather than by position so the cell does not depend on column order, and
# errors='ignore' makes it safe to re-run after the columns are already gone.
unused_income_columns = [
    'Type',
    'State FIPS',
    'Preferred name',
    'Alternate names',
    'Median family income (2019)',   # superseded by median_family_income
    'MFI percentile (2019)',
    'Land area',
    'Water area',
]
df_income = df_income.drop(columns=unused_income_columns, errors='ignore')


In [16]:
# Confirm which columns remain after the drop.
df_income.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1623 entries, 31744 to 34238
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   zipcode               1623 non-null   int64  
 1   population            1623 non-null   object 
 2   housing_units         1623 non-null   object 
 3   Latitude              1623 non-null   float64
 4   Longitude             1623 non-null   float64
 5   state                 1623 non-null   object 
 6   median_family_income  1623 non-null   int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 101.4+ KB


In [18]:
# Attach the zip-code level income and demographic columns to every price row.
# The join key is named explicitly so the merge cannot silently switch keys if
# the two frames ever share another column name. The left join keeps price rows
# whose zip code has no income record; their income columns are NaN, which is
# why median_family_income comes back as float.
# 'date' is moved out of the index for the merge and restored afterwards,
# because merge() does not preserve the left frame's index.
df_merged = (
    df.reset_index()
    .merge(df_income, how='left', on='zipcode')
    .set_index('date')
)
df_merged

Unnamed: 0_level_0,zipcode,City,CountyName,ROI,std,mean,CV,value,year,population,housing_units,Latitude,Longitude,state,median_family_income
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-01,77084,Houston,Harris,0.376047,14170.210695,130143.604651,0.108881,119400.0,2004,107673,37968,29.827000,-95.660004,TX,70460.0
2004-01-01,77449,Katy,Harris,0.422115,16508.811448,134945.348837,0.122337,123900.0,2004,128294,38672,29.837999,-95.734001,TX,82716.0
2004-01-01,78660,Pflugerville,Travis,0.578674,23031.089035,180093.023256,0.127884,153800.0,2004,91300,31981,30.440001,-97.595001,TX,99733.0
2004-01-01,77573,League City,Galveston,0.568086,23867.673325,199441.279070,0.119673,166700.0,2004,88131,31342,29.504000,-95.086998,TX,121943.0
2004-01-01,79912,El Paso,El Paso,0.476452,13149.347629,171614.534884,0.076621,127400.0,2004,78267,32370,31.849001,-106.533997,TX,77146.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-04-01,76064,Maypearl,Ellis,0.659048,15308.985812,136593.604651,0.112077,174200.0,2018,1818,633,32.301998,-97.037003,TX,69350.0
2018-04-01,77577,Liverpool,Brazoria,0.761511,17597.138719,108870.348837,0.161634,149200.0,2018,1481,719,29.284000,-95.281998,TX,76382.0
2018-04-01,77663,Kountze,Hardin,0.312721,11184.253614,131325.581395,0.085164,148600.0,2018,,,,,,
2018-04-01,76941,Mertzon,Irion,,13734.029878,99318.367347,0.138283,121600.0,2018,1455,783,31.271999,-100.889000,TX,64464.0


In [19]:
# df.drop(df.columns[np.r_[9:18,20,21]], axis=1, inplace=True)

In [20]:
# Display the price frame as it was before the merge; it does not contain the
# income columns.
# NOTE(review): if the merged result was meant here, inspect df_merged instead.
df

Unnamed: 0_level_0,zipcode,City,CountyName,ROI,std,mean,CV,value,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01,77084,Houston,Harris,0.376047,14170.210695,130143.604651,0.108881,119400.0,2004
2004-01-01,77449,Katy,Harris,0.422115,16508.811448,134945.348837,0.122337,123900.0,2004
2004-01-01,78660,Pflugerville,Travis,0.578674,23031.089035,180093.023256,0.127884,153800.0,2004
2004-01-01,77573,League City,Galveston,0.568086,23867.673325,199441.279070,0.119673,166700.0,2004
2004-01-01,79912,El Paso,El Paso,0.476452,13149.347629,171614.534884,0.076621,127400.0,2004
...,...,...,...,...,...,...,...,...,...
2018-04-01,76064,Maypearl,Ellis,0.659048,15308.985812,136593.604651,0.112077,174200.0,2018
2018-04-01,77577,Liverpool,Brazoria,0.761511,17597.138719,108870.348837,0.161634,149200.0,2018
2018-04-01,77663,Kountze,Hardin,0.312721,11184.253614,131325.581395,0.085164,148600.0,2018
2018-04-01,76941,Mertzon,Irion,,13734.029878,99318.367347,0.138283,121600.0,2018


In [None]:
# Number of price rows whose zip code had no match in the income table.
# The income columns exist only on the merged frame; `df` has no
# 'median_family_income' column and raises KeyError.
df_merged['median_family_income'].isna().sum()


# EDA

Create DataFrame to perform stationarity checks and transformations

In [None]:
# Single-column frame of home values that keeps the date index of df.
df_prices = df['value'].to_frame()
df_prices

**Mean prices by month**

In [None]:
# Mean home value across zip codes for each monthly date.
avg_prices = df_prices.groupby('date').aggregate({'value': 'mean'})

# Title and axis labels let the figure stand on its own next to the other
# per-date summary plots.
ax = avg_prices.plot(legend=False, title='Mean home value by month')
ax.set(xlabel='date', ylabel='mean value');

In [None]:
# Standard deviation of home values across zip codes for each monthly date.
std_prices = df_prices.groupby('date').aggregate({'value': 'std'})

# Title and axis labels let the figure stand on its own next to the other
# per-date summary plots.
ax = std_prices.plot(legend=False, title='Standard deviation of home value by month')
ax.set(xlabel='date', ylabel='standard deviation of value');

In [None]:
# Median home value across zip codes for each monthly date.
med_prices = df_prices.groupby('date').aggregate({'value': 'median'})

# Title and axis labels let the figure stand on its own next to the other
# per-date summary plots.
ax = med_prices.plot(legend=False, title='Median home value by month')
ax.set(xlabel='date', ylabel='median value');

In [None]:
def stationarity_check(df):
    """Print an augmented Dickey-Fuller test and plot 12-row rolling statistics.

    Parameters
    ----------
    df : pandas.Series or single-column pandas.DataFrame
        Observations in time order. The rolling window spans 12 consecutive
        rows, so the input should hold one row per period.

    Returns
    -------
    None
        The test summary is printed and the rolling mean and rolling standard
        deviation are plotted.
    """
    roll_mean = df.rolling(window=12, center=False).mean()
    roll_std = df.rolling(window=12, center=False).std()

    # adfuller needs a 1-D sample without missing values: flatten a
    # single-column frame to a Series and drop any NaN before testing.
    observed = df.squeeze() if isinstance(df, pd.DataFrame) else df
    df_test = adfuller(observed.dropna())
    print('Results of Dickey-Fuller Test: \n')

    # The first four entries of the result are the scalar statistics; the
    # fifth is a dict of critical values keyed by significance level.
    dfoutput = pd.Series(df_test[0:4], index=['Test Statistic', 'p-value',
                                             '#Lags Used', 'Number of Observations Used'])
    for key, value in df_test[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

    roll_mean.plot(title='Rolling mean (window=12)')
    roll_std.plot(title='Rolling standard deviation (window=12)')
    return None

In [None]:
# The Dickey-Fuller test and the 12-row rolling windows treat consecutive rows
# as consecutive periods. df_prices holds one row per zip code for every date,
# so the check is run on the monthly mean series instead.
# stationarity_check only prints and plots; it returns None.
results = stationarity_check(avg_prices)

**Log Transformations**

In [None]:
# Log-transform the monthly mean series. The per-zip-code frame df_prices has
# many rows for each date, so plotting it or feeding it to the row-based
# smoothing and modelling steps that follow would mix zip codes together
# instead of following one series through time.
df_log = np.log(avg_prices)

fig = plt.figure(figsize=(15,8))
plt.plot(df_log, color='blue')
plt.title('Log of mean home value by month');


**Weighted Rolling Mean**

In [None]:
# Exponentially weighted mean of the logged values (half-life of 4 rows).
weight_roll_mean = df_log.ewm(halflife=4).mean()

# Subtract the weighted mean from the logged values and plot the remainder.
df_log_wrm = df_log.sub(weight_roll_mean)
df_log_wrm.plot(figsize=(15,8))



In [None]:
# Inspect the type of the detrended frame's index before modelling.
type(df_log_wrm.index)

In [None]:
# ARIMA is imported with the other dependencies in the first cell.

# ARIMA models a single, regularly spaced series. Collapse the input to one
# observation per date (a no-op when the dates are already unique) and declare
# the month-start frequency of the data explicitly.
# NOTE(review): if df_log_wrm was built from per-zip-code rows, the detrending
# upstream also mixed zip codes -- confirm the input is a monthly series.
arima_input = df_log_wrm['value'].groupby(level='date').mean().asfreq('MS')

# AR(1) model: one autoregressive term, no differencing, no moving-average term.
mod = ARIMA(arima_input, order=(1, 0, 0))
res = mod.fit()
print(res.summary())