In [52]:
import pandas as pd
import numpy as np

# Input datasets (metro & US level), numbered as in the project plan.
#
# 1. Mortgage: calculated, so no file is read for it here.

# 2. Rentals: ZORI (smoothed), all homes plus multifamily time series.
#    The index is dollar-denominated: it is the mean of listed rents that fall
#    in the 40th-60th percentile range for all homes and apartments in a given
#    region, weighted to reflect the rental housing stock.
rentals = pd.read_csv(
    "data/Dataset to use/Metro_zori_uc_sfrcondomfr_sm_month.csv"
)

# 3. Sales: median sale price (smoothed, all homes, monthly).
#    Definitions of the related sales metrics, kept for reference:
#    - Sales Count Nowcast: estimated number of unique properties sold during
#      the month after accounting for the latency between when sales occur
#      and when they are reported (raw cut of all homes only).
#    - Sale Price (median/mean): price at which homes across various
#      geographies were sold.
#    - Sale-to-List Ratio (mean/median): ratio of sale vs. final list price.
#    - Percent of Sales Below/Above List: share of sales whose sale price was
#      below/above the final list price; homes sold for exactly the list
#      price are excluded.
median_sales = pd.read_csv(
    "data/Dataset to use/Metro_median_sale_price_uc_sfrcondo_sm_month.csv"
)

# 4. ZHVF (forecast): all homes (SFR, condo/co-op), raw, mid-tier.
#    A month-ahead, quarter-ahead and year-ahead forecast of the Zillow Home
#    Value Index (ZHVI).
home_forecast = pd.read_csv(
    "data/Dataset to use/Metro_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_month.csv"
)

# Quick look at the forecast table to confirm the load worked.
print(home_forecast.head())


   RegionID  SizeRank       RegionName RegionType StateName    BaseDate  \
0    102001         0    United States    country       NaN  2024-02-29   
1    394913         1     New York, NY        msa        NY  2024-02-29   
2    753899         2  Los Angeles, CA        msa        CA  2024-02-29   
3    394463         3      Chicago, IL        msa        IL  2024-02-29   
4    394514         4       Dallas, TX        msa        TX  2024-02-29   

   2024-03-31  2024-05-31  2025-02-28  
0         0.8         2.2         0.6  
1         0.5         1.0        -1.6  
2         1.0         2.3        -0.9  
3         1.1         3.0        -1.0  
4         1.1         2.8         0.8  


In [50]:
# Rentals clean-up: wide -> long, parsed dates, metro rows only, readable
# column names, full state names. The result is written to CSV.

# Drop (in place) any row that has no RegionID or RegionName.
rentals.dropna(subset=['RegionID', 'RegionName'], inplace=True)

# One row per (region, month): the wide date columns are melted into a "Date"
# column, parsed as datetimes (values that cannot be parsed become NaT), and
# rows without a rent value are discarded.
rentals_melted = (
    rentals
    .melt(
        id_vars=["RegionID", "SizeRank", "RegionName", "RegionType", "StateName"],
        var_name="Date",
        value_name="Rent",
    )
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['Rent'])
)

# Two-letter postal code -> full state name.
state_abbreviations = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
}

# Remove the national ("country") rows, switch to human-readable headers, and
# expand state abbreviations. Values with no entry in the lookup (including
# missing ones) are left exactly as they were.
state_rentals = (
    rentals_melted
    .loc[rentals_melted['RegionType'] != 'country']
    .rename(columns={
        'RegionName': 'Region Name',
        'RegionType': 'Region Type',
        'StateName': 'State Name',
        'Date': 'Date Recorded',
        'Rent': 'Monthly Rent',
    })
    .assign(**{
        'State Name': lambda frame: frame['State Name']
        .map(state_abbreviations)
        .fillna(frame['State Name'])
    })
)

print(state_rentals.head())

# Persist the cleaned, long-format table.
state_rentals.to_csv("cleaned_state_rentals.csv", index=False)


   RegionID  SizeRank      Region Name Region Type  State Name Date Recorded  \
1    394913         1     New York, NY         msa    New York    2015-01-31   
2    753899         2  Los Angeles, CA         msa  California    2015-01-31   
3    394463         3      Chicago, IL         msa    Illinois    2015-01-31   
4    394514         4       Dallas, TX         msa       Texas    2015-01-31   
5    394692         5      Houston, TX         msa       Texas    2015-01-31   

   Monthly Rent  
1   2286.918320  
2   1833.212831  
3   1418.891001  
4   1104.552997  
5   1226.598136  


In [51]:
# Median sale price clean-up: wide -> long, parsed dates, metro rows only,
# readable column names, full state names. The result is written to CSV.

# Melt the wide date columns into a "Date" column, parse it as datetimes
# (values that cannot be parsed become NaT), and drop rows with no price.
median_sales_melted = (
    median_sales
    .melt(
        id_vars=["RegionID", "SizeRank", "RegionName", "RegionType", "StateName"],
        var_name="Date",
        value_name="MedianSalePrice",
    )
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['MedianSalePrice'])
)

# Remove the national ("country") rows, rename the columns, and expand state
# abbreviations to full names.
#
# `state_abbreviations` is the lookup defined in the rentals cleaning cell
# above; it is reused here instead of being re-declared so that there is a
# single copy to maintain. Run that cell first. Values with no entry in the
# lookup (including missing ones) are left exactly as they were.
state_median_sales = (
    median_sales_melted
    .loc[median_sales_melted['RegionType'] != 'country']
    .rename(columns={
        'RegionName': 'Region Name',
        'RegionType': 'Region Type',
        'StateName': 'State Name',
        'Date': 'Date Recorded',
        'MedianSalePrice': 'Median Sale Price',
    })
    .assign(**{
        'State Name': lambda frame: frame['State Name']
        .map(state_abbreviations)
        .fillna(frame['State Name'])
    })
)

# Export the cleaned and transformed data to a CSV file
state_median_sales.to_csv("cleaned_median_sales.csv", index=False)

# Print the head of the dataset to verify the changes
print(state_median_sales.head())


   RegionID  SizeRank      Region Name Region Type  State Name Date Recorded  \
1    394913         1     New York, NY         msa    New York    2008-04-30   
2    753899         2  Los Angeles, CA         msa  California    2008-04-30   
3    394463         3      Chicago, IL         msa    Illinois    2008-04-30   
4    394514         4       Dallas, TX         msa       Texas    2008-04-30   
5    394692         5      Houston, TX         msa       Texas    2008-04-30   

   Median Sale Price  
1           395000.0  
2           461000.0  
3           227333.0  
4           142967.0  
5           145788.0  


In [53]:
# Home value forecast clean-up: wide -> long, parsed dates, metro rows only,
# readable column names, full state names. The result is written to CSV.

# Melt the forecast-horizon columns into "ForecastDate"/"ForecastGrowth",
# parse both date columns as datetimes (values that cannot be parsed become
# NaT), and drop rows that have no forecast value.
home_forecast_melted = (
    home_forecast
    .melt(
        id_vars=["RegionID", "SizeRank", "RegionName", "RegionType", "StateName", "BaseDate"],
        var_name="ForecastDate",
        value_name="ForecastGrowth",
    )
    .assign(
        BaseDate=lambda frame: pd.to_datetime(frame['BaseDate'], errors='coerce'),
        ForecastDate=lambda frame: pd.to_datetime(frame['ForecastDate'], errors='coerce'),
    )
    .dropna(subset=['ForecastGrowth'])
)

# Remove the national ("country") rows, rename the columns, and expand state
# abbreviations to full names.
#
# This uses the previously defined `state_abbreviations` lookup (declared in
# the rentals cleaning cell above) rather than pasting another copy of it, so
# that cell must have been run first. Values with no entry in the lookup
# (including missing ones) are left exactly as they were.
state_home_forecast = (
    home_forecast_melted
    .loc[home_forecast_melted['RegionType'] != 'country']
    .rename(columns={
        'RegionName': 'Region Name',
        'RegionType': 'Region Type',
        'StateName': 'State Name',
        'BaseDate': 'Base Date Recorded',
        'ForecastDate': 'Forecast Date',
        'ForecastGrowth': 'Forecast Growth (%)',
    })
    .assign(**{
        'State Name': lambda frame: frame['State Name']
        .map(state_abbreviations)
        .fillna(frame['State Name'])
    })
)

# Export the cleaned and transformed data to a CSV file
state_home_forecast.to_csv("cleaned_state_home_forecast.csv", index=False)

# Print the head of the dataset to verify the changes
print(state_home_forecast.head())


   RegionID  SizeRank      Region Name Region Type  State Name  \
1    394913         1     New York, NY         msa    New York   
2    753899         2  Los Angeles, CA         msa  California   
3    394463         3      Chicago, IL         msa    Illinois   
4    394514         4       Dallas, TX         msa       Texas   
5    394692         5      Houston, TX         msa       Texas   

  Base Date Recorded Forecast Date  Forecast Growth (%)  
1         2024-02-29    2024-03-31                  0.5  
2         2024-02-29    2024-03-31                  1.0  
3         2024-02-29    2024-03-31                  1.1  
4         2024-02-29    2024-03-31                  1.1  
5         2024-02-29    2024-03-31                  0.8  
