In [14]:
import pandas as pd
from pandas import json_normalize

In [15]:
# Load the JSON file as a DataFrame
property_data_df = pd.read_json("Data/random_property_data.json")

In [16]:
print(property_data_df.columns)

Index(['abbreviatedAddress', 'address', 'apartmentsForRentInZipcodeSearchUrl',
       'attributionInfo', 'bathrooms', 'bedrooms', 'big', 'boroughId',
       'boroughSearchUrl', 'brokerId',
       ...
       'zestimateHighPercent', 'zestimateLowPercent', 'zillowOfferMarket',
       'zipPlusFour', 'zipcode', 'zipcodeSearchUrl', 'zoContactSubtitle',
       'zoMarketName', 'zoResaleStartAnOfferEnabled', 'zpid'],
      dtype='object', length=212)


In [248]:
desired_columns = [
    'county', 'homeType', 'livingArea', 'livingAreaUnits', 'lotSize', 'lotAreaUnits',
    'neighborhoodRegion', 'resoFacts', 'price', 'hiResImageLink'
]

filtered_property_data_df = property_data_df[desired_columns]

print(filtered_property_data_df.head())

         county       homeType  livingArea livingAreaUnits  lotSize   
0  Story County  SINGLE_FAMILY      1404.0     Square Feet     4330  \
1  Story County  SINGLE_FAMILY      2021.0     Square Feet     8001   
2  Boone County            LOT         NaN            None    87120   
3  Boone County  SINGLE_FAMILY      2215.0     Square Feet    49658   
4  Story County  SINGLE_FAMILY      1050.0     Square Feet     8702   

  lotAreaUnits                  neighborhoodRegion   
0  Square Feet           {'name': 'Oak-Riverside'}  \
1  Square Feet        {'name': 'Somerset Village'}   
2        Acres                                None   
3        Acres                                None   
4  Square Feet  {'name': 'College Creek/Old Ames'}   

                                           resoFacts   price   
0  {'aboveGradeFinishedArea': '1,404', 'accessibi...  239900  \
1  {'aboveGradeFinishedArea': None, 'accessibilit...  405000   
2  {'aboveGradeFinishedArea': None, 'accessibilit...  22

In [249]:
# Flatten the nested data in the DataFrame
flattened_property_data_df = json_normalize(filtered_property_data_df.to_dict(orient="records"))
print(flattened_property_data_df.head())

         county       homeType  livingArea livingAreaUnits  lotSize   
0  Story County  SINGLE_FAMILY      1404.0     Square Feet     4330  \
1  Story County  SINGLE_FAMILY      2021.0     Square Feet     8001   
2  Boone County            LOT         NaN            None    87120   
3  Boone County  SINGLE_FAMILY      2215.0     Square Feet    49658   
4  Story County  SINGLE_FAMILY      1050.0     Square Feet     8702   

  lotAreaUnits   price                                     hiResImageLink   
0  Square Feet  239900  https://photos.zillowstatic.com/fp/df0bd099740...  \
1  Square Feet  405000  https://photos.zillowstatic.com/fp/4f94778f62c...   
2        Acres  228000  https://photos.zillowstatic.com/fp/17cec68ca0e...   
3        Acres  924900  https://photos.zillowstatic.com/fp/17839d6b2ae...   
4  Square Feet  470000  https://photos.zillowstatic.com/fp/737c4d70a7b...   

  neighborhoodRegion.name resoFacts.aboveGradeFinishedArea  ...   
0           Oak-Riverside                  

In [250]:
print(flattened_property_data_df['resoFacts.atAGlanceFacts'])

0     [{'factLabel': 'Type', 'factValue': 'Single Fa...
1     [{'factLabel': 'Type', 'factValue': 'Single Fa...
2                                                  None
3     [{'factLabel': 'Type', 'factValue': 'Single Fa...
4     [{'factLabel': 'Type', 'factValue': 'Single Fa...
5     [{'factLabel': 'Type', 'factValue': 'Townhouse...
6     [{'factLabel': 'Type', 'factValue': 'Single Fa...
7     [{'factLabel': 'Type', 'factValue': 'Single Fa...
8     [{'factLabel': 'Type', 'factValue': 'Townhouse...
9     [{'factLabel': 'Type', 'factValue': 'Townhouse...
10    [{'factLabel': 'Type', 'factValue': 'Single Fa...
11    [{'factLabel': 'Type', 'factValue': 'Single Fa...
12    [{'factLabel': 'Type', 'factValue': 'Townhouse...
13    [{'factLabel': 'Type', 'factValue': 'Single Fa...
14    [{'factLabel': 'Type', 'factValue': 'Single Fa...
15    [{'factLabel': 'Type', 'factValue': 'Single Fa...
16                                                 None
17    [{'factLabel': 'Type', 'factValue': 'Singl

In [251]:
atAGlanceFacts = flattened_property_data_df['resoFacts.atAGlanceFacts']

# Find unique factLabels to create the columns
fact_labels = set()
for row in atAGlanceFacts:
    if row:
        for fact in row:
            fact_labels.add(fact['factLabel'])

fact_labels = list(fact_labels)
facts_df = pd.DataFrame(columns=fact_labels)

for idx, row in atAGlanceFacts.items():
    if row:
        fact_dict = {}
        for fact in row:
            fact_label = fact['factLabel']
            fact_value = fact['factValue']
            fact_dict[fact_label] = fact_value
        facts_df = pd.concat([facts_df, pd.DataFrame(fact_dict, index=[idx])])
    else:
        facts_df = pd.concat([facts_df, pd.DataFrame({}, index=[idx])])

print(facts_df)

   Offer Review Date          Lot Days on Zillow                   Parking   
0               None   4,330 sqft        54 Days   1 Attached Garage space  \
1               None   8,001 sqft              0  2 Attached Garage spaces   
2               None          NaN            NaN                       NaN   
3               None   1.14 Acres        63 Days  3 Attached Garage spaces   
4               None   8,702 sqft         7 Days                      None   
5               None          NaN       231 Days                      None   
6               None   0.31 Acres        21 Days  2 Attached Garage spaces   
7               None  10,454 sqft       376 Days  3 Attached Garage spaces   
8               None          NaN              0   1 Attached Garage space   
9               None   6,534 sqft       174 Days  2 Attached Garage spaces   
10              None   0.31 Acres        91 Days  3 Attached Garage spaces   
11              None   0.31 Acres       160 Days  3 Attached Gar

In [252]:
print(flattened_property_data_df['resoFacts.basement'])

0                                          Full
1                                      Finished
2                                          None
3                                Full,Sump Pump
4                       Full,Partial,Unfinished
5                                          None
6                               Walk-Out Access
7                     Full,Unfinished,Sump Pump
8                                Full,Sump Pump
9     Full,Unfinished,Walk-Out Access,Sump Pump
10           Daylight,Full,Unfinished,Sump Pump
11                               None,Sump Pump
12                             None,Crawl Space
13                               Full,Sump Pump
14                                         Full
15                                         Full
16                                         None
17                    Full,Unfinished,Sump Pump
18                                         None
19                   Unfinished,Walk-Out Access
Name: resoFacts.basement, dtype: object


In [253]:
facts_df['GarageType'] = facts_df['Parking'].apply(
    lambda x: "NA" if pd.isnull(x) or x == "None" or x == "NaN" else 'Attchd' if "Attached" in x else "BuiltIn"
)

In [254]:
facts_df = facts_df.drop(columns=[
    'Type', 'Heating', 'Cooling', 'Parking', 'HOA', 'Lot', 'Days on Zillow', 'Price/sqft', 'Buyers Agency Fee', 'Offer Review Date'
])

In [255]:
flattened_property_data_df = pd.concat([flattened_property_data_df, facts_df], axis=1)

In [256]:
flattened_property_data_df['numStories'] = flattened_property_data_df['resoFacts.stories'].apply(lambda x: "2Story" if x == 2 else "1Story" if x == 1 else None)
flattened_property_data_df['BldgType'] = flattened_property_data_df['homeType'].apply(
    lambda x: '1Fam' if x == "SINGLE_FAMILY" else "TwnhsE" if x == 'CONDO' or x == 'TOWNHOUSE' else "2fmCon" if x == "MULTI_FAMILY" else "LOT"
)

print(flattened_property_data_df['numStories'])


0     1Story
1     2Story
2       None
3     1Story
4     2Story
5     1Story
6     1Story
7     1Story
8     1Story
9     1Story
10    1Story
11    1Story
12    1Story
13    1Story
14    2Story
15      None
16      None
17    1Story
18      None
19    2Story
Name: numStories, dtype: object


In [257]:
flattened_property_data_df['FullBath'] = flattened_property_data_df['resoFacts.bathroomsFull'].fillna(0) + flattened_property_data_df['resoFacts.bathroomsThreeQuarter'].fillna(0)
flattened_property_data_df['HalfBath'] = flattened_property_data_df['resoFacts.bathroomsHalf'].fillna(0) + flattened_property_data_df['resoFacts.bathroomsOneQuarter'].fillna(0) + flattened_property_data_df['resoFacts.bathroomsPartial'].fillna(0)
flattened_property_data_df['BedroomAbvGr'] = flattened_property_data_df['resoFacts.bedrooms'].fillna(0)

In [258]:
flattened_property_data_df = flattened_property_data_df.drop(columns=[
    'livingAreaUnits', 'lotAreaUnits', 'neighborhoodRegion', 'neighborhoodRegion.name', 
    'resoFacts.aboveGradeFinishedArea','resoFacts.architecturalStyle', 'resoFacts.accessibilityFeatures', 'resoFacts.additionalParcelsDescription',
    'resoFacts.allowedPets', 'resoFacts.appliances', 'resoFacts.associationAmenities', 'resoFacts.associationFee', 'resoFacts.associationFee2',
    'resoFacts.associationFeeIncludes', 'resoFacts.associationName', 'resoFacts.associationName2', 'resoFacts.associationPhone',
    'resoFacts.associationPhone2', 'resoFacts.atAGlanceFacts', 'resoFacts.attic', 'resoFacts.availabilityDate', 'resoFacts.basement', 'resoFacts.bathrooms', 
    'resoFacts.bathroomsFull', 'resoFacts.bathroomsHalf', 'resoFacts.bathroomsOneQuarter', 'resoFacts.bathroomsPartial', 'resoFacts.bathroomsThreeQuarter', 
    'resoFacts.basementYN', 'resoFacts.bedrooms', 
])

In [259]:
reso_facts_df = flattened_property_data_df.filter(regex=r'^resoFacts\.', axis=1)

In [260]:
flattened_property_data_df['Fireplaces'] = reso_facts_df['resoFacts.fireplaces'].fillna(0)

In [261]:
flattened_property_data_df['Foundation'] = reso_facts_df['resoFacts.foundationDetails'].apply(
    lambda x: 'Slab' if 'Slab' in x else 'PConc' if "Poured" in x else "BrkTil" if "Brick/Mortar" in x else "CBlock" if "Block" in x else None
)

In [262]:
reso_facts_df['resoFacts.zoningDescription']

0         Residential
1                None
2                None
3                 Res
4     Residential Med
5               F-PRD
6                  RL
7         Residential
8               F-PRD
9     FS-RL-Suburbn R
10        Residential
11    Planned Residen
12              F-PRD
13    Residential Low
14    Residential Low
15                Res
16    Suburban Res Lo
17              FS-RL
18                  R
19               None
Name: resoFacts.zoningDescription, dtype: object

In [263]:
flattened_property_data_df['PoolArea'] = 0.0
flattened_property_data_df['PoolQC'] = 'NA'
flattened_property_data_df['Fence'] = 'NA'

In [264]:
print(non_reso_facts_df['address.community'])

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
Name: address.community, dtype: object


In [276]:
non_reso_facts_df = flattened_property_data_df.filter(regex=r'^(?!resoFacts\.)', axis=1)
non_reso_facts_df.drop(columns=['homeType', 'county', 'numStories', 'hiResImageLink'], inplace=True)
non_reso_facts_df.rename(columns={'livingArea': 'GrLivArea', 'lotSize': 'LotArea', 'price': 'SalePrice', 'Year Built': 'YearBuilt'}, inplace=True)
# Display the non-resoFacts DataFrame
print(non_reso_facts_df.head())

   GrLivArea  LotArea  SalePrice YearBuilt GarageType BldgType  FullBath   
0     1404.0     4330     239900      1958     Attchd     1Fam       2.0  \
1     2021.0     8001     405000      2002     Attchd     1Fam       3.0   
2        NaN    87120     228000       NaN         NA      LOT       0.0   
3     2215.0    49658     924900      2010     Attchd     1Fam       3.0   
4     1050.0     8702     470000      1895         NA     1Fam       1.0   

   HalfBath  BedroomAbvGr  Fireplaces Foundation  PoolArea PoolQC Fence  
0       1.0           6.0         0.0     BrkTil       0.0     NA    NA  
1       1.0           4.0         1.0      PConc       0.0     NA    NA  
2       0.0           0.0         0.0       None       0.0     NA    NA  
3       1.0           4.0         3.0     CBlock       0.0     NA    NA  
4       0.0           5.0         0.0     BrkTil       0.0     NA    NA  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_reso_facts_df.drop(columns=['homeType', 'county', 'numStories', 'hiResImageLink'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_reso_facts_df.rename(columns={'livingArea': 'GrLivArea', 'lotSize': 'LotArea', 'price': 'SalePrice', 'Year Built': 'YearBuilt'}, inplace=True)


In [277]:
print(non_reso_facts_df.columns)

Index(['GrLivArea', 'LotArea', 'SalePrice', 'YearBuilt', 'GarageType',
       'BldgType', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'Fireplaces',
       'Foundation', 'PoolArea', 'PoolQC', 'Fence'],
      dtype='object')
