In [33]:
import altair as alt
import pandas as pd
import numpy as np

In [63]:
# Read zipcode as object (text) rather than letting pandas infer a numeric type:
# some ZIP codes start with 0, and that leading zero is lost once the value is
# cast to int/float.
zipcode_as_text = {'zipcode': 'object'}
apts2019 = pd.read_csv('data/apts_2019_clean.csv', dtype=zipcode_as_text)
apts2025 = pd.read_csv('data/apts_2025_clean.csv', dtype=zipcode_as_text)

In [64]:
# Show the column labels of both yearly frames together to see which columns
# they share before combining them.
apts2019.columns, apts2025.columns

(Index(['id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms',
        'has_photo', 'pets_allowed', 'price', 'price_type', 'square_feet',
        'address', 'cityname', 'state', 'latitude', 'longitude', 'time',
        'zipcode'],
       dtype='object'),
 Index(['zpid', 'address', 'cityname', 'state', 'zipcode', 'latitude',
        'longitude', 'title', 'square_feet', 'price', 'bedrooms'],
       dtype='object'))

In [65]:
# Tag every row with the year of the dataset it came from, so the two sources
# remain distinguishable after they are stacked into one frame.
for frame, listing_year in ((apts2019, 2019), (apts2025, 2025)):
    frame['year'] = listing_year

In [66]:
# Print dtypes and non-null counts for the 2025 frame.
apts2025.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5697 entries, 0 to 5696
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   zpid         5697 non-null   int64  
 1   address      5697 non-null   object 
 2   cityname     5697 non-null   object 
 3   state        5697 non-null   object 
 4   zipcode      5697 non-null   object 
 5   latitude     5697 non-null   float64
 6   longitude    5697 non-null   float64
 7   title        5665 non-null   object 
 8   square_feet  218 non-null    float64
 9   price        5697 non-null   int64  
 10  bedrooms     5697 non-null   int64  
 11  year         5697 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 534.2+ KB


In [67]:
# Print dtypes and non-null counts for the 2019 frame.
apts2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9612 entries, 0 to 9611
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            9612 non-null   int64  
 1   category      9612 non-null   object 
 2   title         9612 non-null   object 
 3   body          9612 non-null   object 
 4   amenities     6301 non-null   object 
 5   bathrooms     9582 non-null   float64
 6   bedrooms      9606 non-null   float64
 7   has_photo     9612 non-null   object 
 8   pets_allowed  5637 non-null   object 
 9   price         9612 non-null   int64  
 10  price_type    9612 non-null   object 
 11  square_feet   9612 non-null   int64  
 12  address       6346 non-null   object 
 13  cityname      9612 non-null   object 
 14  state         9612 non-null   object 
 15  latitude      9612 non-null   float64
 16  longitude     9612 non-null   float64
 17  time          9612 non-null   int64  
 18  zipcode       9605 non-null 

In [68]:
# Stack the 2019 rows above the 2025 rows and renumber the index from 0.
# sort=False leaves the columns in their order of appearance instead of
# sorting them alphabetically.
yearly_frames = [apts2019, apts2025]
apts_combined = pd.concat(yearly_frames, ignore_index=True, sort=False)

In [69]:
# Display the concatenated frame to eyeball the result of the concat.
apts_combined

Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,has_photo,pets_allowed,price,...,square_feet,address,cityname,state,latitude,longitude,time,zipcode,year,zpid
0,5.668627e+09,housing/rent/apartment,"Studio apartment 2nd St NE, Uhland Terrace NE,...","This unit is located at second St NE, Uhland T...",,,0.0,Thumbnail,,790,...,101.0,,Washington,DC,38.905700,-76.98610,1.577359e+09,20002,2019,
1,5.664597e+09,housing/rent/apartment,Studio apartment 814 Schutte Road,"This unit is located at 814 Schutte Road, Evan...",,,1.0,Thumbnail,,425,...,106.0,814 Schutte Rd,Evansville,IN,37.968000,-87.66210,1.577017e+09,47712,2019,
2,5.668627e+09,housing/rent/apartment,"Studio apartment N Scott St, 14th St N, Arling...","This unit is located at N Scott St, 14th St N,...",,1.0,0.0,Thumbnail,,1390,...,107.0,,Arlington,VA,38.891000,-77.08160,1.577359e+09,22201,2019,
3,5.659918e+09,housing/rent/apartment,Studio apartment 1717 12th Ave,"This unit is located at 1717 12th Ave, Seattle...",,1.0,0.0,Thumbnail,,925,...,116.0,1717 12th Avenue,Seattle,WA,47.616000,-122.32750,1.576668e+09,98122,2019,
4,5.668627e+09,housing/rent/apartment,"Studio apartment Washington Blvd, N Cleveland ...","This unit is located at Washington Blvd, N Cle...",,,0.0,Thumbnail,,880,...,125.0,,Arlington,VA,38.873800,-77.10550,1.577359e+09,22203,2019,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15304,,,1427 Ernest,,,,0.0,,,1796,...,,1427 Ernest St,Honolulu,HI,21.305540,-157.84510,,96822,2025,452016081.0
15305,,,"829 Waiaka, 2Bed",,,,2.0,,,2400,...,653.0,828 Waiaka Pl #86D051593,Honolulu,HI,21.288483,-157.81987,,96826,2025,456931480.0
15306,,,"1226 Kinau, 303",,,,0.0,,,1975,...,239.0,1226 Kinau St #303,Honolulu,HI,21.302372,-157.84195,,96814,2025,440375256.0
15307,,,"Naulu, Studio",,,,0.0,,,637,...,,99-009 Kalaloa St #8E28C4E12,Aiea,HI,21.365606,-157.93646,,96701,2025,447666734.0


In [71]:
# Drop the columns that are not kept in the combined output; the result is a
# new frame, so apts_combined itself is left as it was.
unused_columns = ['zpid', 'id', 'time', 'category', 'has_photo']
apts = apts_combined.drop(columns=unused_columns)

In [72]:
# List the columns that remain after the drop.
apts.columns

Index(['title', 'body', 'amenities', 'bathrooms', 'bedrooms', 'pets_allowed',
       'price', 'price_type', 'square_feet', 'address', 'cityname', 'state',
       'latitude', 'longitude', 'zipcode', 'year'],
      dtype='object')

In [74]:
# Print dtypes and non-null counts for the trimmed, combined frame.
apts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15309 entries, 0 to 15308
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         15277 non-null  object 
 1   body          9612 non-null   object 
 2   amenities     6301 non-null   object 
 3   bathrooms     9582 non-null   float64
 4   bedrooms      15303 non-null  float64
 5   pets_allowed  5637 non-null   object 
 6   price         15309 non-null  int64  
 7   price_type    9612 non-null   object 
 8   square_feet   9830 non-null   float64
 9   address       12043 non-null  object 
 10  cityname      15309 non-null  object 
 11  state         15309 non-null  object 
 12  latitude      15309 non-null  float64
 13  longitude     15309 non-null  float64
 14  zipcode       15302 non-null  object 
 15  year          15309 non-null  int64  
dtypes: float64(5), int64(2), object(9)
memory usage: 1.9+ MB


In [75]:
# Write the combined listings to disk as UTF-8 CSV, leaving out the row index.
combined_csv_path = 'data/apts_combined.csv'
apts.to_csv(combined_csv_path, encoding="utf-8", index=False)