In [1]:
# import dependencies
import pandas as pd

# Database dependencies
from sqlalchemy import create_engine
from config import username, db_password

In [2]:
#  create the database engine
db_string = f"postgresql://{username}:{db_password}@127.0.0.1:5432/FinalProject"
engine = create_engine(db_string)

In [3]:
# read in data
urban_population_df = pd.read_csv("resources/urban_populations.csv")
city_population_df = pd.read_csv("resources/city_populations.csv")
housing_df = pd.read_csv("resources/realtor-data.csv")

In [4]:
# preview data frames
urban_population_df.head()

Unnamed: 0,city,population_2019,population_2020,population_2021,Sources,Unnamed: 5,Unnamed: 6
0,Mount Vernon,67345,73893,72584,2019: https://www.city-data.com/city/Mount-Ver...,,
1,Mount Vernon,67345,73893,72584,2020: https://www.census.gov/quickfacts/fact/t...,,
2,Mount Vernon,67345,73893,72584,2021: https://censusreporter.org/profiles/1600...,,
3,Mount Vernon,67345,73893,72584,,,
4,Mount Vernon,67345,73893,72584,,,


In [5]:
city_population_df.head()

Unnamed: 0,city,population_2019,population_2020,population_2021,Sources
0,Brooklyn,2559903,2736074,2641052,NYC 2020 census- https://popfactfinder.planni...
1,Bronx,1418207,1472654,1424948,NYC 2019- https://www1.nyc.gov/assets/planning...
2,Staten Island,476143,495747,493494,NYC 2021= http://www.citypopulation.de/en/usa/...
3,Queens,2253858,2405464,2331143,Yonkers 19-21- https://www.populationu.com/cit...
4,Yonkers,200370,211569,212521,


In [6]:
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.10,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
923154,for_sale,445000.0,1.0,2.0,0.99,"1008 King St, Chappaqua, NY, 10514",1008 King St,Chappaqua,New York,10514.0,1052.0,2011-05-09
923155,for_sale,418000.0,4.0,2.0,0.40,"3 Elmwood Dr, Monroe, NY, 10950",3 Elmwood Dr,Monroe,New York,10950.0,1650.0,2015-07-21
923156,for_sale,469000.0,4.0,2.0,0.18,"13 N Conger Ave, Congers, NY, 10920",13 N Conger Ave,Congers,New York,10920.0,2123.0,
923157,for_sale,825000.0,5.0,5.0,0.79,"7 Miller Rd, Valley Cottage, NY, 10989",7 Miller Rd,Valley Cottage,New York,10989.0,3775.0,2010-06-02


## Data Preprocessing

#### Urban Population Data

In [7]:
# drop excess columns
urban_population_df = urban_population_df.drop(urban_population_df.columns[[4]], axis=1)

In [8]:
# drop rows with null values & duplicates
urban_population_df = urban_population_df.drop_duplicates(subset=None, keep='first', inplace= False)

In [9]:
# keep only cities: Tuckahoe, Nyack, Suffern, Stony Point, and Scarsdale
urban_population_df = urban_population_df.loc[
                                (urban_population_df["city"] == "Tuckahoe") |
                                (urban_population_df["city"] == "Nyack") |
                                (urban_population_df["city"] == "Suffern") |
                                (urban_population_df["city"] == "Stony Point") |
                                (urban_population_df["city"] == "Scarsdale")]

urban_population_df

Unnamed: 0,city,population_2019,population_2020,population_2021,Unnamed: 5,Unnamed: 6
91,Nyack,7156,7247,7236,,
123,Tuckahoe,6549,7084,6974,,
153,Suffern,11007,11441,11402,,
197,Stony Point,12586,14813,14768,,
212,Scarsdale,17871,18253,17939,,


#### City Population Data

In [10]:
# drop excess columns
city_population_df = city_population_df.drop(city_population_df.columns[[4]], axis=1)

In [11]:
# drop rows with null values
city_population_df = city_population_df.dropna()

In [12]:
city_population_df

Unnamed: 0,city,population_2019,population_2020,population_2021
0,Brooklyn,2559903,2736074,2641052
1,Bronx,1418207,1472654,1424948
2,Staten Island,476143,495747,493494
3,Queens,2253858,2405464,2331143
4,Yonkers,200370,211569,212521


#### Housing Data

In [13]:
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.10,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
923154,for_sale,445000.0,1.0,2.0,0.99,"1008 King St, Chappaqua, NY, 10514",1008 King St,Chappaqua,New York,10514.0,1052.0,2011-05-09
923155,for_sale,418000.0,4.0,2.0,0.40,"3 Elmwood Dr, Monroe, NY, 10950",3 Elmwood Dr,Monroe,New York,10950.0,1650.0,2015-07-21
923156,for_sale,469000.0,4.0,2.0,0.18,"13 N Conger Ave, Congers, NY, 10920",13 N Conger Ave,Congers,New York,10920.0,2123.0,
923157,for_sale,825000.0,5.0,5.0,0.79,"7 Miller Rd, Valley Cottage, NY, 10989",7 Miller Rd,Valley Cottage,New York,10989.0,3775.0,2010-06-02


In [14]:
# drop unnecessary columns
housing_df = housing_df.drop(housing_df.columns[[5,6,]], axis=1)

In [15]:
# keep only the state of New York
housing_df = housing_df.loc[
                        (housing_df["state"] == "New York")]

In [16]:
# keep only dates jan 2019 - dec 2021
housing_df = housing_df[
                    (housing_df["sold_date"] >= "2019-01-01") & 
                    (housing_df["sold_date"] <= "2021-12-31")]
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,sold_date
54248,for_sale,425000.0,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,2021-11-24
54533,for_sale,435000.0,3.0,2.0,5.01,East Chatham,New York,12060.0,2504.0,2020-11-09
54551,for_sale,425000.0,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,2021-11-24
54952,for_sale,625000.0,,,19.70,Hillsdale,New York,12529.0,,2021-01-20
55599,for_sale,625000.0,,,19.70,Hillsdale,New York,12529.0,,2021-01-20
...,...,...,...,...,...,...,...,...,...,...
923109,for_sale,799000.0,5.0,4.0,2.32,Tomkins Cove,New York,10986.0,3280.0,2021-04-15
923110,for_sale,1200000.0,4.0,3.0,5.20,Stony Point,New York,10980.0,2800.0,2021-01-08
923142,for_sale,799000.0,5.0,3.0,0.11,West Harrison,New York,10604.0,2420.0,2019-08-12
923148,for_sale,519000.0,4.0,3.0,0.54,Cornwall,New York,12518.0,2612.0,2020-01-13


In [17]:
# drop rows with null values
housing_df = housing_df.dropna()
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,sold_date
54248,for_sale,425000.0,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,2021-11-24
54533,for_sale,435000.0,3.0,2.0,5.01,East Chatham,New York,12060.0,2504.0,2020-11-09
54551,for_sale,425000.0,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,2021-11-24
56030,for_sale,339000.0,4.0,5.0,0.51,New Lebanon,New York,12125.0,3133.0,2019-09-11
56088,for_sale,625000.0,4.0,2.0,5.20,Copake Falls,New York,12517.0,2029.0,2019-10-23
...,...,...,...,...,...,...,...,...,...,...
923109,for_sale,799000.0,5.0,4.0,2.32,Tomkins Cove,New York,10986.0,3280.0,2021-04-15
923110,for_sale,1200000.0,4.0,3.0,5.20,Stony Point,New York,10980.0,2800.0,2021-01-08
923142,for_sale,799000.0,5.0,3.0,0.11,West Harrison,New York,10604.0,2420.0,2019-08-12
923148,for_sale,519000.0,4.0,3.0,0.54,Cornwall,New York,12518.0,2612.0,2020-01-13


In [18]:
# drop extra cities for sql join
housing_join_df = housing_df.loc[
                        # low population
                        (housing_df["city"] == "Nyack") |
                        (housing_df["city"] == "Tuckahoe") |
                        (housing_df["city"] == "Suffern") |
                        (housing_df["city"] == "Stony Point") |
                        (housing_df["city"] == "Scarsdale") |
                        # high population
                        (housing_df["city"] == "Brooklyn") |
                        (housing_df["city"] == "Bronx") |
                        (housing_df["city"] == "Staten Island") |
                        (housing_df["city"] == "Queens") |
                        (housing_df["city"] == "Yonkers")]

In [19]:
housing_df.dtypes

status         object
price         float64
bed           float64
bath          float64
acre_lot      float64
city           object
state          object
zip_code      float64
house_size    float64
sold_date      object
dtype: object

## Export Data to PostgreSQL Database

In [20]:
# export dataframes to perform sql join to add populations to housing dataframes
urban_population_df.to_sql(name="urban_populations", con=engine)
city_population_df.to_sql(name="city_populations", con=engine)
housing_join_df.to_sql(name="housing", con=engine)
housing_df.to_sql(name="housing_without_join", con=engine)