In [184]:
# import dependencies
from urllib.parse import quote

import pandas as pd

# Database dependencies
from sqlalchemy import create_engine
from config import username, db_password

In [185]:
# Create the database engine.
# The credentials come from the local config module and may contain characters
# that are reserved inside a URL ("@", ":", "/", "%", ...). Each one is
# percent-encoded before being embedded so the connection URL still parses
# into the intended user, password, host and database. str() keeps non-string
# credentials (e.g. a numeric password) working as they did with plain
# f-string interpolation.
db_string = (
    f"postgresql://{quote(str(username), safe='')}:{quote(str(db_password), safe='')}"
    "@127.0.0.1:5432/FinalProject"
)
engine = create_engine(db_string)

In [186]:
# load the three raw CSV inputs (urban populations, city populations, housing
# listings) from the local resources/ folder, in that order
urban_population_df, city_population_df, housing_df = map(
    pd.read_csv,
    (
        "resources/urban_populations.csv",
        "resources/city_populations.csv",
        "resources/realtor-data.csv",
    ),
)

In [187]:
# first five rows of the urban population data as loaded
urban_population_df.head(n=5)

Unnamed: 0,city,population_2019,population_2020,population_2021,Sources
0,Mount Vernon,67345,73893,72584,2019: https://www.city-data.com/city/Mount-Ver...
1,Mount Vernon,67345,73893,72584,2020: https://www.census.gov/quickfacts/fact/t...
2,Mount Vernon,67345,73893,72584,2021: https://censusreporter.org/profiles/1600...
3,New Rochelle,78557,79726,81590,2019: https://www.city-data.com/city/New-Roche...
4,New Rochelle,78557,79726,81590,2020: https://www.census.gov/quickfacts/fact/t...


In [188]:
# first five rows of the city population data as loaded
city_population_df.head(n=5)

Unnamed: 0,city,population_2019,population_2020,population_2021,Sources
0,Brooklyn,2559903,2736074,2641052,NYC 2020 census- https://popfactfinder.planni...
1,Bronx,1418207,1472654,1424948,NYC 2019- https://www1.nyc.gov/assets/planning...
2,Staten Island,476143,495747,493494,NYC 2021= http://www.citypopulation.de/en/usa/...
3,Queens,2253858,2405464,2331143,
4,Manhattan,1628706,1694251,1576876,


In [189]:
# show the housing listings exactly as read from the CSV (no cleaning applied yet)
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.10,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
923154,for_sale,445000.0,1.0,2.0,0.99,"1008 King St, Chappaqua, NY, 10514",1008 King St,Chappaqua,New York,10514.0,1052.0,2011-05-09
923155,for_sale,418000.0,4.0,2.0,0.40,"3 Elmwood Dr, Monroe, NY, 10950",3 Elmwood Dr,Monroe,New York,10950.0,1650.0,2015-07-21
923156,for_sale,469000.0,4.0,2.0,0.18,"13 N Conger Ave, Congers, NY, 10920",13 N Conger Ave,Congers,New York,10920.0,2123.0,
923157,for_sale,825000.0,5.0,5.0,0.79,"7 Miller Rd, Valley Cottage, NY, 10989",7 Miller Rd,Valley Cottage,New York,10989.0,3775.0,2010-06-02


## Data Preprocessing

#### Urban Population Data

In [190]:
# drop excess columns: only the "Sources" reference column is removed.
# It is dropped by name rather than by position so that a change in the CSV
# column order cannot make this cell silently remove a different column.
urban_population_df = urban_population_df.drop(columns=["Sources"])

In [191]:
# drop duplicate rows: the raw file repeats each city once per source link, so
# the rows become identical when the Sources column is absent; only the first
# occurrence of each row is kept (the original index labels are preserved)
urban_population_df = urban_population_df.drop_duplicates()

In [192]:
# inspect the urban population frame after de-duplication
urban_population_df

Unnamed: 0,city,population_2019,population_2020,population_2021
0,Mount Vernon,67345,73893,72584
3,New Rochelle,78557,79726,81590
6,Nyack,7156,7247,7236
9,Tuckahoe,6549,7084,6974
12,Suffern,11007,11441,11402
15,Warwick,6775,6652,6672
18,Stony Point,12586,14813,14768
21,Scarsdale,17871,18253,17939


#### City Population Data

In [193]:
# drop excess columns: only the "Sources" reference column is removed.
# It is dropped by name rather than by position so that a change in the CSV
# column order cannot make this cell silently remove a different column.
city_population_df = city_population_df.drop(columns=["Sources"])

In [194]:
# remove every row that still has a missing value in any column
city_population_df = city_population_df.dropna(axis=0, how="any")

In [195]:
# inspect the city population frame after the null-row drop
city_population_df

Unnamed: 0,city,population_2019,population_2020,population_2021
0,Brooklyn,2559903,2736074,2641052
1,Bronx,1418207,1472654,1424948
2,Staten Island,476143,495747,493494
3,Queens,2253858,2405464,2331143
4,Manhattan,1628706,1694251,1576876


#### Housing Data

In [196]:
# housing listings before any preprocessing (still the frame read from the CSV)
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.10,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
923154,for_sale,445000.0,1.0,2.0,0.99,"1008 King St, Chappaqua, NY, 10514",1008 King St,Chappaqua,New York,10514.0,1052.0,2011-05-09
923155,for_sale,418000.0,4.0,2.0,0.40,"3 Elmwood Dr, Monroe, NY, 10950",3 Elmwood Dr,Monroe,New York,10950.0,1650.0,2015-07-21
923156,for_sale,469000.0,4.0,2.0,0.18,"13 N Conger Ave, Congers, NY, 10920",13 N Conger Ave,Congers,New York,10920.0,2123.0,
923157,for_sale,825000.0,5.0,5.0,0.79,"7 Miller Rd, Valley Cottage, NY, 10989",7 Miller Rd,Valley Cottage,New York,10989.0,3775.0,2010-06-02


In [197]:
# drop unnecessary columns: the free-text address fields are removed, while
# city, state and zip_code stay available as separate columns.
# The columns are dropped by name rather than by position so that a change in
# the CSV column order cannot make this cell silently remove other columns.
housing_df = housing_df.drop(columns=["full_address", "street"])

In [198]:
# keep only the listings located in New York state
housing_df = housing_df[housing_df["state"].eq("New York")]

In [199]:
# keep only listings in the cities that appear in the two population tables
# (eight urban-population cities followed by the five NYC boroughs)
housing_df = housing_df.loc[
    housing_df["city"].isin(
        [
            "Mount Vernon",
            "New Rochelle",
            "Nyack",
            "Tuckahoe",
            "Suffern",
            "Warwick",
            "Stony Point",
            "Scarsdale",
            "Brooklyn",
            "Bronx",
            "Staten Island",
            "Queens",
            "Manhattan",
        ]
    )
]
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,sold_date
465079,for_sale,1575000.0,3.0,2.0,20.00,Brooklyn,New York,11225.0,2400.0,1987-11-27
465080,for_sale,899000.0,,,0.07,Bronx,New York,10466.0,1880.0,1985-05-29
465081,for_sale,890000.0,7.0,3.0,0.10,Bronx,New York,10468.0,2824.0,1996-03-08
465090,for_sale,344900.0,2.0,1.0,,Bronx,New York,10463.0,1000.0,2006-11-22
465095,for_sale,1500000.0,8.0,3.0,0.04,Brooklyn,New York,11221.0,2700.0,2010-02-22
...,...,...,...,...,...,...,...,...,...,...
923036,for_sale,579000.0,3.0,3.0,0.05,Suffern,New York,10901.0,2164.0,2007-03-30
923061,for_sale,3475000.0,6.0,8.0,0.78,Scarsdale,New York,10583.0,8137.0,2012-03-27
923084,for_sale,549000.0,5.0,2.0,0.09,Nyack,New York,10960.0,2870.0,
923110,for_sale,1200000.0,4.0,3.0,5.20,Stony Point,New York,10980.0,2800.0,2021-01-08


In [200]:
# keep only listings whose sold_date lies between 1 Jan 2019 and 31 Dec 2021,
# both ends included. sold_date is compared as text; the values shown above
# are "YYYY-MM-DD" strings, for which text order matches date order. Rows
# without a sold_date do not satisfy the comparison and are removed.
housing_df = housing_df[housing_df["sold_date"].between("2019-01-01", "2021-12-31")]
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,sold_date
465601,for_sale,785000.0,1.0,1.0,,Manhattan,New York,10022.0,,2020-02-19
465708,for_sale,959000.0,8.0,4.0,0.07,Bronx,New York,10466.0,,2019-01-09
465785,for_sale,2125000.0,2.0,3.0,,Brooklyn,New York,11238.0,2150.0,2019-10-09
465849,for_sale,899000.0,2.0,1.0,,Manhattan,New York,10022.0,,2019-07-17
465906,for_sale,785000.0,3.0,2.0,,Queens,New York,11385.0,1400.0,2021-09-01
...,...,...,...,...,...,...,...,...,...,...
922663,for_sale,750000.0,4.0,3.0,0.05,Tuckahoe,New York,10707.0,2065.0,2019-07-05
922810,for_sale,799000.0,5.0,4.0,0.63,Suffern,New York,10901.0,3750.0,2021-05-26
922964,for_sale,3700000.0,6.0,8.0,0.67,Scarsdale,New York,10583.0,8048.0,2020-08-26
922968,for_sale,799000.0,4.0,2.0,0.19,Tuckahoe,New York,10707.0,2224.0,2019-11-21


In [201]:
# remove every listing that has a missing value in any remaining column
housing_df = housing_df.dropna(axis=0, how="any")
housing_df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size,sold_date
466030,for_sale,979000.0,8.0,5.0,0.04,Bronx,New York,10467.0,3297.0,2019-01-24
466364,for_sale,937500.0,4.0,3.0,0.06,Bronx,New York,10472.0,2585.0,2019-08-26
466770,for_sale,879000.0,7.0,4.0,0.05,Bronx,New York,10458.0,3374.0,2020-10-23
467315,for_sale,589000.0,3.0,2.0,0.05,Bronx,New York,10469.0,1664.0,2021-02-01
467471,for_sale,800000.0,4.0,2.0,0.05,Brooklyn,New York,11207.0,2880.0,2019-12-23
...,...,...,...,...,...,...,...,...,...,...
922663,for_sale,750000.0,4.0,3.0,0.05,Tuckahoe,New York,10707.0,2065.0,2019-07-05
922810,for_sale,799000.0,5.0,4.0,0.63,Suffern,New York,10901.0,3750.0,2021-05-26
922964,for_sale,3700000.0,6.0,8.0,0.67,Scarsdale,New York,10583.0,8048.0,2020-08-26
922968,for_sale,799000.0,4.0,2.0,0.19,Tuckahoe,New York,10707.0,2224.0,2019-11-21


In [202]:
# check the column types of the cleaned housing frame before exporting it
housing_df.dtypes

status         object
price         float64
bed           float64
bath          float64
acre_lot      float64
city           object
state          object
zip_code      float64
house_size    float64
sold_date      object
dtype: object

## Export Data to PostgreSQL Database

In [204]:
# Export the dataframes so a SQL join can add populations to the housing data.
# if_exists="replace" makes this cell safe to re-run: with pandas' default
# ("fail") to_sql raises ValueError as soon as the table already exists, so a
# second run of the notebook would stop here. Replacing drops and recreates
# each table with the freshly cleaned rows.
# The DataFrame index is still written as a column (to_sql default).
urban_population_df.to_sql(name="urban_populations", con=engine, if_exists="replace")
city_population_df.to_sql(name="city_populations", con=engine, if_exists="replace")
housing_df.to_sql(name="housing", con=engine, if_exists="replace")