# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [1]:
# All import statements needed for the project, for example:

import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db
import numpy as np
from shapely import wkt
import psycopg2

from sqlalchemy.orm import declarative_base

In [2]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "nyc_zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"


# NOTE(review): this app token is a credential committed in plain text; consider
# loading it from an environment variable or an untracked config file instead.
NYC_DATA_APP_TOKEN = "ViHT9fZpI3IQw23Bqm2tTgCVE"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
# Dataset identifiers (with the GeoJSON extension) appended to the "resource/" path
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "Final_project"
DB_USER = "postgres"
# SQLAlchemy's dialect name is "postgresql"; the legacy "postgres" alias is not
# accepted by SQLAlchemy 1.4+ and makes create_engine() fail.
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")


# Request URL for 311 complaints: only the creation date and incident zip code,
# restricted to the 2022-10-01 .. 2023-10-01 window.
# (The variable keeps its historical name "endpoint_331" because other cells
# reference it; it refers to the 311 dataset.)
endpoint_331 = (
    f"{BASE_NYC_DATA_URL}resource/{NYC_DATA_311}?"
    "$select=created_date,incident_zip&"
    "$where=created_date between '2022-10-01' and '2023-10-01'&"
    "$limit=3155000&"
    f"$$app_token={NYC_DATA_APP_TOKEN}"
)

# Request URL for trees whose zip_city is 'New York': id, point geometry,
# zip code and zip city.
endpoint_trees = (
    f"{BASE_NYC_DATA_URL}resource/{NYC_DATA_TREES}?"
    "$select=tree_id,the_geom,zipcode,zip_city&"
    "$where=zip_city='New York'&"
    "$limit=65000&"
    f"$$app_token={NYC_DATA_APP_TOKEN}"
)

## Part 1: Data Preprocessing

In [3]:
def load_and_clean_zipcodes():
    """Load the zip code shapefile and keep only the 'New York' rows.

    Reads ``ZIPCODE_DATA_FILE`` with geopandas, keeps the rows whose
    ``PO_NAME`` equals ``'New York'`` and drops every column except the zip
    code and its geometry.

    Returns:
        A GeoDataFrame with the columns ``ZIPCODE`` and ``geometry``.
    """
    zipcodes_df = gpd.read_file(ZIPCODE_DATA_FILE)
    zipcodes_df = zipcodes_df[zipcodes_df["PO_NAME"] == "New York"]
    return zipcodes_df[["ZIPCODE", "geometry"]]

In [4]:
def download_and_clean_311_data():
    """Download the 311 complaints selected by ``endpoint_331``.

    Returns:
        The ``created_date`` and ``incident_zip`` columns of the parsed
        GeoJSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(endpoint_331)
    # Fail loudly on an HTTP error instead of handing an error body to the
    # GeoJSON parser, which would surface as an unrelated parsing failure.
    response.raise_for_status()
    data_311 = gpd.read_file(response.text)
    return data_311[["created_date", "incident_zip"]]

In [5]:
def download_and_clean_tree_data():
    """Download the tree records selected by ``endpoint_trees``.

    Returns:
        The parsed GeoJSON response as a GeoDataFrame. ``reset_index()`` is
        applied, so the previous index is kept as an extra ``index`` column.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(endpoint_trees)
    # Fail loudly on an HTTP error instead of handing an error body to the
    # GeoJSON parser, which would surface as an unrelated parsing failure.
    response.raise_for_status()
    data_tree = gpd.read_file(response.text).reset_index()
    return data_tree

In [24]:
def load_and_clean_zillow_data(data_file=None):
    """Compute the average Zillow rent per zip code for New York.

    The CSV is expected to have a ``RegionName`` column (the zip code), a
    ``City`` column and one column per month whose name is an ISO date
    (``YYYY-MM-DD``). Only the months from 2022-10-01 through 2023-09-30 are
    averaged; missing or non-numeric cells are ignored.

    Args:
        data_file: Path of the Zillow CSV. Defaults to ``ZILLOW_DATA_FILE``.

    Returns:
        A DataFrame with the columns ``Zip_code`` (string) and
        ``AverageRent`` (float), one row per zip code whose ``City`` is
        ``'New York'``.
    """
    if data_file is None:
        data_file = ZILLOW_DATA_FILE

    # The file is a plain CSV without geometry, so pandas is the right reader.
    # Zip codes are read as text so they stay identifiers rather than numbers.
    zillow_df = pd.read_csv(data_file, dtype={"RegionName": str})

    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the original (no SettingWithCopyWarning).
    zillow_df = zillow_df[zillow_df["City"] == "New York"].copy()

    # ISO dates sort lexicographically, so a string comparison selects the
    # monthly columns in the window; non-date column names fall outside it.
    date_columns = [
        col for col in zillow_df.columns if "2022-10-01" <= col <= "2023-09-30"
    ]

    monthly_rents = zillow_df[date_columns].apply(pd.to_numeric, errors="coerce")
    zillow_df["AverageRent"] = monthly_rents.mean(axis=1, skipna=True)

    zillow_df = zillow_df.rename(columns={"RegionName": "Zip_code"})
    return zillow_df[["Zip_code", "AverageRent"]]

In [7]:
def load_all_data():
    """Run the four loaders in order and return their results together.

    Returns:
        A 4-tuple ``(zip codes, 311 complaints, trees, Zillow rents)``.
    """
    zipcodes = load_and_clean_zipcodes()
    complaints = download_and_clean_311_data()
    trees = download_and_clean_tree_data()
    rents = load_and_clean_zillow_data()
    return zipcodes, complaints, trees, rents

In [8]:
# Run every loader once and unpack the results into notebook-level names, in the
# order returned by load_all_data(): zip codes, 311 complaints, trees, rents.
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [9]:
# Show basic info (index, columns, dtypes) about the zip code dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 110 entries, 21 to 262
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   ZIPCODE   110 non-null    object  
 1   geometry  110 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 2.6+ KB


In [10]:
# Show the first 5 rows of the zip code dataframe
geodf_zipcode_data.head()

Unnamed: 0,ZIPCODE,geometry
21,10034,"POLYGON ((1006203.169 257345.657, 1006214.666 ..."
22,10033,"POLYGON ((1003020.865 256049.162, 1003096.046 ..."
24,10040,"POLYGON ((1002564.135 253724.773, 1002596.698 ..."
32,10032,"POLYGON ((998935.853 249154.260, 998945.457 24..."
36,10031,"POLYGON ((1000830.039 241801.590, 1000913.103 ..."


In [11]:
# Show basic info (index, columns, dtypes) about the 311 complaints dataframe
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3144567 entries, 0 to 3144566
Data columns (total 3 columns):
 #   Column        Dtype         
---  ------        -----         
 0   created_date  datetime64[ns]
 1   incident_zip  object        
 2   geometry      geometry      
dtypes: datetime64[ns](1), geometry(1), object(1)
memory usage: 72.0+ MB


In [12]:
# Show the first 5 rows of the 311 complaints dataframe
geodf_311_data.head()

Unnamed: 0,created_date,incident_zip,geometry
0,2023-09-30 23:59:58,11226,
1,2023-09-30 23:59:38,11361,
2,2023-09-30 23:59:35,10002,
3,2023-09-30 23:59:34,11435,
4,2023-09-30 23:59:28,11226,


In [13]:
# Show basic info (index, columns, dtypes) about the tree dataframe
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 64488 entries, 0 to 64487
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   index     64488 non-null  int64   
 1   tree_id   64488 non-null  object  
 2   zipcode   64488 non-null  object  
 3   zip_city  64488 non-null  object  
 4   geometry  64488 non-null  geometry
dtypes: geometry(1), int64(1), object(3)
memory usage: 2.5+ MB


In [14]:
# Show the first 5 rows of the tree dataframe
geodf_tree_data.head()

Unnamed: 0,index,tree_id,zipcode,zip_city,geometry
0,0,190422,10023,New York,POINT (-73.98495 40.77005)
1,1,190426,10023,New York,POINT (-73.98534 40.77021)
2,2,208649,10019,New York,POINT (-73.98730 40.76272)
3,3,193310,10021,New York,POINT (-73.96046 40.77217)
4,4,199760,10024,New York,POINT (-73.98096 40.78209)


In [15]:
# Show basic info (index, columns, dtypes) about the Zillow rent dataframe
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 4 to 6721
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Zip_code     145 non-null    object 
 1   AverageRent  145 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.4+ KB


In [16]:
# Show the first 5 rows of the Zillow rent dataframe
df_zillow_data.head()

Unnamed: 0,Zip_code,AverageRent
4,11385,2970.201093
6,11208,2635.90991
12,11236,2323.980013
13,10467,2214.270476
14,11373,2242.765254


## Part 2: Storing Data

In [17]:
#!createdb Final_project

In [18]:
#!psql --dbname Final_project -c 'CREATE EXTENSION postgis;'

In [19]:
# Connect to PostgreSQL with psycopg2 (database DB_NAME, user DB_USER) and open
# a cursor on that connection
conn = psycopg2.connect(f"dbname={DB_NAME} user={DB_USER}")
cur = conn.cursor()

### Creating Tables


These are just a couple of options for creating your tables; you can use one or the other, a different method, or a combination.

In [20]:
#engine = db.create_engine(DB_URL)

#### Option 1: SQL

In [21]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
# Every statement ends with ";" so the statements stay valid when they are
# concatenated into a single schema file.
ZIPCODE_SCHEMA = """
    CREATE TABLE IF NOT EXISTS zip_codes (
        id INTEGER PRIMARY KEY,
        zipcode VARCHAR,
        geometry GEOMETRY
    );
    """

# The complaint's zip code is text (like the zipcode column of the other
# tables), not a geometry.
NYC_311_SCHEMA = """
    CREATE TABLE IF NOT EXISTS nyc_311 (
        id INTEGER PRIMARY KEY,
        created_date DATE,
        zipcode VARCHAR
    );
    """

NYC_TREE_SCHEMA = """
    CREATE TABLE IF NOT EXISTS trees (
        tree_id INTEGER PRIMARY KEY,
        zipcode VARCHAR,
        geometry GEOMETRY
    );
    """

ZILLOW_SCHEMA = """
    CREATE TABLE IF NOT EXISTS rent (
        id INTEGER PRIMARY KEY,
        zipcode VARCHAR,
        average_rent NUMERIC
    );
    """

In [22]:
# create that required schema.sql file
# Each statement is written with exactly one terminating ";" followed by a blank
# line, so the file holds four separate statements rather than one run-on text.
with open(DB_SCHEMA_FILE, "w", encoding="utf-8") as f:
    for statement in (ZIPCODE_SCHEMA, NYC_311_SCHEMA, NYC_TREE_SCHEMA, ZILLOW_SCHEMA):
        f.write(statement.strip().rstrip(";") + ";\n\n")

In [23]:
# When using raw SQL (rather than SQLAlchemy), run each CREATE TABLE statement
# through the cursor, then commit the transaction on the connection.
with cur:
    for ddl in (ZIPCODE_SCHEMA, NYC_311_SCHEMA, NYC_TREE_SCHEMA, ZILLOW_SCHEMA):
        cur.execute(ddl)

conn.commit()

#### Option 2: SQLAlchemy

In [None]:
Base = declarative_base()


class Tree(Base):
    """ORM mapping for the ``trees`` table.

    The columns mirror the raw-SQL definition in ``NYC_TREE_SCHEMA``. A mapped
    class needs at least a primary-key column, otherwise SQLAlchemy raises
    while the class is being defined.
    """

    __tablename__ = "trees"

    # tree identifier, primary key
    tree_id = db.Column(db.Integer, primary_key=True)
    # zip code stored as text
    zipcode = db.Column(db.String)
    # PostGIS geometry of the tree
    geometry = db.Column(gdb.Geometry("GEOMETRY"))


In [None]:
# Create the tables of all models registered on Base.
# NOTE(review): `engine` is not created by any active code visible in this
# notebook (the `db.create_engine(DB_URL)` line in the earlier cell is commented
# out), so this raises NameError until an engine is defined.
Base.metadata.create_all(engine)

### Add Data to Database

These are just a couple of options for writing data to your tables; you can use one or the other, a different method, or a combination.

#### Option 1: SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    """Write each dataframe to the table named by its key (not implemented yet).

    Args:
        tablename_to_dataframe: Mapping of table name to the dataframe whose
            rows should be stored in that table.

    Raises:
        NotImplementedError: always, until the function is written.
    """
    # write INSERT statements or use pandas/geopandas to write SQL
    # `NotImplemented` is a comparison sentinel, not an exception; calling or
    # raising it is a TypeError. `NotImplementedError` is the exception to use.
    raise NotImplementedError()

In [None]:
# Keys are the table names created by the CREATE TABLE statements in this
# notebook (zip_codes, nyc_311, trees, rent), so each dataframe is written to a
# table that actually exists.
tablename_to_dataframe = {
    "zip_codes": geodf_zipcode_data,
    "nyc_311": geodf_311_data,
    "trees": geodf_tree_data,
    "rent": df_zillow_data,
}

In [None]:
# Store every dataframe in its table using the name -> dataframe mapping
write_dataframes_to_table(tablename_to_dataframe)

#### Option 2: SQLAlchemy

In [None]:
# Build a session factory bound to `engine`, then open one session from it.
# NOTE(review): requires `engine` to exist; no active cell visible here creates it.
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
# iterrows() yields (index, row) pairs, so the pair is unpacked and only the row
# is used. The keyword names follow the columns of the `trees` table
# (tree_id, zipcode, geometry); the Tree model must map those attributes.
for _, row in geodf_tree_data.iterrows():
    tree = Tree(
        tree_id=int(row["tree_id"]),  # tree_id is downloaded as text
        zipcode=row["zipcode"],
        geometry=row["geometry"].wkt,  # geometry passed as WKT text
    )
    session.add(tree)

In [None]:
# Commit the session so the objects added to it are persisted
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Save the text of a SQL query to ``outfile``.

    Args:
        query: The query text to write.
        outfile: Destination path (``str`` or ``pathlib.Path``). Missing parent
            directories are created; an existing file is overwritten.
    """
    destination = pathlib.Path(outfile)
    # The query directory may not exist yet on a fresh checkout.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(query, encoding="utf-8")

In [None]:
# Destination file for query 1 inside QUERY_DIR - placeholder name, to be replaced
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

# SQL text of query 1 - placeholder, to be replaced with the actual statement
QUERY_1 = """
FILL_ME_IN
"""

In [None]:
# Run query 1 through SQLAlchemy and print every returned row.
# The connection is bound to `connection` rather than `conn`, so the
# notebook-level psycopg2 connection named `conn` is not overwritten.
with engine.connect() as connection:
    result = connection.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
# Save the text of query 1 to its file
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Template for a plotting function: draw placeholder values and show them.

    Args:
        dataframe: Source data for the plot; the template does not read it yet.
    """
    _fig, ax = plt.subplots(figsize=(20, 10))

    # placeholder - derive the values to plot from ``dataframe`` here
    values = "..."

    # matplotlib is one option; plot types other than ``plot`` are available
    ax.plot(values, "...")
    # axis labels and further styling can be configured on ``ax`` as well
    ax.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_1():
    """Fetch the data needed for visualization 1 (not implemented yet).

    Intended to query the database; the result may be placed in a
    pandas/geopandas dataframe.

    Raises:
        NotImplementedError: always, until the function is written.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for visualization 1, then plot it
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)