# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [32]:
# All import statements needed for the project, for example:

import json
import pathlib
from requests.exceptions import ReadTimeout
import time
import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db
from sodapy import Socrata
import glob
import psycopg2
from sqlalchemy import create_engine,text

from sqlalchemy.orm import declarative_base

In [2]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "xX3rCbSDM4vF0QEfgh09b2ZWW"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "IEOR4501-XL"

DB_USER = "postgres"
#DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_URL = f"postgresql://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

QUERY_DIR = pathlib.Path("queries")

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In [4]:
#downloading data from small chunks
def download_nyc_csv_data(year,starttime,endtime,url,filename):
    filepath = f'{DATA_DIR}/{filename}_{year}.csv'
    query=f"""
    select * 
    where created_date between {starttime} 
    and {endtime}
    """

    if not filename:
        print(f"Downloading {url} to {filename}...")
        client = Socrata( "data.cityofnewyork.us",
                  "xX3rCbSDM4vF0QEfgh09b2ZWW",
                  username="yirong263@gmail.com",
                  password="UTDYnmz*zn2u3g6",
                  timeout=60)
        max_retries = 5
        retry_wait = 10  # Initial wait time in seconds

        while max_retries > 0:
            try:    
            # Set initial parameters for the SoQL query
                limit = 1000000  # Example limit
                offset = 0  # Start at the beginning
                total_records = 100000000  # Example total number of records you wish to download
                current_record = 0
                while current_record < total_records:
                    # Adjust the query to include the limit and offset
                    results = client.get(f"{url}",query= query+ f" limit {limit} offset {offset}")
                    
                    # Convert to DataFrame and save to CSV
                    df = pd.DataFrame.from_records(results)
                    df.to_csv(f'{filepath}', index=False)
                    
                    # Update the offset and current_record count
                    offset += limit
                    current_record += len(results)

                    # Optional: Print progress
                    print(f'Downloaded {current_record} of {total_records}')
                break
            
            except ReadTimeout:
                # Wait before retrying
                time.sleep(retry_wait)
                # Reduce the number of retries left
                max_retries -= 1
                # Increase the wait time for the next retry
                retry_wait *= 2
        
        print(f"Done downloading {url} from {year}.")

    else:
        print(f"Reading from {filepath}...")


    return filepath

In [5]:
def load_and_clean_zipcodes(zipcode_datafile):
    """
    Load and clean NYC zipcode data from a shapefile.
    Args:
    zipcode_datafile (str): The file path to the shapefile.
    
    Returns:
    GeoDataFrame: Cleaned geospatial data frame of NYC zipcodes.
    """
    # Load the shapefile using GeoPandas
    gdf = gpd.read_file(zipcode_datafile)
    
    # Remove unnecessary columns from the dataframe
    columns_to_drop = ['BLDGZIP', 'STATE', 'ST_FIPS', 'CTY_FIPS', 'URL', 'SHAPE_AREA', 'SHAPE_LEN']
    gdf_cleaned = gdf.drop(columns=columns_to_drop)
    gdf.drop_duplicates(subset='ZIPCODE', keep='first', inplace=True)
    gdf.dropna(inplace=True)
    gdf.drop_duplicates(inplace=True)
    # Rename columns for clarity
    gdf_cleaned = gdf_cleaned.rename(columns={'PO_NAME': 'City'})
    # Set the coordinate reference system to EPSG 4326
    gdf_cleaned = gdf_cleaned.to_crs(epsg=4326)

    return gdf_cleaned


In [6]:
geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)

In [7]:
def download_and_clean_311_data():
    #data downloading
    download_nyc_csv_data(2015,"2015-01-01T00:00:00.000","2015-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2016,"2016-01-01T00:00:00.000","2016-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2017,"2017-01-01T00:00:00.000","2017-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2018,"2018-01-01T00:00:00.000","2018-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2019,"2019-01-01T00:00:00.000","2019-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2020,"2020-01-01T00:00:00.000","2020-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2021,"2021-01-01T00:00:00.000","2021-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2022,"2022-01-01T00:00:00.000","2022-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2023,"2023-01-01T00:00:00.000","2015-09-30T23:59:59.999","erm2-nwe9",'nyc_311_data')

    # After downloading all chunks
    csv_files = glob.glob('data/nyc_311_data_*.csv')
    # Remove unnecessary columns by keeping only the ones you need for each file
    dfs=[]
    for file in csv_files:
        df=pd.read_csv(file)
        columns_needed = ['unique_key', 'created_date', 'complaint_type','incident_zip','latitude', 'longitude']  # Replace with actual column names
        df = df[columns_needed]
        #eliminate duplicate
        df.drop_duplicates(inplace=True)
        # Remove invalid data points
        # This is highly dependent on the context of your data, but as an example:
        df.dropna(inplace=True) 
        # Normalize column names
        df.columns = [column_name.lower().replace(' ', '_') for column_name in df.columns]
        dfs.append(df) # processed df and append to a list

    # Concatenate all DataFrames into one
    NYC311_df = pd.concat(dfs,ignore_index=True)
    NYC311_df.drop_duplicates(inplace=True)# header duplicate elimination
    NYC311_df = NYC311_df.drop_duplicates(subset=['unique_key'])
    # Normalize Column Types

    # unique_key 
    NYC311_df['unique_key'] = NYC311_df['unique_key'].astype(int)
    # change name into 'id_NYC311'
    NYC311_df.rename(columns={'unique_key': 'id_NYC311'}, inplace=True)

    #incident zip
    #rename from incident_zip to zipcode
    NYC311_df.rename(columns={'incident_zip': 'zipcode'}, inplace=True)
    # Convert the 'zipcode' column to a string type, then filter
    NYC311_df['zipcode']=NYC311_df['zipcode'].astype(float).astype(int)
    NYC311_df = NYC311_df[NYC311_df['zipcode'].apply(lambda x: str(x).isdigit() and len(str(x)) == 5)] 

    #created_date
    #rename "date"
    NYC311_df.rename(columns={'created_date': 'date'}, inplace=True)
    # sorting by date
    NYC311_df = NYC311_df.sort_values(by='date')
    #change date format into yyyy-mm-dd
    NYC311_df['date'] = pd.to_datetime(NYC311_df['date']).dt.strftime('%Y-%m-%d')
        
    # Assuming df is your existing DataFrame with latitude and longitude columns
    NYC311_df = gpd.GeoDataFrame(NYC311_df, geometry=gpd.points_from_xy(NYC311_df['longitude'], NYC311_df['latitude']))
    NYC311_df.set_crs(epsg=4326, inplace=True)


    #save the combined DataFrame to a new CSV file
    # NYC311_df.to_csv('data/nyc_311_data.csv', index=False)
    return NYC311_df
    

In [8]:
def download_and_clean_tree_data():
    download_nyc_csv_data(2015,"2015-01-01T00:00:00.000","2015-12-31T23:59:59.999","5rq2-4hqu",'tree')
    tree_df=pd.read_csv('data/tree_2015.csv')
    # Remove unnecessary columns by keeping only the ones you need
    columns_needed = ['created_at', 'tree_id', 'status','zipcode','health','spc_common', 'latitude', 'longitude']  # Replace with actual column names
    tree_df = tree_df[columns_needed]

    # Remove invalid data points
    # This is highly dependent on the context of your data, but as an example:
    tree_df.drop_duplicates(inplace=True)
    tree_df.dropna(inplace=True)  

    # Normalize column names
    tree_df.columns = [column_name.lower().replace(' ', '_') for column_name in tree_df.columns]
    #created_at
    tree_df.rename(columns={'created_at': 'date'}, inplace=True)
    tree_df['date'] = pd.to_datetime(tree_df['date']).dt.strftime('%Y-%m-%d')#change date format into yyyy-mm-dd

    #zipcode
    tree_df['zipcode'] = tree_df['zipcode'].astype(int)
    tree_df=tree_df.sort_values('date')

    tree_df = gpd.GeoDataFrame(tree_df, geometry=gpd.points_from_xy(tree_df['longitude'], tree_df['latitude']))
    tree_df.set_crs(epsg=4326, inplace=True)
    return tree_df


In [9]:
def load_and_clean_zillow_data():
    """
    Load and clean Zillow rent data from a CSV file.
    
    Returns:
    DataFrame: Cleaned data frame of Zillow rent data.
    """
    # Load the CSV data using Pandas
    zillow_data_path = DATA_DIR / "zillow_rent_data.csv"
    df = pd.read_csv(zillow_data_path)
    
    # Keep only necessary columns
    columns_to_keep = ['RegionName', 'State', 'City', 'Metro', 'CountyName'] + [col for col in df.columns if '-' in col]
    df_cleaned = df[columns_to_keep]
    
    # Remove rows with a significant number of missing values
    df_cleaned = df_cleaned.dropna(thresh=len(df_cleaned.columns)/2, axis=0)
    
    # Rename columns for clarity
    df_cleaned = df_cleaned.rename(columns={'RegionName': 'ZipCode'})
    
    return df_cleaned


In [10]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [11]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

Reading from data/nyc_311_data_2015.csv...
Reading from data/nyc_311_data_2016.csv...
Reading from data/nyc_311_data_2017.csv...
Reading from data/nyc_311_data_2018.csv...
Reading from data/nyc_311_data_2019.csv...
Reading from data/nyc_311_data_2020.csv...
Reading from data/nyc_311_data_2021.csv...
Reading from data/nyc_311_data_2022.csv...
Reading from data/nyc_311_data_2023.csv...


  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)


Reading from data/tree_2015.csv...


In [12]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   ZIPCODE     263 non-null    object  
 1   City        263 non-null    object  
 2   POPULATION  263 non-null    float64 
 3   AREA        263 non-null    float64 
 4   COUNTY      263 non-null    object  
 5   geometry    263 non-null    geometry
dtypes: float64(2), geometry(1), object(3)
memory usage: 12.5+ KB


In [13]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

Unnamed: 0,ZIPCODE,City,POPULATION,AREA,COUNTY,geometry
0,11436,Jamaica,18681.0,22699300.0,Queens,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,11213,Brooklyn,62426.0,29631000.0,Kings,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,11212,Brooklyn,83866.0,41972100.0,Kings,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,11225,Brooklyn,56527.0,23698630.0,Kings,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,11218,Brooklyn,72280.0,36868800.0,Kings,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


In [14]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 23030426 entries, 291587 to 16420015
Data columns (total 7 columns):
 #   Column          Dtype   
---  ------          -----   
 0   id_NYC311       int64   
 1   date            object  
 2   complaint_type  object  
 3   zipcode         int64   
 4   latitude        float64 
 5   longitude       float64 
 6   geometry        geometry
dtypes: float64(2), geometry(1), int64(2), object(2)
memory usage: 1.4+ GB


In [15]:
geodf_311_data.head()

Unnamed: 0,id_NYC311,date,complaint_type,zipcode,latitude,longitude,geometry
291587,29616011,2015-01-01,HEAT/HOT WATER,11207,40.667093,-73.891719,POINT (-73.89172 40.66709)
291738,29615514,2015-01-01,HEAT/HOT WATER,10034,40.868366,-73.916422,POINT (-73.91642 40.86837)
291739,29615513,2015-01-01,HEAT/HOT WATER,11377,40.744999,-73.892968,POINT (-73.89297 40.74500)
291740,29615512,2015-01-01,HEAT/HOT WATER,10472,40.833156,-73.87054,POINT (-73.87054 40.83316)
291741,29615511,2015-01-01,HEAT/HOT WATER,10024,40.787862,-73.976899,POINT (-73.97690 40.78786)


In [16]:
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 189969 entries, 13800 to 196344
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   date        189969 non-null  object  
 1   tree_id     189969 non-null  int64   
 2   status      189969 non-null  object  
 3   zipcode     189969 non-null  int64   
 4   health      189969 non-null  object  
 5   spc_common  189969 non-null  object  
 6   latitude    189969 non-null  float64 
 7   longitude   189969 non-null  float64 
 8   geometry    189969 non-null  geometry
dtypes: float64(2), geometry(1), int64(2), object(4)
memory usage: 14.5+ MB


In [17]:
geodf_tree_data.head()

Unnamed: 0,date,tree_id,status,zipcode,health,spc_common,latitude,longitude,geometry
13800,2015-05-19,347,Alive,10459,Fair,American hornbeam,40.821445,-73.892916,POINT (-73.89292 40.82144)
45386,2015-05-19,317,Alive,10459,Fair,Amur maackia,40.825308,-73.897495,POINT (-73.89750 40.82531)
1890,2015-05-19,306,Alive,10459,Good,Siberian elm,40.823933,-73.897177,POINT (-73.89718 40.82393)
22732,2015-05-19,9,Alive,10474,Poor,silver birch,40.814107,-73.889021,POINT (-73.88902 40.81411)
4301,2015-05-19,307,Alive,10459,Good,Siberian elm,40.824059,-73.89726,POINT (-73.89726 40.82406)


In [18]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2157 entries, 0 to 6721
Columns: 110 entries, ZipCode to 2023-09-30
dtypes: float64(105), int64(1), object(4)
memory usage: 1.8+ MB


In [19]:
df_zillow_data.head()

Unnamed: 0,ZipCode,State,City,Metro,CountyName,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,...,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30
0,77494,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,1606.206406,1612.779844,1622.201575,1630.392427,1632.4115,...,1994.653463,2027.438438,2042.237444,2049.325559,2016.531345,2023.438976,2031.558202,2046.144009,2053.486247,2055.771355
1,77449,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,1257.81466,1255.268025,1262.170452,1274.955754,1285.526052,...,1749.6979,1738.217986,1747.30584,1758.407295,1758.891075,1762.980879,1771.751591,1779.338402,1795.384582,1799.63114
2,77084,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,,,,,,...,1701.21752,1706.900064,1706.067787,1723.72232,1735.48467,1752.132904,1756.990323,1754.429516,1757.602011,1755.03149
4,11385,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,,2087.527084,,2149.924252,2166.263698,...,2935.80822,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783
5,78660,TX,Pflugerville,"Austin-Round Rock-Georgetown, TX",Travis County,1399.372678,1411.391149,1396.562265,1390.741122,1403.065652,...,2087.21953,2107.491824,2103.02069,2109.932132,2099.065912,2110.786195,2112.79221,2113.710515,2098.939433,2094.435442


## Part 2: Storing Data

### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

#### Option 2: SQLAlchemy

In [41]:
Base = declarative_base()

from sqlalchemy import create_engine, Column, Integer, String, Float, Date, MetaData, Table
from geoalchemy2 import Geometry
from sqlalchemy.ext.declarative import declarative_base

class NYCZipCode(Base):
    __tablename__ = 'NYCzipcodes'
    ZIPCODE = Column(String, primary_key=True)
    City = Column(String)
    POPULATION = Column(Float)
    AREA = Column(Float)
    COUNTY = Column(String)
    geometry = Column(Geometry('POLYGON', srid=4326))
    

class NYC311Complaints(Base):
    __tablename__ = 'nyc311_complaints'
    id_NYC311 = Column(Integer, primary_key=True)
    date = Column(Date)
    complaint_type = Column(String)
    zipcode = Column(Integer)
    latitude = Column(Float)
    longitude = Column(Float)
    geometry = Column(Geometry(geometry_type='POINT', srid=4326))

class Tree(Base):
    __tablename__ = 'trees'
    tree_id = Column(Integer, primary_key=True)
    date = Column(Date)
    status = Column(String)
    zipcode = Column(Integer)
    health = Column(String)
    spc_common = Column(String)
    latitude = Column(Float)
    longitude = Column(Float)
    geometry = Column(Geometry(geometry_type='POINT', srid=4326))

class ZillowRent(Base):
    __tablename__ = 'zillow_rent'
    
    ZipCode = Column(String, primary_key=True)
    State = Column(String)
    City = Column(String)
    Metro = Column(String)
    CountyName = Column(String)

engine = create_engine(DB_URL)
Base.metadata.create_all(engine)



In [21]:
geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

Add ZipCode Data to Database

In [22]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [24]:

from shapely import wkt
from sqlalchemy.dialects.postgresql import insert

geodf_zipcode_data['geometry'] = geodf_zipcode_data['geometry'].apply(lambda geom: geom.wkt)
for index, row in geodf_zipcode_data.iterrows():
    insert_stmt = insert(NYCZipCode).values(
        ZIPCODE=row['ZIPCODE'],
        City=row['City'],
        POPULATION=row['POPULATION'],
        AREA=row['AREA'],
        COUNTY=row['COUNTY'],
        geometry=row['geometry']
    )
    on_conflict_stmt = insert_stmt.on_conflict_do_update(
        index_elements=['ZIPCODE'],  # Unique constraint or column(s) causing conflict
        set_=dict(
            City=row['City'],
            POPULATION=row['POPULATION'],
            AREA=row['AREA'],
            COUNTY=row['COUNTY'],
            geometry=row['geometry']
        )
    )
    session.execute(on_conflict_stmt)


  geodf_zipcode_data['geometry'] = geodf_zipcode_data['geometry'].apply(lambda geom: geom.wkt)


In [25]:
session.commit()

Add NYC 311 Complaint Data to Database

In [None]:
from sqlalchemy.dialects.postgresql import insert
# Convert geometry column to WKT format
geodf_311_data['geometry'] = geodf_311_data['geometry'].apply(lambda geom: geom.wkt)

def insert_batch(session, model, data, batch_size=200000):
    for i in range(0, len(data), batch_size):
        batch = data.iloc[i:i+batch_size].to_dict(orient='records')

        for record in batch:
            stmt = insert(model).values(record)
            do_nothing_stmt = stmt.on_conflict_do_nothing(index_elements=['id_NYC311'])
            session.execute(do_nothing_stmt)

        session.commit()

# Insert data in batches
insert_batch(session, NYC311Complaints, geodf_311_data)


Add Tree Data to Database

In [44]:
session.rollback()

  session.rollback()


In [36]:
geodf_tree_data=download_and_clean_tree_data()

Reading from data/tree_2015.csv...


In [37]:
geodf_tree_data['geometry'] = geodf_tree_data['geometry'].apply(lambda geom: geom.wkt)

  geodf_tree_data['geometry'] = geodf_tree_data['geometry'].apply(lambda geom: geom.wkt)


In [45]:

for index, row in geodf_tree_data.iterrows():
    # Create a TreeData object for each row
    tree = Tree(
        tree_id=row['tree_id'],
        date=row['date'],
        status=row['status'],
        zipcode=row['zipcode'],
        health=row['health'],
        spc_common=row['spc_common'],
        latitude=row['latitude'],
        longitude=row['longitude'],
        geometry=row['geometry']
    )
    # Add each TreeData object to the session
    session.add(tree)

In [46]:
session.commit()

Add Zillow Rent Data to Database

In [47]:
# Loop through each row in the DataFrame for Zillow rent data
for index, row in df_zillow_data.iterrows():
    # Create a ZillowRent object for each row (assuming ZillowRent is the model class)
    rent = ZillowRent(
        ZipCode=row['ZipCode'],
        State=row['State'],
        City=row['City'],
        Metro=row['Metro'],
        CountyName=row['CountyName']
    )
    # Add each ZillowRent object to the session
    session.add(rent)

In [48]:
# Commit the session to save all added objects to the database
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

In [None]:

from sqlalchemy.sql import text

# Define the directory where query files will be saved
# Creates the directory if it does not exist
QUERY_DIR = pathlib.Path("queries")
QUERY_DIR.mkdir(exist_ok=True)

# Define the SQL query for Query 1
# This query finds the number of 311 complaints per zip code 
# between 2022-10-01 and 2023-09-30 and orders them in descending order
QUERY_1 = """
SELECT zipcode, COUNT(*) AS complaint_count
FROM nyc311_complaints
WHERE date BETWEEN '2022-10-01' AND '2023-09-30'
GROUP BY zipcode
ORDER BY complaint_count DESC;
"""

# Function to write the SQL query to a file
def write_query_to_file(query, outfile):
    with open(outfile, 'w') as file:
        file.write(query)

# File path for saving the query
QUERY_1_FILENAME = QUERY_DIR / "complaints_per_zipcode.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_1))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_1, QUERY_1_FILENAME)


### Query 2

In [None]:
# Query 2: Finding the top 10 zip codes with the most trees
# This query aims to identify which 10 zip codes have the highest number of trees

QUERY_2 = """
SELECT zipcode, COUNT(*) AS tree_count
FROM tree_data
GROUP BY zipcode
ORDER BY tree_count DESC
LIMIT 10;
"""

# File path for saving the query
QUERY_2_FILENAME = QUERY_DIR / "top_10_zipcodes_by_trees.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_2))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_2, QUERY_2_FILENAME)


### Query 3

In [None]:
# Query 3: Calculating average rent in the areas with the most trees for August 2023
# This query identifies the average rent by zip code for the top 10 zip codes with the most trees, 
# specifically for the month of August 2023

QUERY_3 = """
WITH TopTreeZipCodes AS (
    SELECT zipcode
    FROM tree_data
    GROUP BY zipcode
    ORDER BY COUNT(*) DESC
    LIMIT 10
)
SELECT ttz.zipcode, ROUND(AVG(r.rent), 2) AS average_rent
FROM TopTreeZipCodes ttz
JOIN zillow_rent r ON ttz.zipcode = r.zipcode
WHERE r.date BETWEEN '2023-08-01' AND '2023-08-31'
GROUP BY ttz.zipcode
ORDER BY COUNT(*) DESC;
"""

# File path for saving the query
QUERY_3_FILENAME = QUERY_DIR / "average_rent_in_green_areas.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_3))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_3, QUERY_3_FILENAME)


### Query 4

In [None]:
# Query 4: Correlation between area's rent, tree count, and number of 311 complaints
# This query finds the 5 zip codes with the lowest and highest average rent for January 2023,
# along with the tree count and complaint count for each zip code

QUERY_4 = """
WITH RentRanking AS (
    SELECT zipcode, ROUND(AVG(rent), 2) AS average_rent,
    RANK() OVER (ORDER BY AVG(rent)) AS rent_rank
    FROM zillow_rent
    WHERE date BETWEEN '2023-01-01' AND '2023-01-31'
    GROUP BY zipcode
)
SELECT rr.zipcode, rr.average_rent, 
       (SELECT COUNT(*) FROM tree_data WHERE zipcode = rr.zipcode) AS tree_count,
       (SELECT COUNT(*) FROM nyc311_complaints WHERE zipcode = rr.zipcode AND date BETWEEN '2023-01-01' AND '2023-01-31') AS complaint_count
FROM RentRanking rr
WHERE rr.rent_rank <= 5 OR rr.rent_rank >= (SELECT MAX(rent_rank) - 4 FROM RentRanking)
ORDER BY rr.average_rent;
"""

# File path for saving the query
QUERY_4_FILENAME = QUERY_DIR / "rent_tree_complaint_correlation.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_4))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_4, QUERY_4_FILENAME)


### Query 5

In [None]:
# Query 5: Identifying areas with the most greenery using spatial join
# This query rewrites Query 2 to include a spatial join between the trees table and the zipcodes table
# to determine which trees are located within the boundary of a zipcode

QUERY_5 = """
WITH TreeCount AS (
    SELECT z.zipcode, COUNT(t.id) AS tree_count
    FROM zipcodes z
    JOIN tree_data t ON ST_Within(t.geom, z.geom)
    GROUP BY z.zipcode
)
SELECT zipcode, tree_count
FROM TreeCount
ORDER BY tree_count DESC
LIMIT 10;
"""

# File path for saving the query
QUERY_5_FILENAME = QUERY_DIR / "greenery_areas_with_spatial_join.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_5))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_5, QUERY_5_FILENAME)


### Query 6

In [None]:
# Query 6: Finding trees within a ½ mile radius of a specific coordinate point
# This query identifies which trees are within a ½ mile radius of the given latitude and longitude

QUERY_6 = """
SELECT id, species, health, status, geom
FROM tree_data
WHERE ST_DWithin(
    geom,
    ST_SetSRID(ST_MakePoint(-73.96253174434912, 40.80737875669467), 4326),
    0.5 * 1609.34  -- 0.5 miles in meters
)
"""

# File path for saving the query
QUERY_6_FILENAME = QUERY_DIR / "trees_nearby_coordinate.sql"

# Execute the query and print the results
with engine.connect() as conn:
    result = conn.execute(text(QUERY_6))
    for row in result:
        print(row)

# Write the query to a file
write_query_to_file(QUERY_6, QUERY_6_FILENAME)


## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)