# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [1]:
# All import statements needed for the project, for example:

import json
import pathlib
from requests.exceptions import ReadTimeout
import time
import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db
from sodapy import Socrata
import glob

from sqlalchemy.orm import declarative_base

In [2]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "ZIP_CODE_040114.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "xX3rCbSDM4vF0QEfgh09b2ZWW"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "IEOR4501-XL"
DB_USER = "ylx"
DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In [4]:
#downloading data from small chunks
def download_nyc_csv_data(year,starttime,endtime,url,filename):
    filepath = f'{DATA_DIR}/{filename}_{year}.csv'
    query=f"""
    select * 
    where created_date between {starttime} 
    and {endtime}
    """

    if not filename:
        print(f"Downloading {url} to {filename}...")
        client = Socrata( "data.cityofnewyork.us",
                  "xX3rCbSDM4vF0QEfgh09b2ZWW",
                  username="yirong263@gmail.com",
                  password="UTDYnmz*zn2u3g6",
                  timeout=60)
        max_retries = 5
        retry_wait = 10  # Initial wait time in seconds

        while max_retries > 0:
            try:    
            # Set initial parameters for the SoQL query
                limit = 1000000  # Example limit
                offset = 0  # Start at the beginning
                total_records = 100000000  # Example total number of records you wish to download
                current_record = 0
                while current_record < total_records:
                    # Adjust the query to include the limit and offset
                    results = client.get(f"{url}",query= query+ f" limit {limit} offset {offset}")
                    
                    # Convert to DataFrame and save to CSV
                    df = pd.DataFrame.from_records(results)
                    df.to_csv(f'{filepath}', index=False)
                    
                    # Update the offset and current_record count
                    offset += limit
                    current_record += len(results)

                    # Optional: Print progress
                    print(f'Downloaded {current_record} of {total_records}')
                break
            
            except ReadTimeout:
                # Wait before retrying
                time.sleep(retry_wait)
                # Reduce the number of retries left
                max_retries -= 1
                # Increase the wait time for the next retry
                retry_wait *= 2
        
        print(f"Done downloading {url} from {year}.")

    else:
        print(f"Reading from {filepath}...")


    return filepath

In [None]:
def load_and_clean_zipcodes(zipcode_datafile):
    raise NotImplementedError()

In [5]:
def download_and_clean_311_data():
    #data downloading
    download_nyc_csv_data(2015,"2015-01-01T00:00:00.000","2015-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2016,"2016-01-01T00:00:00.000","2016-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2017,"2017-01-01T00:00:00.000","2017-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2018,"2018-01-01T00:00:00.000","2018-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2019,"2019-01-01T00:00:00.000","2019-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2020,"2020-01-01T00:00:00.000","2020-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2021,"2021-01-01T00:00:00.000","2021-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2022,"2022-01-01T00:00:00.000","2022-12-31T23:59:59.999","erm2-nwe9",'nyc_311_data')
    download_nyc_csv_data(2023,"2023-01-01T00:00:00.000","2015-09-30T23:59:59.999","erm2-nwe9",'nyc_311_data')

    # After downloading all chunks
    csv_files = glob.glob('data/nyc_311_data_*.csv')
    # Remove unnecessary columns by keeping only the ones you need for each file
    dfs=[]
    for file in csv_files:
        df=pd.read_csv(file)
        columns_needed = ['unique_key', 'created_date', 'complaint_type','incident_zip','latitude', 'longitude']  # Replace with actual column names
        df = df[columns_needed]
        #eliminate duplicate
        df.drop_duplicates(inplace=True)
        # Remove invalid data points
        # This is highly dependent on the context of your data, but as an example:
        df.dropna(inplace=True) 
        # Normalize column names
        df.columns = [column_name.lower().replace(' ', '_') for column_name in df.columns]
        dfs.append(df) # processed df and append to a list

    # Concatenate all DataFrames into one
    NYC311_df = pd.concat(dfs,ignore_index=True)
    NYC311_df.drop_duplicates(inplace=True)# header duplicate elimination
    # Normalize Column Types

    # unique_key 
    NYC311_df['unique_key'] = NYC311_df['unique_key'].astype(int)
    # change name into 'id_NYC311'
    NYC311_df.rename(columns={'unique_key': 'id_NYC311'}, inplace=True)

    #incident zip
    #rename from incident_zip to zipcode
    NYC311_df.rename(columns={'incident_zip': 'zipcode'}, inplace=True)
    # Convert the 'zipcode' column to a string type, then filter
    NYC311_df['zipcode']=NYC311_df['zipcode'].astype(float).astype(int)
    NYC311_df = NYC311_df[NYC311_df['zipcode'].apply(lambda x: str(x).isdigit() and len(str(x)) == 5)] 

    #created_date
    #rename "date"
    NYC311_df.rename(columns={'created_date': 'date'}, inplace=True)
    # sorting by date
    NYC311_df = NYC311_df.sort_values(by='date')
    #change date format into yyyy-mm-dd
    NYC311_df['date'] = pd.to_datetime(NYC311_df['date']).dt.strftime('%Y-%m-%d')
        
    # Assuming df is your existing DataFrame with latitude and longitude columns
    NYC311_df = gpd.GeoDataFrame(NYC311_df, geometry=gpd.points_from_xy(NYC311_df['longitude'], NYC311_df['latitude']))
    NYC311_df.crs = "EPSG:4326"  # Set the original CRS to WGS84
    target_srid = "EPSG:3857"  # Define the target CRS (Web Mercator)
    NYC311_df = NYC311_df.to_crs(target_srid)  # Transform the CRS to the target


    #save the combined DataFrame to a new CSV file
    # NYC311_df.to_csv('data/nyc_311_data.csv', index=False)
    return NYC311_df
    

In [23]:
def download_and_clean_tree_data():
    download_nyc_csv_data(2015,"2015-01-01T00:00:00.000","2015-12-31T23:59:59.999","5rq2-4hqu",'tree')
    tree_df=pd.read_csv('data/tree_2015.csv')
    # Remove unnecessary columns by keeping only the ones you need
    columns_needed = ['created_at', 'tree_id', 'status','zipcode','health','spc_common', 'latitude', 'longitude']  # Replace with actual column names
    tree_df = tree_df[columns_needed]

    # Remove invalid data points
    # This is highly dependent on the context of your data, but as an example:
    tree_df.drop_duplicates(inplace=True)
    tree_df.dropna(inplace=True)  

    # Normalize column names
    tree_df.columns = [column_name.lower().replace(' ', '_') for column_name in tree_df.columns]
    #created_at
    tree_df.rename(columns={'created_at': 'date'}, inplace=True)
    tree_df['date'] = pd.to_datetime(tree_df['date']).dt.strftime('%Y-%m-%d')#change date format into yyyy-mm-dd

    #zipcode
    tree_df['zipcode'] = tree_df['zipcode'].astype(int)
    tree_df=tree_df.sort_values('date')
    return tree_df


In [None]:
def load_and_clean_zillow_data():
    raise NotImplementedError()

In [27]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [24]:
geodf_tree_data = download_and_clean_tree_data()

Reading from data/tree_2015.csv...


In [6]:
geodf_311_data = download_and_clean_311_data()

Reading from data/nyc_311_data_2015.csv...
Reading from data/nyc_311_data_2016.csv...
Reading from data/nyc_311_data_2017.csv...
Reading from data/nyc_311_data_2018.csv...
Reading from data/nyc_311_data_2019.csv...
Reading from data/nyc_311_data_2020.csv...
Reading from data/nyc_311_data_2021.csv...
Reading from data/nyc_311_data_2022.csv...
Reading from data/nyc_311_data_2023.csv...


  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)
  df=pd.read_csv(file)


In [None]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [None]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

In [None]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

In [16]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 23030505 entries, 291587 to 16420015
Data columns (total 7 columns):
 #   Column          Dtype   
---  ------          -----   
 0   id_NYC311       int64   
 1   date            object  
 2   complaint_type  object  
 3   zipcode         int64   
 4   latitude        float64 
 5   longitude       float64 
 6   geometry        geometry
dtypes: float64(2), geometry(1), int64(2), object(2)
memory usage: 1.4+ GB


In [9]:
geodf_311_data.head()

Unnamed: 0,id_NYC311,date,complaint_type,zipcode,latitude,longitude,geometry
291587,29616011,2015-01-01,HEAT/HOT WATER,11207,40.667093,-73.891719,POINT (-8225588.556 4963361.308)
291738,29615514,2015-01-01,HEAT/HOT WATER,10034,40.868366,-73.916422,POINT (-8228338.477 4992945.088)
291739,29615513,2015-01-01,HEAT/HOT WATER,11377,40.744999,-73.892968,POINT (-8225727.524 4974801.536)
291740,29615512,2015-01-01,HEAT/HOT WATER,10472,40.833156,-73.87054,POINT (-8223230.924 4987763.294)
291741,29615511,2015-01-01,HEAT/HOT WATER,10024,40.787862,-73.976899,POINT (-8235070.736 4981101.492)


In [25]:
geodf_tree_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189969 entries, 13800 to 196344
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        189969 non-null  object 
 1   tree_id     189969 non-null  int64  
 2   status      189969 non-null  object 
 3   zipcode     189969 non-null  int64  
 4   health      189969 non-null  object 
 5   spc_common  189969 non-null  object 
 6   latitude    189969 non-null  float64
 7   longitude   189969 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 13.0+ MB


In [26]:
geodf_tree_data.head()

Unnamed: 0,date,tree_id,status,zipcode,health,spc_common,latitude,longitude
13800,2015-05-19,347,Alive,10459,Fair,American hornbeam,40.821445,-73.892916
45386,2015-05-19,317,Alive,10459,Fair,Amur maackia,40.825308,-73.897495
1890,2015-05-19,306,Alive,10459,Good,Siberian elm,40.823933,-73.897177
22732,2015-05-19,9,Alive,10474,Poor,silver birch,40.814107,-73.889021
4301,2015-05-19,307,Alive,10459,Good,Siberian elm,40.824059,-73.89726


In [None]:
df_zillow_data.info()

In [None]:
df_zillow_data.head()

## Part 2: Storing Data

In [None]:
def setup_new_postgis_database(username, db_name):
    

In [None]:
setup_new_postgis_database(DB_USER, DB_NAME)

### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [None]:
engine = db.create_engine(DB_URL)

#### Option 1: SQL

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
ZIPCODE_SCHEMA = """
TODO
"""

NYC_311_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc311_complaints (
    id_NYC311 INT PRIMARY KEY,
    date DATE,
    complaint_type VARCHAR(255),
    zipcode INT,
    latitude FLOAT,
    longitude FLOAT,
    geometry GEOMETRY
);

"""

NYC_TREE_SCHEMA = """
CREATE TABLE IF NOT EXISTS tree_data (
    date DATE,
    tree_id INT PRIMARY KEY,
    status VARCHAR(255),
    zipcode INT,
    health VARCHAR(255),
    spc_common VARCHAR(255),
    latitude FLOAT,
    longitude FLOAT
);

"""

ZILLOW_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DB_SCHEMA_FILE, "w") as f:
    f.write(ZIPCODE_SCHEMA)
    f.write(NYC_311_SCHEMA)
    f.write(NYC_TREE_SCHEMA)
    f.write(ZILLOW_SCHEMA)

In [None]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
with engine.connect() as connection:
    pass

#### Option 2: SQLAlchemy

In [None]:
Base = declarative_base()

class Tree(Base):
    __tablename__ = "trees"

    ...


In [None]:
Base.metadata.create_all(engine)

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

#### Option 1: SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    # write INSERT statements or use pandas/geopandas to write SQL
    raise NotImplemented()

In [None]:
tablename_to_dataframe = {
    "zipcodes": geodf_zipcode_data,
    "complaints": geodf_311_data,
    "trees": geodf_tree_data,
    "rents": df_zillow_data,
}

In [None]:
write_dataframes_to_table(tablename_to_dataframe)

#### Option 2: SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
for row in geodf_tree_data.iterrows():
    tree = Tree(...)
    session.add(tree)

In [None]:
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

In [None]:
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)