# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Project Setup

In [None]:
# all import statements needed for the project, for example:

import math
import os

import bs4
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import pyarrow.parquet as pq
import pandas as pd
import requests
import sqlalchemy as db
import folium
from folium.plugins import HeatMap
import geopandas as gpd
import numpy as np
import scipy.stats as st
from geopy.distance import distance

In [None]:
"""any constants you might need; some have been added for you, and some you need to fill in"""

# Page that is scraped for links to the monthly trip-record files.
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Location of the taxi-zone shapefile.
TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
# NOTE(review): both are left empty; the Uber and weather loaders below use
# hard-coded file names instead of these constants.
UBER_CSV = ""
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# Bounding boxes given as two (lat, lon) corners: lower-left, then upper-right.
# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# SQLite database file, the schema dump, and the directory for query files.
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [None]:
"""Make sure the QUERY_DIRECTORY exists"""
# Create the query directory if needed. exist_ok=True replaces the manual
# errno check, which raised AttributeError for exceptions without `.errno`.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### Load Taxi Zones

In [None]:
""" This function takes the shapefile and returns an object
    consisting of each zone, locationId and its geomtry coordinates """

def load_taxi_zones(shapefile: str) -> list[dict]:
    """Read the taxi-zone shapefile into a list of per-zone records.

    Args:
        shapefile: Path of the shapefile, passed to ``geopandas.read_file``.

    Returns:
        One dict per row with the keys ``"zone"``, ``"locationId"`` and
        ``"geometry"``.
    """
    gdf = gpd.read_file(shapefile)

    # Columns are read by position.
    # NOTE(review): assumes position 3 is the zone name, 4 the location id and
    # 6 the geometry -- confirm against the shapefile's column order.
    return [
        {
            "zone": row.iloc[3],
            "locationId": row.iloc[4],
            "geometry": row.iloc[6],
        }
        for _, row in gdf.iterrows()
    ]

In [None]:
""" This function accepts the zone id and the taxi zones
    and matches the zone id with its relevant coordinates """

def lookup_coords_for_taxi_zone_id(zone_loc_id: int, loaded_taxi_zones: list) -> object:
    """Return the geometry stored for a taxi-zone location id.

    Args:
        zone_loc_id: Location id to look for.
        loaded_taxi_zones: Records with ``"locationId"`` and ``"geometry"``
            keys, as produced by ``load_taxi_zones``.

    Returns:
        The ``"geometry"`` value of the first matching record, or ``None``
        when no record has that location id.
    """
    for zone in loaded_taxi_zones:
        if zone["locationId"] == zone_loc_id:
            return zone["geometry"]
    return None

In [None]:
""" test - lookup_coords_for_taxi_zone_id() """

# Minimal fixture: two zone records with stand-in integer geometries.
zones = [{ "zone": 3, "locationId": 1, "geometry": 5 }, { "zone": 8, "locationId": 7, "geometry": 3 }]
# Location id 1 belongs to the first record, whose geometry is 5.
assert lookup_coords_for_taxi_zone_id(1, zones)  == 5

### Calculate distance

In [None]:
""" This function calculate the distance giving the pick up
    point and drop off point and returns a distance integer """

def calculate_distance_with_coords(from_coord: tuple, to_coord: tuple) -> float:
    """Return the distance in miles between two ``(lat, lon)`` points.

    Args:
        from_coord: Pick-up point as ``(latitude, longitude)``.
        to_coord: Drop-off point as ``(latitude, longitude)``.

    Returns:
        The distance in miles, or ``-1`` when a latitude is outside
        [-90, 90], a longitude is outside [-180, 180], or a value is NaN.
    """
    pickup_latitude, pickup_longitude = from_coord
    dropoff_latitude, dropoff_longitude = to_coord

    # Latitudes and longitudes have different valid ranges; checking the
    # longitudes against +/-90 rejected valid points such as -122.4.
    # Chained comparisons are False for NaN, so NaN also yields -1.
    latitudes_ok = all(
        -90 <= lat <= 90 for lat in (pickup_latitude, dropoff_latitude)
    )
    longitudes_ok = all(
        -180 <= lon <= 180 for lon in (pickup_longitude, dropoff_longitude)
    )
    if not (latitudes_ok and longitudes_ok):
        return -1

    return distance(
        (pickup_latitude, pickup_longitude),
        (dropoff_latitude, dropoff_longitude),
    ).miles

In [None]:
""" test - calculate_distance_with_coords() """

# Valid pair: the expected value is the distance in miles, rounded to 2 places.
from_coord = (37.7749, -122.4194)  # San Francisco coordinates
to_coord = (34.0522, -118.2437)  # Los Angeles coordinates
assert round(calculate_distance_with_coords(from_coord, to_coord), 2) == 347.37


# Invalid pair: the function is expected to return its -1 sentinel.
from_coord = (105, -122.4194)  # latitude 105 is outside the valid [-90, 90] range
to_coord = (34.0522, -118.2437)  # Los Angeles coordinates
assert calculate_distance_with_coords(from_coord, to_coord) == -1

In [None]:
""" This function adds a new column with the distance between coordinates to the Dataframe.
    The input is a dataframe and the output is the new dataframe """
 
def add_distance_column(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add a ``distance`` column computed from pick-up and drop-off coordinates.

    The frame is modified in place and also returned. It must contain the
    columns ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
    and ``dropoff_longitude``.
    """
    def _trip_miles(trip):
        # One row in, one distance out.
        start = (trip["pickup_latitude"], trip["pickup_longitude"])
        end = (trip["dropoff_latitude"], trip["dropoff_longitude"])
        return calculate_distance_with_coords(start, end)

    dataframe["distance"] = dataframe.apply(_trip_miles, axis=1)
    return dataframe

In [None]:
""" test - add_distance_column() """
 # TODO

### Process Taxi Data

In [None]:
""" This function downloads all the relevant files from the taxi webpage
    and places it into our local directory """

def download_files(month: int, year: int) -> None:
    """Download one month of yellow-taxi trip data into the working directory.

    The file is saved as ``yellow_taxi_<year>_<MM>.parquet``.

    Args:
        month: Month number (1-12); zero-padded to two digits in the names.
        year: Four-digit year.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved under a ``.parquet`` name.
    """
    formatted_month = f"{month:02d}"
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{formatted_month}.parquet"
    # os.path.join instead of a literal backslash, which is only a path
    # separator on Windows.
    destination = os.path.join(
        os.getcwd(), f"yellow_taxi_{year}_{formatted_month}.parquet"
    )

    # The context manager releases the streamed connection; the timeout keeps
    # a stalled server from hanging the notebook.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(destination, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

years = list(range(2009, 2016))
months = list(range(1, 13))

# Every year before 2015 is fetched in full; 2015 only January through June.
for year in years:
    months_to_fetch = months if year < 2015 else range(1, 7)
    for month in months_to_fetch:
        download_files(month, year)

In [None]:
""" This function gets all the URLs from the taxi web page and returns
    it as an array of strings """

def get_all_urls_from_taxi_page(taxi_page: str) -> list[str]:
    """Return the ``href`` of every link found on the taxi web page.

    Args:
        taxi_page: URL of the page to fetch.

    Returns:
        The ``href`` values in document order. If the request fails (network
        error, timeout, or HTTP error status) the error is printed and
        ``None`` is returned.
    """
    try:
        response = requests.get(taxi_page, timeout=30)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    return [href for href in hrefs if href is not None]

In [None]:
"""" test for get_all_urls_from_taxi_page() """

# NOTE(review): this fetches the live page, so the expected link count of 483
# only holds for as long as the page content stays the same.
assert len(get_all_urls_from_taxi_page(TAXI_URL)) == 483

In [None]:
""" This function goes through all the URLs on the taxi web page
    and returns only the ones ending in .parquet since we want
    parquet files and also the ones from the years 2009 to 2015
    to avoid iterating through unecessary files. """

def filter_taxi_parquet_urls(all_urls: list[str]) -> list[str]:
    """Keep only the yellow-taxi parquet URLs for the years 2009-2015.

    Args:
        all_urls: Candidate URLs, or ``None``.

    Returns:
        The URLs, in input order, that end in
        ``yellow_tripdata_<YYYY>-<MM>.parquet`` with a year from 2009 to 2015.
        ``None`` input gives an empty list. Names that do not match the
        pattern are skipped rather than raising.
    """
    if all_urls is None:
        return []

    # The year is captured by the pattern, so no positional splitting on "_"
    # is needed and the dot before "parquet" is matched literally.
    pattern = re.compile(r"yellow_tripdata_(\d{4})-\d{2}\.parquet$")

    parquet_urls = []
    for url in all_urls:
        match = pattern.search(url)
        if match and 2009 <= int(match.group(1)) <= 2015:
            parquet_urls.append(url)
    return parquet_urls

In [None]:
""" test for filter_taxi_parquet_urls() """

# 84 is 7 years (2009-2015) times 12 months.
# NOTE(review): this also depends on the live page content.
allUrlsData = get_all_urls_from_taxi_page(TAXI_URL)
assert len(filter_taxi_parquet_urls(allUrlsData)) == 84

In [None]:
""" This function takes a URL and extracts the month from it
    The example url can look like:
    https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet """

def get_and_clean_month(url: str) -> str:
    """Extract the month part from a trip-data URL.

    For ``.../yellow_tripdata_2022-06.parquet`` this returns ``"06"``. The
    month is taken as the text after the last ``-`` of the file name with its
    extension removed, so it does not depend on the extension's length.
    """
    file_name = url.rsplit("/", 1)[-1]
    stem = file_name.rsplit(".", 1)[0]
    return stem.rsplit("-", 1)[-1]

In [None]:
""" test for get_and_clean_month function """

# The month is the two digits between the last "-" and the file extension.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet'
assert get_and_clean_month(url) == '06'

In [None]:
""" This function takes a URL and extracts the year from it
    The example url can look like:
    https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet """

def get_and_clean_year(url: str) -> str:
    """Extract the year part from a trip-data URL.

    For ``.../yellow_tripdata_2022-06.parquet`` this returns ``"2022"``. The
    year is taken as the text between the last ``_`` and the last ``-`` of
    the file name, so it does not depend on the extension's length.
    """
    file_name = url.rsplit("/", 1)[-1]
    stem = file_name.rsplit(".", 1)[0]
    before_month = stem.rsplit("-", 1)[0]
    return before_month.rsplit("_", 1)[-1]

In [None]:
""" test for get_and_clean_year function """

# The year is the four digits between the last "_" and the "-" of the file name.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet'
assert get_and_clean_year(url) == '2022'

In [None]:
""" This fucntion adds a new column with the distance between coordinates to the taxi Dataframe.
    The input is a dataframe and the output is the new modified dataframe """
 
def add_distance_column_taxi(dataframe: pd.DataFrame) -> pd.Series:
    """Add a ``distance`` column to a taxi frame, in place.

    The frame must contain ``Start_Lat``, ``Start_Lon``, ``End_Lat`` and
    ``End_Lon``.

    Returns:
        The new ``distance`` column as a Series, not the frame. The frame
        itself is updated in place, which is what the calling cell relies on.
    """
    def _trip_miles(trip):
        start = (trip["Start_Lat"], trip["Start_Lon"])
        end = (trip["End_Lat"], trip["End_Lon"])
        return calculate_distance_with_coords(start, end)

    dataframe["distance"] = dataframe.apply(_trip_miles, axis=1)

    return dataframe["distance"]

In [None]:
""" This function collects all the parquet urls from the taxi website.
    It will then get the actual data from the parquet files and do various forms of cleaning.
    For example, we will remove unnecessary columns and invalid data and will return
    one gigantic dataframe with data from every month """

def convert_taxi_data(parquet_urls: list[str], sample_size: int = 20000) -> pd.DataFrame:
    """Load a random sample from each downloaded monthly parquet file.

    For every URL the matching local file ``yellow_taxi_<year>_<month>.parquet``
    is looked up in the working directory; URLs without a local file are
    skipped. No column selection or cleaning happens here.

    Args:
        parquet_urls: Trip-data URLs identifying the months to load.
        sample_size: Maximum number of rows sampled per file.

    Returns:
        The concatenated samples, keeping each file's original row labels.
        An empty DataFrame is returned when no file was found.
    """
    # List the directory once instead of once per URL.
    available_files = set(os.listdir(os.getcwd()))

    all_taxi_dataframes = []
    for parquet_url in parquet_urls:
        month = get_and_clean_month(parquet_url)
        year = get_and_clean_year(parquet_url)

        file_name = f"yellow_taxi_{year}_{month}.parquet"
        if file_name not in available_files:
            continue

        dataframe = pd.read_parquet(file_name)
        # DataFrame.sample raises when asked for more rows than exist.
        rows_to_take = min(sample_size, len(dataframe))
        all_taxi_dataframes.append(dataframe.sample(n=rows_to_take))

    # pd.concat raises on an empty list.
    if not all_taxi_dataframes:
        return pd.DataFrame()

    return pd.concat(all_taxi_dataframes)

In [None]:
""" This function gets all the urls from the taxi page, specifically the parquet urls,
    gets and cleans it, and returns the valid data """

def get_taxi_data() -> pd.DataFrame:
    """Scrape the taxi page, keep the relevant parquet links and load the data.

    Chains ``get_all_urls_from_taxi_page``, ``filter_taxi_parquet_urls`` and
    ``convert_taxi_data`` and returns the resulting frame.
    """
    return convert_taxi_data(
        filter_taxi_parquet_urls(get_all_urls_from_taxi_page(TAXI_URL))
    )


In [None]:
# NOTE(review): `taxi_data` is not assigned in any visible cell; presumably
# `taxi_data = get_taxi_data()` was run beforehand -- confirm.
selected_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance','PULocationID', 'DOLocationID','fare_amount','tip_amount','total_amount','pickup_datetime', 'dropoff_datetime', 'pickup_longitude', 'pickup_latitude',  'dropoff_longitude', 'dropoff_latitude', 'Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 'Passenger_Count', 'Trip_Distance', 'Start_Lon', 'Start_Lat',  'End_Lon', 'End_Lat',  'Fare_Amt', 'Tip_Amt', 'Total_Amt']

# .copy() so the fills below write to an independent frame instead of a
# slice of `taxi_data`.
df_selected = taxi_data[selected_cols].copy()

# The same quantity appears under different column names in different files.
# Each target column is filled from its alternatives, in the order listed.
column_fallbacks = {
    "tpep_pickup_datetime": ["pickup_datetime", "Trip_Pickup_DateTime"],
    "tpep_dropoff_datetime": ["dropoff_datetime", "Trip_Dropoff_DateTime"],
    "Trip_Distance": ["trip_distance"],
    "Passenger_Count": ["passenger_count"],
    # Coordinates are filled from the other coordinate columns. Location ids
    # are zone identifiers, not degrees, so they are not copied into these.
    "Start_Lon": ["pickup_longitude"],
    "Start_Lat": ["pickup_latitude"],
    "End_Lon": ["dropoff_longitude"],
    "End_Lat": ["dropoff_latitude"],
    "Fare_Amt": ["fare_amount"],
    "Tip_Amt": ["tip_amount"],
    "Total_Amt": ["total_amount"],
}

for target, fallbacks in column_fallbacks.items():
    for fallback in fallbacks:
        df_selected[target] = df_selected[target].fillna(df_selected[fallback])

# Drop every alternative column now that its values have been merged.
# TODO: rows that only carry PULocationID/DOLocationID keep NaN coordinates;
# translate the ids via lookup_coords_for_taxi_zone_id before dropping them.
merged_away = [name for fallbacks in column_fallbacks.values() for name in fallbacks]
df_selected_final = df_selected.drop(columns=merged_away + ['PULocationID', 'DOLocationID'])

df_selected_final


In [None]:
# Keep only trips that start and end inside the New York bounding box and
# carry 1 to 6 passengers, add the computed distance, and drop trips whose
# computed distance is 0. Fares are not filtered here.
# (Two stray indented lines that filtered `uber_data` were removed: they made
# this cell a syntax error and do not belong to the taxi data.)
(min_lat, min_lon), (max_lat, max_lon) = NEW_YORK_BOX_COORDS

inside_box = (
    df_selected_final["Start_Lat"].between(min_lat, max_lat)
    & df_selected_final["Start_Lon"].between(min_lon, max_lon)
    & df_selected_final["End_Lat"].between(min_lat, max_lat)
    & df_selected_final["End_Lon"].between(min_lon, max_lon)
)
valid_passengers = (
    (df_selected_final["Passenger_Count"] != 0)
    & (df_selected_final["Passenger_Count"] <= 6.0)
)

# .copy() gives an independent frame for the in-place distance column, and
# fresh row labels avoid repeated labels left over from concatenating files.
df_selected_final = df_selected_final[inside_box & valid_passengers].copy()
df_selected_final = df_selected_final.reset_index(drop=True)

add_distance_column_taxi(df_selected_final)

# A boolean mask removes exactly the zero-distance rows; dropping by label
# would also remove other rows that share a label.
df_selected_final = df_selected_final[df_selected_final["distance"] != 0]
df_selected_final = df_selected_final.reset_index(drop=True)

# Only the two timestamp columns change name; the rest already match.
df_selected_final = df_selected_final.rename(columns={
    "tpep_pickup_datetime": "pickup_datetime",
    "tpep_dropoff_datetime": "dropoff_datetime",
})

df_selected_final


In [None]:
# Independent copy of the cleaned taxi trips; this is the frame written to TAXI_TRIPS.
Taxi_Data = df_selected_final.copy()
Taxi_Data.head()

### Processing Uber Data

In [None]:
"""This function first loads the uber data from the csv file. 
We then filter based on coordinates to make sure the rides are within the coordinates we want.
We also remove trips with 0 passangers and no fares. We further remove trips with passangers above 6 as that 
is uber policy. Lastly we remove trips with no distace between dropoff and pickup. The output is the
cleaned dataframe"""

def load_and_clean_uber_data(csv_file, box_coords=((40.560445, -74.242330), (40.908524, -73.717047))) -> pd.DataFrame:
    """Load the Uber CSV and keep only plausible trips inside the bounding box.

    Rows are kept when both pick-up and drop-off lie inside ``box_coords``
    (bounds inclusive) and the passenger count is between 1 and 6. Rows with
    missing coordinates fail the box test and are dropped with it. Fares and
    distances are not filtered here.

    Args:
        csv_file: Path or file-like object passed to ``pandas.read_csv``.
        box_coords: ``((min_lat, min_lon), (max_lat, max_lon))``; defaults to
            the New York box used throughout the notebook.

    Returns:
        The filtered frame with the original row labels and an extra
        ``pickup_time`` column holding ``pickup_datetime`` parsed as datetimes.
    """
    (min_lat, min_lon), (max_lat, max_lon) = box_coords

    uber_data = pd.read_csv(csv_file)

    inside_box = (
        uber_data["pickup_latitude"].between(min_lat, max_lat)
        & uber_data["pickup_longitude"].between(min_lon, max_lon)
        & uber_data["dropoff_latitude"].between(min_lat, max_lat)
        & uber_data["dropoff_longitude"].between(min_lon, max_lon)
    )
    # A passenger count of 0 or above 6 is treated as invalid.
    valid_passengers = (
        (uber_data['passenger_count'] != 0)
        & (uber_data['passenger_count'] <= 6)
    )

    # .copy() so the new column is added to an independent frame rather than
    # to a filtered view of the CSV data.
    uber_data = uber_data[inside_box & valid_passengers].copy()

    uber_data['pickup_time'] = pd.to_datetime(uber_data['pickup_datetime'])

    return uber_data


In [None]:
# Preview of the cleaned Uber data; the result is displayed, not stored.
load_and_clean_uber_data("uber_rides_sample.csv")

In [None]:
""" We use the add distance column fcuntion we had defined before to add a new column with the distance 
of the ride to our uber data. We also drop columns where the distance of the ride is ==0"""

def get_uber_data() -> pd.DataFrame:
    """Load the cleaned Uber sample, add distances and drop zero-distance trips.

    Reads ``uber_rides_sample.csv`` via ``load_and_clean_uber_data``, lets
    ``add_distance_column`` add the ``distance`` column in place, and removes
    the rows whose distance equals 0.
    """
    trips = load_and_clean_uber_data("uber_rides_sample.csv")
    add_distance_column(trips)
    zero_distance_labels = trips.index[trips['distance'] == 0]
    return trips.drop(index=zero_distance_labels)


In [None]:
# Cleaned Uber trips including the computed distance column.
final_uber_data = get_uber_data()

In [None]:
#Removing unnecessary columns 
# Both helper columns from the CSV are removed in a single call.
final_uber_data = final_uber_data.drop(columns=['Unnamed: 0', 'key'])

In [None]:
# Display the final Uber frame.
final_uber_data

### Processing Weather Data

In [None]:
"""This function takes all the weather files, iterates through them and merges them 
into one dataframe. The output is the combined dataframe"""

def get_all_weather_csvs() -> pd.DataFrame:
    """Read the yearly weather CSVs for 2009-2015 and stack them into one frame.

    The files ``<year>_weather.csv`` are read from the working directory in
    ascending year order; the result gets a fresh 0..n-1 index.
    """
    yearly_frames = [
        pd.read_csv(f"{year}_weather.csv") for year in range(2009, 2016)
    ]
    return pd.concat(yearly_frames, ignore_index=True)

In [None]:
"""This function first loads the uber data from the csv file. 
We then filter based on coordinates to make sure the rides are within the coordinates we want.
We also remove trips with 0 passangers and no fares. We further remove trips with passangers above 6 as that 
is uber policy. Lastly we remove trips with no distace between dropoff and pickup. The output is the
cleaned dataframe"""

def load_and_clean_weather_data() -> pd.DataFrame:
    """Load all weather CSVs and return cleaned hourly observations.

    Steps: keep the date, station name, precipitation and wind columns; drop
    rows without precipitation or gust readings; parse ``DATE``; remove
    trace-precipitation rows (value ``"T"``); strip a trailing ``s``/``S``
    from precipitation values and convert them to float.

    Returns:
        A frame with the columns ``DATE``, ``NAME``, ``HourlyPrecipitation``
        and ``HourlyWindSpeed`` and a fresh 0..n-1 index.
    """
    raw = get_all_weather_csvs()

    # .copy() so the column assignments below do not write into a slice of `raw`.
    weather = raw[['DATE', 'NAME', 'HourlyPrecipitation',
                   'HourlyWindGustSpeed', 'HourlyWindSpeed']].copy()

    # NOTE(review): rows without a gust reading are discarded although the
    # gust column itself is dropped right after -- confirm this is intended.
    weather = weather.dropna(subset=['HourlyPrecipitation', 'HourlyWindGustSpeed'])
    weather = weather.drop(columns=['HourlyWindGustSpeed'])

    weather['DATE'] = pd.to_datetime(weather['DATE'])

    # "T" marks trace amounts, which are not measured here.
    weather = weather[weather['HourlyPrecipitation'] != "T"].copy()

    # Some values carry a trailing "s" (e.g. "0.02s"). astype(str) keeps the
    # .str accessor usable when the column was read as numeric.
    precipitation = weather['HourlyPrecipitation'].astype(str).str.replace(
        r'(\d+)\s*[sS]$', r'\1', regex=True
    )
    weather['HourlyPrecipitation'] = precipitation.astype(float)

    return weather.reset_index(drop=True)


In [None]:
# Preview of the cleaned weather data; the result is displayed, not stored.
load_and_clean_weather_data()

In [None]:
"""Roll up the data to daily"""
def clean_month_weather_data_daily() -> pd.DataFrame:
    """Roll the cleaned hourly weather observations up to one row per day.

    Returns:
        A frame indexed by ``(Year, Month, Day)`` holding the per-day sums of
        ``HourlyPrecipitation`` and ``HourlyWindSpeed``.
    """
    daily_data = load_and_clean_weather_data()
    dates = daily_data['DATE'].dt

    # Select the numeric columns before summing. Summing every column first
    # also tries to sum the datetime DATE column, which pandas 2 rejects.
    # NOTE(review): wind speed is summed over the day, not averaged -- confirm.
    daily_data_final = daily_data.groupby(
        [dates.year, dates.month, dates.day]
    )[['HourlyPrecipitation', "HourlyWindSpeed"]].sum()

    return daily_data_final.rename_axis(index=['Year', 'Month', 'Day'])

In [None]:
def clean_month_weather_data_hourly() -> pd.DataFrame:
    """Roll the cleaned weather observations up to one row per hour.

    Returns:
        A frame indexed by ``(Year, Month, Day, Hour)`` holding the per-hour
        sums of ``HourlyPrecipitation`` and ``HourlyWindSpeed``.
    """
    hourly_data = load_and_clean_weather_data()
    dates = hourly_data['DATE'].dt

    # Select the numeric columns before summing. Summing every column first
    # also tries to sum the datetime DATE column, which pandas 2 rejects.
    hourly_data_final = hourly_data.groupby(
        [dates.year, dates.month, dates.day, dates.hour]
    )[['HourlyPrecipitation', "HourlyWindSpeed"]].sum()

    return hourly_data_final.rename_axis(index=['Year', 'Month', 'Day', 'Hour'])

In [None]:
#hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# reset_index turns the Year/Month/Day/Hour index levels into ordinary
# columns, so they are kept when the frame is written with index=False.
hourly_weather_data = clean_month_weather_data_hourly().reset_index()
hourly_weather_data.head()


In [None]:
# Daily roll-up with the Year/Month/Day index levels turned into columns.
daily_weather_data = clean_month_weather_data_daily().reset_index()

daily_weather_data

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for the database at DATABASE_URL; used by the cells below.
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# CREATE TABLE statements for the four tables. Every statement must be valid
# SQLite: columns separated by commas, no comma before the closing parenthesis.
HOURLY_WEATHER_SCHEMA = """
    CREATE TABLE IF NOT EXISTS HOURLY_WEATHER (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        year INTEGER,
        month INTEGER,
        day INTEGER,
        hour INTEGER,
        precipitation REAL,
        wind REAL
    );
"""

DAILY_WEATHER_SCHEMA = """
    CREATE TABLE IF NOT EXISTS DAILY_WEATHER (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        year INTEGER,
        month INTEGER,
        day INTEGER,
        precipitation REAL,
        wind REAL
    );
"""

# Fixed: a comma was missing after Passenger_Count and a trailing comma
# followed the last column.
TAXI_TRIPS_SCHEMA = """
    CREATE TABLE IF NOT EXISTS TAXI_TRIPS (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        pickup_datetime TEXT,
        dropoff_datetime TEXT,
        Passenger_Count REAL,
        Trip_Distance REAL,
        Start_Lon REAL,
        Start_Lat REAL,
        End_Lon REAL,
        End_Lat REAL,
        Fare_Amt REAL,
        Tip_Amt REAL,
        Total_Amt REAL,
        distance REAL
    );
"""

# Fixed: a trailing comma followed the last column.
UBER_TRIPS_SCHEMA = """
    CREATE TABLE IF NOT EXISTS UBER_TRIPS (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        pickup_datetime TEXT,
        pickup_longitude REAL,
        pickup_latitude REAL,
        dropoff_longitude REAL,
        dropoff_latitude REAL,
        fare_amount REAL,
        distance REAL,
        passenger_count INTEGER
    );
"""

In [None]:
# create that required schema.sql file
# The four CREATE TABLE statements are written back to back, in this order.
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.writelines([
        HOURLY_WEATHER_SCHEMA,
        DAILY_WEATHER_SCHEMA,
        TAXI_TRIPS_SCHEMA,
        UBER_TRIPS_SCHEMA,
    ])

In [None]:
# create the tables with the schema files
# NOTE(review): placeholder -- the body is `pass`, so no CREATE TABLE
# statement is executed here; the connection is only opened and closed.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table():
    """Write the four cleaned frames to their database tables.

    Reads the module-level frames ``hourly_weather_data``,
    ``daily_weather_data``, ``final_uber_data`` and ``Taxi_Data`` and writes
    them, in that order, through ``engine``. ``if_exists='replace'`` replaces
    an existing table of the same name; the frame index is not stored.
    """
    def _store(frame, table_name):
        frame.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

    _store(hourly_weather_data, 'HOURLY_WEATHER')
    _store(daily_weather_data, 'DAILY_WEATHER')
    _store(final_uber_data, 'UBER_TRIPS')
    _store(Taxi_Data, 'TAXI_TRIPS')


write_dataframes_to_table()

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# preview the first ten rows of the TAXI_TRIPS table
query = "SELECT * FROM TAXI_TRIPS LIMIT 10;"

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Write the SQL text ``query`` to the file ``outfile``.

    Args:
        query: SQL text to store.
        outfile: Destination path. Missing parent directories are created;
            an existing file is overwritten.
    """
    parent_dir = os.path.dirname(outfile)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(outfile, "w", encoding="utf-8") as f:
        f.write(query)

### Query 1

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# load the query text from its .sql file
with open('1_hour_day.sql', 'r') as file:
    query = file.read()

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)



### Query 2

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# load the query text from its .sql file
with open('2_day_week.sql', 'r') as file:
    query = file.read()

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)

### Query 3

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# load the query text from its .sql file
with open('3_95_percentile.sql', 'r') as file:
    query = file.read()

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)

### Query 4

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# load the query text from its .sql file
with open('4_top_10_days.sql', 'r') as file:
    query = file.read()

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)

### Query 5

In [None]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# load the query text from its .sql file
with open('5_10_windiest_days.sql', 'r') as file:
    query = file.read()

# Engine.execute() no longer exists in SQLAlchemy 2.0. An explicit connection
# with exec_driver_sql runs the raw SQL string and releases the connection.
with engine.connect() as connection:
    result = connection.exec_driver_sql(query)
    for row in result:
        print(row)

### Query 6

In [None]:
# Counts Uber trips per day of the week; SQLite's strftime('%w', ...) yields
# 0-6 with Sunday = 0.
# NOTE(review): strftime returns NULL for text it cannot parse as a date --
# confirm the stored pickup_datetime format is accepted by SQLite.
QUERY_6 = """
    SELECT strftime('%w', pickup_datetime) AS day_of_week, COUNT(*) AS frequency
    FROM UBER_TRIPS
    WHERE pickup_datetime BETWEEN '2009-01-01 00:00:00 UTC' AND '2015-06-30 23:59:59 UTC'
    GROUP BY day_of_week
    ORDER BY day_of_week;
"""

In [None]:
# QUERY_6 is the only query defined as a string in this notebook (the name
# QUERY_1 does not exist). Engine.execute() no longer exists in SQLAlchemy
# 2.0, so the query runs on an explicit connection.
with engine.connect() as connection:
    query_6_rows = connection.exec_driver_sql(QUERY_6).fetchall()

query_6_rows

In [None]:
# NOTE(review): neither QUERY_1 nor QUERY_1_FILENAME is defined in any visible
# cell, so this call raises NameError as written -- confirm which query and
# file name were intended.
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
def plot_frequency_hour(dataframe):
    """Draw a bar chart titled "Frequency per Hour" and show it.

    NOTE(review): ``dataframe`` is not read yet; the chart is drawn from the
    hard-coded placeholder values below, with fixed axis limits.
    """
    # Placeholder data.
    hour = [1, 2, 3, 4, 5]
    values = [1, 5, 3, 2, 5]

    figure, axes = plt.subplots(figsize=(20, 10))
    axes.bar(hour, values)

    axes.set_title("Frequency per Hour")
    axes.set_xlabel('Hour')
    axes.set_ylabel('Popularity')
    axes.set_xlim(-1, 11)
    axes.set_ylim(-1.5, 1.5)

    plt.show()

In [None]:
def get_data_frequency_hour():
    """Placeholder for the visualization 1 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_frequency_hour() currently raises NotImplementedError, so
# this cell cannot run until that function is implemented.
some_dataframe = get_data_frequency_hour()
plot_frequency_hour(some_dataframe)

### Visualization 2

In [None]:
def plot_avg_distance_month(dataframe):
    """Draw average distance per month as a line with markers and a shaded band.

    NOTE(review): ``dataframe`` is not read yet; the chart is drawn from the
    hard-coded placeholder values below.
    """
    # Placeholder data.
    month = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
    distance = [1, 5, 3, 2, 5, 7, 8, 1, 9, 23, 6, 7]

    # Half-width of the shaded band around the line.
    # NOTE(review): the 1.90 multiplier is kept as found -- confirm the
    # intended confidence level.
    ci = 1.90 * np.std(distance)/np.sqrt(len(month))
    centre = np.asarray(distance)
    lower_band = centre - ci
    upper_band = centre + ci

    figure, axes = plt.subplots(figsize=(30, 20))

    axes.plot(month,distance)
    axes.plot(month, distance, 'o', color='tab:brown')
    axes.fill_between(month, lower_band, upper_band, color= 'b', alpha = 0.1)

    axes.set_title("Average Distance per Month")
    axes.set_xlabel('Month')
    axes.set_ylabel('Average Distance')

    plt.show()

In [None]:
def get_data_avg_distance_month():
    """Placeholder for the visualization 2 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_avg_distance_month() currently raises NotImplementedError,
# so this cell cannot run until that function is implemented.
some_dataframe = get_data_avg_distance_month()
plot_avg_distance_month(some_dataframe)

### Visualization 3

In [None]:
def plot_dropoffs_ny_area(dataframe):
    """Return a folium map with one clickable rectangle per airport area.

    Clicking a rectangle opens a popup naming a day of the week.

    NOTE(review): ``dataframe`` is not read yet; every popup shows the
    hard-coded text 'Tuesday'.
    """
    map_obj = folium.Map(location = [40.730610, -73.935242], zoom_start = 11, min_zoom = 11, tiles='CartoDB positron')

    # Rectangle corners as (lat, lon) pairs, added in this order.
    airport_boxes = [
        [(40.778865,-73.854838), (40.763589,-73.891745)],    # LGA
        [(40.651376, -73.766264), (40.639263, -73.795642)],  # JFK
        [(40.699680, -74.165205), (40.686794, -74.194028)],  # EWR
    ]

    for bounds in airport_boxes:
        rectangle = folium.Rectangle(bounds, fill=True, fill_color='#ff7800', fill_opacity=0.2)
        rectangle.add_child(folium.Popup('Tuesday'))
        rectangle.add_to(map_obj)

    return map_obj

In [None]:
def get_data_dropoffs_ny_area():
    """Placeholder for the visualization 3 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_dropoffs_ny_area() currently raises NotImplementedError, so
# this cell cannot run until that function is implemented.
some_dataframe = get_data_dropoffs_ny_area()
plot_dropoffs_ny_area(some_dataframe)

### Visualization 4

In [None]:
def plot_trips_area(dataframe):
    """Return a folium map with a heat-map layer of trip locations.

    NOTE(review): ``dataframe`` is not read yet; the layer is built from the
    hard-coded placeholder points below.
    """
    # Placeholder [lat, lon] points.
    lats_longs = [
        [40.7554, -73.9862],
        [40.7794, -73.9654],
        [40.7223, -73.9982],
        [40.7455, -74.0071],
    ]

    map_obj = folium.Map(location = [40.730610, -73.935242], zoom_start = 10, min_zoom = 10, tiles='CartoDB positron')
    heat_layer = HeatMap(lats_longs)
    heat_layer.add_to(map_obj)
    return map_obj

In [None]:
def get_data_trips_area():
    """Placeholder for the visualization 4 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_trips_area() currently raises NotImplementedError, so this
# cell cannot run until that function is implemented.
some_dataframe = get_data_trips_area()
plot_trips_area(some_dataframe)

### Visualization 5

In [None]:
def plot_tips_distance(dataframe):
    """Draw a scatter plot of tip amount (y) against trip distance (x).

    NOTE(review): ``dataframe`` is not read yet; the plot uses the hard-coded
    placeholder values below, and the fixed axis limits hide points outside
    -1..11 (x) and -1..10 (y).
    """
    figure, axes = plt.subplots(figsize=(20, 10))

    # Placeholder data.
    tips = [2, 6, 3, 7, 8, 1, 9, 22, 9, 6, 22, 1]
    distance = [1, 5, 3, 2, 5, 7, 8, 1, 9, 23, 6, 7]

    axes.scatter(distance, tips, marker='o', alpha=0.5)
    axes.set_title("Yellow Taxi - Tips vs. Distance")
    # The y axis carries the tips, so it is labelled 'Tips'.
    axes.set_ylabel('Tips')
    axes.set_xlabel('Distance')
    axes.set_xlim(-1, 11)
    axes.set_ylim(-1, 10)

    plt.show()

In [None]:
def get_data_tips_distance():
    """Placeholder for the visualization 5 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_tips_distance() currently raises NotImplementedError, so
# this cell cannot run until that function is implemented.
some_dataframe = get_data_tips_distance()
plot_tips_distance(some_dataframe)

### Visualization 6

In [None]:
def plot_tips_precipitation(dataframe):
    """Draw a scatter plot of tip amount (y) against precipitation (x).

    NOTE(review): ``dataframe`` is not read yet; the plot uses the hard-coded
    placeholder values below, and the fixed axis limits hide points outside
    -1..11 (x) and -1..10 (y).
    """
    figure, axes = plt.subplots(figsize=(20, 10))

    # Placeholder data.
    tips = [2, 6, 3, 7, 8, 1, 9, 22, 9, 6, 22, 1]
    precipication = [1, 5, 3, 2, 5, 7, 8, 1, 9, 23, 6, 7]

    axes.scatter(precipication, tips, marker='o', alpha=0.5)
    axes.set_title("Yellow Taxi - Tips vs. Precipitation")
    # Labels follow the plotted data: precipitation on x, tips on y.
    axes.set_xlabel('Precipitation')
    axes.set_ylabel('Tips')
    axes.set_xlim(-1, 11)
    axes.set_ylim(-1, 10)

    plt.show()

In [None]:
def get_data_tips_precipitation():
    """Placeholder for the visualization 6 data loader; always raises NotImplementedError."""
    raise NotImplementedError()

In [None]:
# NOTE: get_data_tips_precipitation() currently raises NotImplementedError,
# so this cell cannot run until that function is implemented.
some_dataframe = get_data_tips_precipitation()
plot_tips_precipitation(some_dataframe)