# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Project Setup

In [231]:
# all import statements needed for the project, for example:

import math
import os

import bs4
import re
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pandas as pd
import requests
import sqlalchemy as db
import geopandas as gpd
from geopy.distance import distance

In [204]:
"""any constants you might need; some have been added for you, and some you need to fill in"""

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
UBER_CSV = ""
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [205]:
"""Make sure the QUERY_DIRECTORY exists"""
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

### Load Taxi Zones

In [206]:
""" This function takes the shapefile and returns an object
    consisting of each zone, locationId and its geomtry coordinates """

def load_taxi_zones(shapefile):
    gdf = gpd.read_file(shapefile)

    taxi_zones = []

    for index, row in gdf.iterrows():
        zone = row.iloc[3]
        locationId = row.iloc[4]
        geometry = row.iloc[6]
        
        row_object = { "zone": zone, "locationId": locationId, "geometry": geometry }
        taxi_zones.append(row_object)
    
    return taxi_zones

In [207]:
""" This function accepts the zone id and the taxi zones
    and matches the zone id with its relevant coordinates """

def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    for i in loaded_taxi_zones:
        if i['locationId'] == zone_loc_id:
            return i['geometry']

In [208]:
""" test - lookup_coords_for_taxi_zone_id() """

zones = [{ "zone": 3, "locationId": 1, "geometry": 5 }, { "zone": 8, "locationId": 7, "geometry": 3 }]
assert lookup_coords_for_taxi_zone_id(1, zones)  == 5

### Calculate distance

In [209]:
""" This function calculate the distance giving the pick up
    point and drop off point and returns a distance integer """

def calculate_distance_with_coords(from_coord, to_coord):
    pickup_latitude, pickup_longitude = from_coord
    dropoff_latitude, dropoff_longitude = to_coord

    coords = [pickup_latitude, dropoff_latitude, pickup_longitude, dropoff_longitude]

    for i in coords:
        if i < -90 or i > 90:
            return -1

    return distance((pickup_latitude, pickup_longitude), (dropoff_latitude, dropoff_longitude)).miles

In [211]:
""" test - calculate_distance_with_coords() """

from_coord = (37.7749, -122.4194)  # San Francisco coordinates
to_coord = (34.0522, -118.2437)  # Los Angeles coordinates
assert round(calculate_distance_with_coords(from_coord, to_coord), 2) == 347.37


from_coord = (105, -122.4194)  # San Francisco coordinates
to_coord = (34.0522, -118.2437)  # Los Angeles coordinates
assert calculate_distance_with_coords(from_coord, to_coord) == -1

AssertionError: 

In [212]:
""" This function adds a new column with the distance between coordinates to the Dataframe.
    The input is a dataframe and the output is the new dataframe """
 
def add_distance_column(dataframe):
    # Apply the calculate_distance_with_coords function to each row of the DataFrame
    distances = dataframe.apply(lambda row: calculate_distance_with_coords(
        (row["pickup_latitude"], row["pickup_longitude"]),
        (row["dropoff_latitude"], row["dropoff_longitude"])
    ), axis=1)
    
    # Add the distances as a new column to the DataFrame
    dataframe["distance"] = distances
    
    return dataframe

In [213]:
""" test - add_distance_column() """
 # TODO

' test - add_distance_column() '

### Process Taxi Data

In [132]:
""" This function downloads all the relevant files from the taxi webpage
    and places it into our local directory """

def download_files(month, year):
    formatted_month = f"{month:02d}"
    current_dir = os.getcwd()
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{formatted_month}.parquet"

    response = requests.get(url, stream=True)
    with open(f"{current_dir}\yellow_taxi_{year}_{formatted_month}.parquet", "wb") as f:
        for chunk in response.iter_content(chunk_size=1024): 
            if chunk:
                f.write(chunk)

years = list(range(2009, 2016))
months = list(range(1, 13))

for year in years:
    if year < 2015:
        for month in months:
            download_files(month, year)
    else:
        for month in range(1, 7):
            download_files(month, year)

In [214]:
"""FOR MAC
def download_files(month, year):
    formatted_month = f"{month:02d}"
    current_dir = os.getcwd()
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{formatted_month}.parquet"

    response = requests.get(url, stream=True)
    with open(f"yellow_taxi_{year}_{formatted_month}.parquet", "wb") as f:
        for chunk in response.iter_content(chunk_size=1024): 
            if chunk:
                f.write(chunk)

years = list(range(2009, 2016))
months = list(range(1, 13))

for year in years:
    if year < 2015:
        for month in months:
            download_files(month, year)
    else:
        for month in range(1, 7):
            download_files(month, year)
"""

'FOR MAC\ndef download_files(month, year):\n    formatted_month = f"{month:02d}"\n    current_dir = os.getcwd()\n    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{formatted_month}.parquet"\n\n    response = requests.get(url, stream=True)\n    with open(f"yellow_taxi_{year}_{formatted_month}.parquet", "wb") as f:\n        for chunk in response.iter_content(chunk_size=1024): \n            if chunk:\n                f.write(chunk)\n\nyears = list(range(2009, 2016))\nmonths = list(range(1, 13))\n\nfor year in years:\n    if year < 2015:\n        for month in months:\n            download_files(month, year)\n    else:\n        for month in range(1, 7):\n            download_files(month, year)\n'

In [215]:
""" This function gets all the URLs from the taxi web page and returns
    it as an array of strings """

def get_all_urls_from_taxi_page(taxi_page):
    try:
        response = requests.get(taxi_page)

        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        urls = []

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is not None:
                urls.append(href)

        return urls
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [216]:
"""" test for get_all_urls_from_taxi_page() """

assert len(get_all_urls_from_taxi_page(TAXI_URL)) == 483

In [217]:
""" This function goes through all the URLs on the taxi web page
    and returns only the ones ending in .parquet since we want
    parquet files. """

def filter_taxi_parquet_urls(all_urls):
    parquet_urls = []

    if all_urls is not None:
        for i in all_urls:
            str = re.search('.parquet$', i)
            if(str != None):
                parquet_urls.append(i)
    return parquet_urls

In [218]:
""" test for filter_taxi_parquet_urls() """

allUrlsData = get_all_urls_from_taxi_page(TAXI_URL)
assert len(filter_taxi_parquet_urls(allUrlsData)) == 428

In [219]:
""" This function takes a URL and extracts the month from it
    The example url can look like:
    https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet """

def get_and_clean_month(url):
    str = url[len(url) - 10:]
    [month, fileType] = str.split('.')
    return month

In [220]:
""" test for get_and_clean_month function """

url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet'
assert get_and_clean_month(url) == '06'

In [221]:
""" This function takes a URL and extracts the year from it
    The example url can look like:
    https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet """

def get_and_clean_year(url):
    str = url[len(url) - 15:]
    [year, other] = str.split('-')
    return year

In [222]:
""" test for get_and_clean_year function """

url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-06.parquet'
assert get_and_clean_year(url) == '2022'

In [223]:
""" This fucntion adds a new column with the distance between coordinates to the taxi Dataframe.
    The input is a dataframe and the output is the new modified dataframe """
 
def add_distance_column_taxi(dataframe):
    # Apply the calculate_distance_with_coords function to each row of the DataFrame
    distances = dataframe.apply(lambda row: calculate_distance_with_coords(
        (row["Start_Lat"], row["Start_Lon"]),
        (row["End_Lat"], row["End_Lon"])
    ), axis=1)
    
    # Add the distances as a new column to the DataFrame
    dataframe["distance"] = distances
    
    return dataframe["distance"]

In [233]:
""" This function collects all the parquet urls from the taxi website.
    It will then get the actual data from the parquet files and do various forms of cleaning.
    For example, we will remove unnecessary columns and invalid data and will return
    one gigantic dataframe with data from every month """

def convert_taxi_data(parquet_urls):
    all_taxi_dataframes = []
    
    for parquet_url in parquet_urls:
        month = get_and_clean_month(parquet_url)
        year = get_and_clean_year(parquet_url)

        cwd = os.getcwd()
        files = os.listdir(cwd)

        fileName = f"yellow_taxi_{year}_{month}.parquet"
        if fileName in files :

            dataframe = pd.read_parquet(fileName)
            sample_dataframe = dataframe.sample(n=2000)
            #add_distance_column_taxi(sample_dataframe)
            all_taxi_dataframes.append(sample_dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)

    print(len(taxi_data))
    print(len(taxi_data.head()))

    return taxi_data


In [234]:
all_urls = get_all_urls_from_taxi_page(TAXI_URL)
all_parquet_urls = filter_taxi_parquet_urls(all_urls)
taxi_data = convert_taxi_data(all_parquet_urls)

267500


In [226]:
""" This function gets all the urls from the taxi page, specifically the parquet urls,
    gets and cleans it, and returns the valid data """

def get_taxi_data():
    all_urls = get_all_urls_from_taxi_page(TAXI_URL)
    all_parquet_urls = filter_taxi_parquet_urls(all_urls)
    taxi_data = convert_taxi_data(all_parquet_urls)

    return taxi_data


In [235]:
taxi_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,Tip_Amt,Tolls_Amt,Total_Amt
811716,2.0,2015-01-03 08:34:22,2015-01-03 08:40:27,5.0,1.8,1.0,N,142.0,43.0,1,...,,,,,,,,,,
1928642,1.0,2015-01-06 10:42:38,2015-01-06 11:08:03,1.0,1.3,1.0,N,236.0,75.0,2,...,,,,,,,,,,
10980337,2.0,2015-01-28 08:23:39,2015-01-28 08:32:12,1.0,1.44,1.0,N,231.0,249.0,1,...,,,,,,,,,,
3580296,1.0,2015-01-10 01:52:05,2015-01-10 01:55:19,1.0,0.8,1.0,N,48.0,48.0,1,...,,,,,,,,,,
1545519,2.0,2015-01-05 09:29:16,2015-01-05 09:50:43,1.0,3.3,1.0,N,238.0,186.0,2,...,,,,,,,,,,


### Processing Uber Data

In [172]:
"""This function first loads the uber data from the csv file. 
We then filter based on coordinates to make sure the rides are within the coordinates we want.
We also remove trips with 0 passangers and no fares. We further remove trips with passangers above 6 as that 
is uber policy. Lastly we remove trips with no distace between dropoff and pickup. The output is the
cleaned dataframe"""

def load_and_clean_uber_data(csv_file):

    # Reading in file into a data frame 
    uber_data = pd.read_csv(csv_file)

    # Filter data based on pickup and dropoff latitude/longitude(40.560445, -74.242330) and (40.908524, -73.717047).

    uber_data = uber_data[(uber_data["pickup_latitude"] >= 40.560445) & 
                      (uber_data["pickup_longitude"] >= -74.242330) & 
                      (uber_data["pickup_latitude"] <= 40.908524) & 
                      (uber_data["pickup_longitude"] <= -73.717047) &
                      (uber_data["dropoff_latitude"] >= 40.560445) & 
                      (uber_data["dropoff_longitude"] >= -74.242330) & 
                      (uber_data["dropoff_latitude"] <= 40.908524) & 
                      (uber_data["dropoff_longitude"] <= -73.717047)]
    
    # Checking if there are any null values for pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude
    null_drop_lat = uber_data[uber_data['dropoff_latitude'].isnull()]
    null_drop_long = uber_data[uber_data['dropoff_longitude'].isnull()]
    null_pick_lat= uber_data[uber_data['pickup_latitude'].isnull()]
    null_pick_long = uber_data[uber_data['pickup_longitude'].isnull()]

    # Return True, if none of the colums have null values 

   # if null_drop_lat.empty & null_drop_long.empty & null_pick_lat.empty & null_pick_long.empty :
        #print(True)
    #else:
       # print(False)

    
    # Removing rows where passamger count is 0 
    uber_data = uber_data[uber_data['passenger_count']!=0]


    # Removing rows with passanger data is abnormally large 
    uber_data = uber_data[uber_data['passenger_count']<=6]

    # Checking datatypes for all columns 
    #print(uber_data.dtypes)

    #Making sure pickup time is a datetime object and normalizing the name 
    uber_data ['pickup_time'] = pd.to_datetime(uber_data ['pickup_datetime'])
 


    return uber_data


In [173]:
""" We use the add distance column fcuntion we had defined before to add a new column with the distance 
of the ride to our uber data. We also drop columns where the distance of the ride is ==0"""

def get_uber_data():
    uber_dataframe = load_and_clean_uber_data("uber_rides_sample.csv")
    add_distance_column(uber_dataframe)
    
    # Removing rows where distance is 0

    uber_dataframe = uber_dataframe.drop(index=uber_dataframe[uber_dataframe['distance'] == 0].index)
    return uber_dataframe


In [174]:
final_uber_data = get_uber_data()
final_uber_data

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_time,distance
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,2015-05-07 19:52:06+00:00,1.044594
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,2009-07-17 20:04:56+00:00,1.525071
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,2009-08-24 21:45:00+00:00,3.131464
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,2009-06-26 08:22:21+00:00,1.032372
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,2014-08-28 17:47:00+00:00,2.786061
...,...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,2012-10-28 10:49:00+00:00,0.069673
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,2014-03-14 01:09:00+00:00,1.167951
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,2009-06-29 00:42:00+00:00,7.995752
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,2015-05-20 14:56:25+00:00,2.197512


In [175]:
#Removing unnecessary columns 
final_uber_data = final_uber_data.drop('Unnamed: 0', axis=1)
final_uber_data = final_uber_data.drop('key', axis=1)

In [176]:
final_uber_data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_time,distance
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,2015-05-07 19:52:06+00:00,1.044594
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,2009-07-17 20:04:56+00:00,1.525071
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,2009-08-24 21:45:00+00:00,3.131464
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,2009-06-26 08:22:21+00:00,1.032372
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,2014-08-28 17:47:00+00:00,2.786061
...,...,...,...,...,...,...,...,...,...
199995,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,2012-10-28 10:49:00+00:00,0.069673
199996,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,2014-03-14 01:09:00+00:00,1.167951
199997,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,2009-06-29 00:42:00+00:00,7.995752
199998,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,2015-05-20 14:56:25+00:00,2.197512


### Processing Weather Data

In [187]:
"""This function takes all the weather files, iterates through them and merges them 
into one dataframe. The output is the combined dataframe"""

def get_all_weather_csvs():
    years = list(range(2009, 2016))

    # Initialize an empty list to store the dataframes
    dataframes = []

    # Iterate over the weather files
    for year in years:
        filepath = f"{year}_weather.csv"
        df = pd.read_csv(filepath)
        dataframes.append(df)

    # Concatenate all the dataframes into a single dataframe
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df



In [188]:
"""This function first loads the uber data from the csv file. 
We then filter based on coordinates to make sure the rides are within the coordinates we want.
We also remove trips with 0 passangers and no fares. We further remove trips with passangers above 6 as that 
is uber policy. Lastly we remove trips with no distace between dropoff and pickup. The output is the
cleaned dataframe"""

def load_and_clean_weather_data():

    df = get_all_weather_csvs()

    df1 = df[['STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'NAME','HourlyPrecipitation','HourlyWindGustSpeed', 'HourlyWindSpeed', 'DailyAverageWindSpeed','DailyPrecipitation']]
    df2 = df1.dropna(subset=['HourlyPrecipitation', 'HourlyWindGustSpeed'])

    #column_types = df2.dtypes

    #print(column_types)

    # we see that the averages for wind speed and precipitation are null for all values so we can drop the columns 

    # We also doing need the hourly wind gust speed as we will be using the hourly wind speed, we can drop that column as well

    df2 = df2.drop(columns=['DailyAverageWindSpeed','DailyPrecipitation', 'HourlyWindGustSpeed','LATITUDE', 'LONGITUDE'])
    df2['DATE'] = pd.to_datetime(df['DATE'])

    df2

    # Removing all rows where Hourly preicipitation has the value "T" as we do not need to measure trace amounts 

    df3 = df2[df2['HourlyPrecipitation'] != "T"]

    df4 = df3.drop(columns=["STATION"])

    df4 = df4.reset_index()

    df4['DATE'] = df4['DATE'].apply(lambda x: x.to_pydatetime())

    df4['DATE'] = pd.to_datetime(df4['DATE'])

    df4['HourlyPrecipitation'] = df4['HourlyPrecipitation'].str.replace(r'(\d+)\s*[sS]$', r'\1', regex=True)
    
    # convert column "A" from object to float
    df4['HourlyPrecipitation'] = df4['HourlyPrecipitation'].astype(float)

    Weather_Data = df4.drop('index', axis=1)

    return  Weather_Data







In [189]:
load_and_clean_weather_data()

  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


Unnamed: 0,DATE,NAME,HourlyPrecipitation,HourlyWindSpeed
0,2009-01-06 20:00:00,"NY CITY CENTRAL PARK, NY US",0.01,10.0
1,2009-01-06 23:38:00,"NY CITY CENTRAL PARK, NY US",0.02,11.0
2,2009-01-07 02:51:00,"NY CITY CENTRAL PARK, NY US",0.09,13.0
3,2009-01-07 03:51:00,"NY CITY CENTRAL PARK, NY US",0.06,15.0
4,2009-01-07 04:51:00,"NY CITY CENTRAL PARK, NY US",0.07,16.0
...,...,...,...,...
7098,2015-12-29 10:51:00,"NY CITY CENTRAL PARK, NY US",0.02,10.0
7099,2015-12-29 11:33:00,"NY CITY CENTRAL PARK, NY US",0.02,8.0
7100,2015-12-29 11:51:00,"NY CITY CENTRAL PARK, NY US",0.02,6.0
7101,2015-12-31 11:51:00,"NY CITY CENTRAL PARK, NY US",0.00,9.0


In [190]:
"""Roll up the data to daily"""
def clean_month_weather_data_daily():

    daily_data = load_and_clean_weather_data()

    daily_data_final = daily_data.groupby([daily_data['DATE'].dt.year, daily_data['DATE'].dt.month, daily_data['DATE'].dt.day]).sum()[['HourlyPrecipitation', "HourlyWindSpeed" ]]

    daily_data_final = daily_data_final.rename_axis(index=['Year', 'Month', 'Day'])


    
    return daily_data_final

In [191]:
def clean_month_weather_data_hourly():

    hourly_data = load_and_clean_weather_data()

    hourly_data_final = hourly_data.groupby([hourly_data['DATE'].dt.year, hourly_data['DATE'].dt.month, hourly_data['DATE'].dt.day, hourly_data['DATE'].dt.hour]).sum()[['HourlyPrecipitation', "HourlyWindSpeed" ]]

    hourly_data_final = hourly_data_final.rename_axis(index=['Year', 'Month', 'Day', 'Hour'])
    
    return hourly_data_final



In [192]:
""" I dont think this is needed as my daily and hourly functions retunr the datframes that we need"""
"""
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs()
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly()
        daily_dataframe = clean_month_weather_data_daily()
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data
    """

'\ndef load_and_clean_weather_data():\n    weather_csv_files = get_all_weather_csvs()\n    \n    hourly_dataframes = []\n    daily_dataframes = []\n        \n    for csv_file in weather_csv_files:\n        hourly_dataframe = clean_month_weather_data_hourly()\n        daily_dataframe = clean_month_weather_data_daily()\n        hourly_dataframes.append(hourly_dataframe)\n        daily_dataframes.append(daily_dataframe)\n        \n    # create two dataframes with hourly & daily data from every month\n    hourly_data = pd.concat(hourly_dataframes)\n    daily_data = pd.concat(daily_dataframes)\n    \n    return hourly_data, daily_data\n    '

In [193]:
#hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [194]:
hourly_weather_data = clean_month_weather_data_hourly()
hourly_weather_data.head()


  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HourlyPrecipitation,HourlyWindSpeed
Year,Month,Day,Hour,Unnamed: 4_level_1,Unnamed: 5_level_1
2009,1,6,20,0.01,10.0
2009,1,6,23,0.02,11.0
2009,1,7,2,0.09,13.0
2009,1,7,3,0.06,15.0
2009,1,7,4,0.07,16.0


In [195]:

hourly_weather_data = hourly_weather_data.reset_index()
hourly_weather_data


Unnamed: 0,Year,Month,Day,Hour,HourlyPrecipitation,HourlyWindSpeed
0,2009,1,6,20,0.01,10.0
1,2009,1,6,23,0.02,11.0
2,2009,1,7,2,0.09,13.0
3,2009,1,7,3,0.06,15.0
4,2009,1,7,4,0.07,16.0
...,...,...,...,...,...,...
5809,2015,12,29,9,0.07,19.0
5810,2015,12,29,10,0.13,29.0
5811,2015,12,29,11,0.04,14.0
5812,2015,12,31,11,0.00,9.0


In [196]:
daily_weather_data = clean_month_weather_data_daily()
daily_weather_data = daily_weather_data.reset_index()

daily_weather_data

  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)
  df = pd.read_csv(filepath)


Unnamed: 0,Year,Month,Day,HourlyPrecipitation,HourlyWindSpeed
0,2009,1,6,0.03,21.0
1,2009,1,7,1.13,224.0
2,2009,1,10,0.06,48.0
3,2009,1,11,0.26,67.0
4,2009,1,17,0.69,7.0
...,...,...,...,...,...
1026,2015,12,26,0.00,76.0
1027,2015,12,27,0.02,58.0
1028,2015,12,28,0.00,70.0
1029,2015,12,29,0.75,167.0


## Part 2: Storing Cleaned Data

In [197]:
engine = db.create_engine(DATABASE_URL)

In [198]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
    CREATE TABLE HOURLY_WEATHER (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        year INTEGER,
        month INTEGER,
        day INTEGER,
        hour INTEGER,
        precipitation REAL,
        wind REAL
);
"""

DAILY_WEATHER_SCHEMA = """
    CREATE TABLE DAILY_WEATHER (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        year INTEGER,
        month INTEGER,
        day INTEGER,
        precipitation REAL,
        wind REAL
    );
"""

TAXI_TRIPS_SCHEMA = """
    CREATE TABLE IF NOT EXISTS TAXI_TRIPS (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        pickup_datetime TEXT,
        pickup_longitude REAL,
        pickup_latitude REAL,
        dropoff_longitude REAL,
        dropoff_latitude REAL,
        fare_amount REAL,
        distance REAL,
        passenger_count INTEGER,
    );
"""

UBER_TRIPS_SCHEMA = """
    CREATE TABLE IF NOT EXISTS UBER_TRIPS (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        pickup_datetime TEXT,
        pickup_longitude REAL,
        pickup_latitude REAL,
        dropoff_longitude REAL,
        dropoff_latitude REAL,
        fare_amount REAL,
        distance REAL,
        passenger_count INTEGER,
    );
"""

In [199]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [200]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [59]:
def write_dataframes_to_table():

    hourly_weather_data.to_sql(name='HOURLY_WEATHER', con=engine, if_exists='replace', index=False)
    daily_weather_data.to_sql(name='DAILY_WEATHER', con=engine, if_exists='replace', index=False)
    final_uber_data.to_sql(name='UBER_TRIPS', con=engine, if_exists='replace', index=False)
    #.to_sql(name='UBER_TRIPS', con=engine, if_exists='replace', index=False)

    
write_dataframes_to_table()

In [60]:
from sqlalchemy import create_engine

# establish a connection to the SQL database
engine = create_engine(DATABASE_URL)

# execute a SELECT query on the HOURLY_WEATHER table
query = "SELECT * FROM UBER_TRIPS LIMIT 10;"
result = engine.execute(query)


for row in result:
    print(row)

(7.5, '2015-05-07 19:52:06 UTC', -73.99981689453125, 40.73835372924805, -73.99951171875, 40.72321701049805, 1, '2015-05-07 19:52:06.000000', 1.0445937861491572)
(7.7, '2009-07-17 20:04:56 UTC', -73.994355, 40.728225, -73.99471, 40.750325, 1, '2009-07-17 20:04:56.000000', 1.5250706123331683)
(12.9, '2009-08-24 21:45:00 UTC', -74.005043, 40.74077, -73.962565, 40.772647, 1, '2009-08-24 21:45:00.000000', 3.1314639281870544)
(5.3, '2009-06-26 08:22:21 UTC', -73.976124, 40.790844, -73.965316, 40.803349, 3, '2009-06-26 08:22:21.000000', 1.0323719006396994)
(16.0, '2014-08-28 17:47:00 UTC', -73.925023, 40.744085, -73.97308199999999, 40.761247, 5, '2014-08-28 17:47:00.000000', 2.7860607099399095)
(24.5, '2014-10-12 07:04:00 UTC', -73.96144699999999, 40.693965000000006, -73.871195, 40.774297, 5, '2014-10-12 07:04:00.000000', 7.291584115691231)
(9.7, '2012-02-17 09:32:00 UTC', -73.975187, 40.745767, -74.00272, 40.743537, 1, '2012-02-17 09:32:00.000000', 1.4531136025314275)
(12.5, '2012-03-29 19:0

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """

"""

In [None]:
engine.execute(QUERY_1).fetchall()

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)

### Visualization 2

In [None]:
# use a more descriptive name for your function
def plot_visual_2(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_2():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_2()
plot_visual_2(some_dataframe)

### Visualization 3

In [None]:
# use a more descriptive name for your function
def plot_visual_3(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_3():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_3()
plot_visual_3(some_dataframe)

### Visualization 4

In [None]:
# use a more descriptive name for your function
def plot_visual_4(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_4():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_4()
plot_visual_4(some_dataframe)

### Visualization 5

In [None]:
# use a more descriptive name for your function
def plot_visual_5(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_5():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_5()
plot_visual_5(some_dataframe)

### Visualization 6

In [None]:
# use a more descriptive name for your function
def plot_visual_6(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_6():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_6()
plot_visual_6(some_dataframe)