In [1]:
import psycopg2
import glob

In [2]:
# Gather every NY taxi data file; sorting by name keeps the chunks in ascending order
filenames = glob.glob('../data/NY_taxi_data/*')
filenames.sort()
print(len(filenames))
filenames[:3]

77


['../data/NY_taxi_data/yellow_tripdata_2015-01_00',
 '../data/NY_taxi_data/yellow_tripdata_2015-01_01',
 '../data/NY_taxi_data/yellow_tripdata_2015-01_02']

In [3]:
# Split the file list by month. The two-digit month sits at [-5:-3] of each
# name (e.g. '..._2015-01_00' -> '01'). Anything that is neither January
# ('01') nor April ('04') ends up in the July list.
Jan = [name for name in filenames if name[-5:-3] == '01']
Apr = [name for name in filenames if name[-5:-3] == '04']
Jul = [name for name in filenames if name[-5:-3] not in ('01', '04')]

# Number of data files per month, in the order Jan, Apr, Jul
print(len(Jan))
print(len(Apr))
print(len(Jul))

26
27
24


In [4]:
# Destination tables, one per month of taxi data
tablenames = [
    'taxi_jan',
    'taxi_apr',
    'taxi_jul',
]

In [5]:
def connect_to_db():
    """Open a new connection to the 'carto' database.

    Returns:
        An open psycopg2 connection. The caller is responsible for closing it.

    Raises:
        psycopg2.DatabaseError: if the connection cannot be established. A
            short message is printed first, then the original error is
            re-raised so the real cause stays visible.
    """
    try:
        # NOTE(review): the connection settings (including the password) are
        # hardcoded here; consider moving them to configuration.
        return psycopg2.connect(dbname='carto', user='carto', password='carto', host='postgis', port='5432')
    except psycopg2.DatabaseError:
        print ("I am unable to connect the database")
        # Re-raise instead of falling through: there is no connection object
        # to return, and continuing would fail with an unrelated
        # UnboundLocalError that hides the actual connection problem.
        raise

In [6]:
# Create a table called 'taxi_x' inside the postgis database
def create_table(tablename):
    """Drop (if present) and re-create the taxi table named `tablename`.

    Opens its own connection via connect_to_db(), runs the drop/create
    statements, commits, and always closes the cursor and the connection.

    Args:
        tablename: name of the table to (re)create. It is interpolated
            directly into the SQL text, so only pass trusted names.

    Database errors raised while creating the table are reported with print()
    rather than propagated. Errors from connect_to_db() are not caught here.
    """
    # connect to the db
    conn = connect_to_db()

    # Prepare a query to create table for NY taxi data
    q_create_table = f"""
                    drop table if exists {tablename};
                    create table {tablename}
                    (
                        vendorID int,
                        tpep_pickup_datetime timestamp,
                        tpep_dropoff_datetime timestamp,
                        passenger_count int,
                        trip_distance numeric,
                        pickup_longitude numeric,
                        pickup_latitude numeric,
                        RateCodeID int,
                        store_and_fwd_flag char(1),
                        dropoff_longitude numeric,
                        dropoff_latitude numeric,
                        payment_type int,
                        fare_amount numeric,
                        extra numeric,
                        mta_tax numeric,
                        tip_amount numeric,
                        tolls_amount numeric,
                        improvement_surcharge numeric,
                        total_amount numeric
                    )
                    """

    # Bound before the try so the finally block can tell whether a cursor was
    # ever created; otherwise a failing conn.cursor() would surface as an
    # UnboundLocalError from the cleanup code.
    cur = None
    try:
        cur = conn.cursor()  # initiate cursor (communication with db)
        cur.execute(q_create_table)  # execute the query
        conn.commit()
        print(f'{tablename} created')

    except psycopg2.DatabaseError as err:
        # Include the database's own message so the failure can be diagnosed
        print (f"Failed to create the table: {err}")

    # Close the communication & connection with the postgis
    finally:
        if cur is not None:
            cur.close()
        conn.close ()

In [7]:
# (Re)create one table per month listed in `tablenames`
for name in tablenames:
    create_table(name)

taxi_jan created
taxi_apr created
taxi_jul created


In [8]:
def fill_table_with_data(filename, tablename):
    """Insert the rows of one comma-separated taxi data file into `tablename`.

    Opens its own connection via connect_to_db(), inserts the file row by row
    inside a single transaction, commits once at the end, and always closes
    the cursor and the connection.

    Args:
        filename: path of the text file to load; each non-blank line is split
            on ',' and is expected to yield the 19 values listed in the
            insert statement.
        tablename: target table. It is interpolated directly into the SQL
            text, so only pass trusted names.

    A line whose first field is purely alphabetic is treated as a header and
    skipped. Blank lines are ignored. Database errors are reported with
    print() rather than propagated; since the commit happens only after the
    last row, a failure means nothing from this file is committed. Errors
    from connect_to_db() and from open() are not caught here.
    """
    # connect to the db
    conn = connect_to_db()

    # A query to insert data (row by row) to the table taxi
    q_insert_data = f'''insert into {tablename}
                    (
                                vendorID,
                                tpep_pickup_datetime,
                                tpep_dropoff_datetime,
                                passenger_count,
                                trip_distance,
                                pickup_longitude,
                                pickup_latitude,
                                RateCodeID,
                                store_and_fwd_flag,
                                dropoff_longitude,
                                dropoff_latitude,
                                payment_type,
                                fare_amount,
                                extra,
                                mta_tax,
                                tip_amount,
                                tolls_amount,
                                improvement_surcharge,
                                total_amount
                        )
                        values (%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s)'''

    # Bound before the try so the finally block can tell whether a cursor was
    # ever created.
    cur = None
    try:
        cur = conn.cursor()
        total = 0  # count how many rows are inserted
        seen = 0   # count how many non-blank lines the file contains
        with open(filename, 'r') as file:
            # Stream the file instead of building a list of every row first
            for line in file:
                line = line.strip()
                if not line:
                    # A raw line is never falsy (it still carries its
                    # newline), so blank lines must be detected after
                    # stripping; otherwise they reach the insert as a
                    # one-element row and abort the whole file.
                    continue
                seen += 1

                # Result: a tuple of string values, one per column
                row = tuple(line.split(','))
                if row[0].isalpha():  # Skip header if exists
                    continue

                cur.execute(q_insert_data, row)
                total += 1

        conn.commit() # Commit the changes to the db
        print(f'{total} rows inserted out of {seen}')

    # Print error message if query fails
    except psycopg2.DatabaseError as err:
        # Name the file and include the database's message so a failed chunk
        # can be identified and re-loaded
        print (f"Failed to copy data to the table ({filename}): {err}")

    finally:
        if cur is not None:
            cur.close()
        conn.close ()

In [9]:
# Load the January files first, to check the loader on a subset of the data
for jan_file in Jan:
    fill_table_with_data(jan_file, 'taxi_jan')

499999 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
Failed to copy data to the table
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
500000 rows inserted out of 500000
248987 rows inserted out of 248987


In [10]:
# Load every April file into its table
for apr_file in Apr:
    fill_table_with_data(apr_file, 'taxi_apr')

In [None]:
# Build point geometries from the pickup / dropoff coordinates.
# ST_MakePoint takes (x, y), i.e. (longitude, latitude).
# The query reads from 'taxi_jan': this notebook creates taxi_jan / taxi_apr /
# taxi_jul and no table named 'taxi'. Change the table name to query another month.
q_make_points = '''
                select
                ST_MakePoint(pickup_longitude, pickup_latitude) as pickup_point,
                ST_MakePoint(dropoff_longitude, dropoff_latitude) as dropoff_point
                from taxi_jan
                '''

# The helper functions above close their own cursors, so no cursor exists at
# notebook level; open a connection and cursor for this query.
conn = connect_to_db()
cur = conn.cursor()
cur.execute(q_make_points)
# The cursor is left open so the results can be fetched in a following step;
# call cur.close() and conn.close() once done.