# ETL

1. Connect to the database (postgis)
2. Create tables for each month's data ('taxi_jan', 'taxi_apr', 'taxi_jul')
3. Read the original text data and insert the values into the corresponding tables

In [1]:
# Import libraries
import psycopg2
import glob

In [2]:
# Collect the path of every NY taxi data file, sorted ascending so the
# iteration order is stable between runs
filenames = glob.glob('../data/NY_taxi_data/*')
filenames.sort()
print(len(filenames))
filenames[:3]

77


['../data/NY_taxi_data/yellow_tripdata_2015-01_00',
 '../data/NY_taxi_data/yellow_tripdata_2015-01_01',
 '../data/NY_taxi_data/yellow_tripdata_2015-01_02']

In [3]:
# Split the file list by month. The two characters at [-5:-3] of each path
# are the month part of the name (e.g. '..._2015-01_00' -> '01').
# Anything that is neither '01' nor '04' is treated as July data.
Jan = [path for path in filenames if path[-5:-3] == '01']
Apr = [path for path in filenames if path[-5:-3] == '04']
Jul = [path for path in filenames if path[-5:-3] not in ('01', '04')]

# Number of data files per month (Jan, Apr, Jul)
for month_files in (Jan, Apr, Jul):
    print(len(month_files))

26
27
24


In [4]:
# One target table per month of taxi data
tablenames = [
    'taxi_jan',
    'taxi_apr',
    'taxi_jul',
]

In [5]:
def connect_to_db(dbname='carto', user='carto', password='carto',
                  host='postgis', port='5432'):
    """Open and return a new psycopg2 connection to the database.

    Called with no arguments, this connects with the same settings that
    were previously hard-coded (database 'carto' on host 'postgis').

    Args:
        dbname: Name of the database to connect to.
        user: Database user name.
        password: Password for ``user``.
        host: Host name of the database server.
        port: Port of the database server.

    Returns:
        An open psycopg2 connection. The caller is responsible for
        closing it.

    Raises:
        psycopg2.DatabaseError: If the connection cannot be established.
    """
    try:
        return psycopg2.connect(dbname=dbname, user=user, password=password,
                                host=host, port=port)
    except psycopg2.DatabaseError:
        print ("I am unable to connect the database")
        # Re-raise so the caller sees the real connection error; there is
        # no connection object to return on this path.
        raise

In [6]:
# Create a table called 'taxi_x' inside the postgis database
def create_table(tablename):
    """(Re)create an empty NY taxi trip table named ``tablename``.

    Any existing table with the same name is dropped first. On success the
    change is committed and '<tablename> created' is printed; on a database
    error a failure message is printed and nothing is committed.

    Args:
        tablename: Name of the table to create. It is interpolated directly
            into the SQL text, so it must come from a trusted source.
    """
    # connect to the db
    conn = connect_to_db()

    # Prepare a query to create table for NY taxi data
    q_create_table = f"""
                    drop table if exists {tablename};
                    create table {tablename}
                    (
                        vendorID int,
                        tpep_pickup_datetime timestamp,
                        tpep_dropoff_datetime timestamp,
                        passenger_count int,
                        trip_distance numeric,
                        pickup_longitude numeric,
                        pickup_latitude numeric,
                        RateCodeID int,
                        store_and_fwd_flag char(1),
                        dropoff_longitude numeric,
                        dropoff_latitude numeric,
                        payment_type int,
                        fare_amount numeric,
                        extra numeric,
                        mta_tax numeric,
                        tip_amount numeric,
                        tolls_amount numeric,
                        improvement_surcharge numeric,
                        total_amount numeric
                    )
                    """

    try:
        # The cursor is closed by the `with` block even if execute() fails,
        # and no cleanup code touches a cursor that was never created.
        with conn.cursor() as cur:
            cur.execute(q_create_table)  # execute the query
        conn.commit()
        print(f'{tablename} created')

    except psycopg2.DatabaseError as err:
        print (f"Failed to create the table: {err}")

    # Always close the connection with the postgis
    finally:
        conn.close()

In [7]:
# Build one empty table per month listed in `tablenames`
for name in tablenames:
    create_table(name)

taxi_jan created
taxi_apr created
taxi_jul created


In [8]:
def fill_table_with_data(filename, tablename):
    """Insert the rows of one comma-separated taxi data file into a table.

    The file is read line by line. A line is skipped when it is blank, when
    its first field is purely alphabetic (treated as a header), or when its
    pickup or dropoff longitude/latitude pair is ('0', '0'). Every other
    line is inserted as one row. All inserts are committed together at the
    end, after which a summary line is printed.

    Args:
        filename: Path of the text file to load.
        tablename: Name of the destination table. It is interpolated
            directly into the SQL text, so it must come from a trusted
            source.
    """
    # connect to the db
    conn = connect_to_db()

    # A query to insert data (row by row) to the table taxi
    q_insert_data = f'''insert into {tablename}
                    (
                                vendorID,
                                tpep_pickup_datetime,
                                tpep_dropoff_datetime,
                                passenger_count,
                                trip_distance,
                                pickup_longitude,
                                pickup_latitude,
                                RateCodeID,
                                store_and_fwd_flag,
                                dropoff_longitude,
                                dropoff_latitude,
                                payment_type,
                                fare_amount,
                                extra,
                                mta_tax,
                                tip_amount,
                                tolls_amount,
                                improvement_surcharge,
                                total_amount
                        )
                        values (%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s)'''

    inserted = 0  # rows actually written to the table
    seen = 0      # non-blank lines read from the file (header included)

    try:
        # Both the cursor and the file are released by the `with` block,
        # whether or not an insert fails.
        with conn.cursor() as cur, open(filename, 'r') as file:
            # Stream the file instead of loading every row into memory
            for line in file:
                line = line.strip()
                # A raw line still carries its newline and is always truthy,
                # so test the stripped text to really drop blank lines.
                if not line:
                    continue
                seen += 1

                # a row: a tuple of string values, one per column
                row = tuple(line.split(','))

                if row[0].isalpha():  # Skip header if exists
                    continue
                if row[5:7] == ('0', '0') or row[9:11] == ('0', '0'):  # Skip if no geo data
                    continue

                cur.execute(q_insert_data, row)
                inserted += 1

        conn.commit()  # Commit the changes to the db
        print(f'{inserted} rows inserted out of {seen}')

    # Print error message if query fails; nothing is committed in that case
    except psycopg2.DatabaseError as err:
        print (f"Failed to copy data to the table: {err}")

    finally:
        conn.close()

In [9]:
# Load every 2015-Jan data file into 'taxi_jan', one file at a time
for jan_file in Jan:
    fill_table_with_data(jan_file, 'taxi_jan')

489619 rows inserted out of 500000
489654 rows inserted out of 500000
489574 rows inserted out of 500000
489456 rows inserted out of 500000
489867 rows inserted out of 500000
489604 rows inserted out of 500000
489581 rows inserted out of 500000
489417 rows inserted out of 500000
489577 rows inserted out of 500000
489493 rows inserted out of 500000
489664 rows inserted out of 500000
489508 rows inserted out of 500000
489887 rows inserted out of 500000
489905 rows inserted out of 500000
489556 rows inserted out of 500000
489652 rows inserted out of 500000
489618 rows inserted out of 500000
489624 rows inserted out of 500000
489529 rows inserted out of 500000
489676 rows inserted out of 500000
489647 rows inserted out of 500000
489721 rows inserted out of 500000
489660 rows inserted out of 500000
489608 rows inserted out of 500000
489424 rows inserted out of 500000
243813 rows inserted out of 248987


## Note:
Failed to insert data from a file (Jan[17]) because many rows in this file have wrong pickup and/or dropoff coordinates: (0,0)

Refactored the fill_table_with_data function to fix this issue.

In [11]:
# Load every 2015-Apr data file into 'taxi_apr', one file at a time
for apr_file in Apr:
    fill_table_with_data(apr_file, 'taxi_apr')

490624 rows inserted out of 500000
490729 rows inserted out of 500000
490839 rows inserted out of 500000
490676 rows inserted out of 500000
490739 rows inserted out of 500000
490844 rows inserted out of 500000
490963 rows inserted out of 500000
490871 rows inserted out of 500000
490740 rows inserted out of 500000
490836 rows inserted out of 500000
490870 rows inserted out of 500000
490717 rows inserted out of 500000
490856 rows inserted out of 500000
490639 rows inserted out of 500000
490806 rows inserted out of 500000
490667 rows inserted out of 500000
490719 rows inserted out of 500000
490925 rows inserted out of 500000
490799 rows inserted out of 500000
490681 rows inserted out of 500000
490700 rows inserted out of 500000
490798 rows inserted out of 500000
490891 rows inserted out of 500000
490781 rows inserted out of 500000
490679 rows inserted out of 500000
490704 rows inserted out of 500000
70505 rows inserted out of 71790


In [12]:
# Load every 2015-Jul data file into 'taxi_jul', one file at a time
for jul_file in Jul:
    fill_table_with_data(jul_file, 'taxi_jul')

492271 rows inserted out of 500000
492293 rows inserted out of 500000
492249 rows inserted out of 500000
492168 rows inserted out of 500000
492148 rows inserted out of 500000
492519 rows inserted out of 500000
492174 rows inserted out of 500000
491991 rows inserted out of 500000
492203 rows inserted out of 500000
492355 rows inserted out of 500000
492647 rows inserted out of 500000
492037 rows inserted out of 500000
492405 rows inserted out of 500000
492104 rows inserted out of 500000
492033 rows inserted out of 500000
492356 rows inserted out of 500000
492418 rows inserted out of 500000
492063 rows inserted out of 500000
492692 rows inserted out of 500000
492394 rows inserted out of 500000
492548 rows inserted out of 500000
492169 rows inserted out of 500000
490694 rows inserted out of 500000
61091 rows inserted out of 62784


## End of basic ETL

-----------------------------------------------------------

## Next step: Data transformation to simplify EDA

* Create a query to generate new geometry columns using longitudes & latitudes
