# This is the Final build for the Project

### There are 3 main stages in this project, plus one auxiliary stage that is run to set up the project.
First, `final_project.ipynb` is run to populate the PostgreSQL database and do some data cleaning. 
After that, this file is run.  

This file contains 3 phases:
Data Generation -> Graph Building -> Visualization


## Stage 1: Data Generation


<div class="alert alert-block alert-danger">
<b>Danger:</b> Run Stage 0 before starting this
</div>

### Input: 
* Start time ``(2019-01-01 00:00:00)``
* End time ``(2019-01-01 23:59:59)``
* Poolsize ``(300, 420, 600)``
    
### Output: 
* CSV / Pandas DF that contains data for the next cycle
    * The dataframe will contain these fields: 
         1. id
         2. tpep_pickup_datetime
         3. tpep_dropoff_datetime
         4. passenger_count
         5. trip_distance - acquired from OSRM
         6. PULocationID
         7. DOLocationID


In [1]:
import os
from datetime import datetime, timedelta
from itertools import combinations

import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd 
import psycopg2
from sqlalchemy import create_engine

plt.style.use('ggplot')




# Connect to DB
# The connection URL can be overridden with the NYC_TAXI_DB_URL environment variable so that
# credentials do not have to be edited in (or committed with) this file. When the variable
# is not set, the original local development URL is used unchanged.
conn_string = os.environ.get(
    "NYC_TAXI_DB_URL",
    "postgresql://nycrideshare:nycrideshare@127.0.0.1:5432/nyc_taxi",
)
nyc_database = create_engine(conn_string)

In [2]:
# Constants

# Interzonal distance table; elsewhere in this file it is indexed positionally as
# [zone id - 1, zone id - 1].
interzonal_dist = pd.read_csv("./data/interzonal.csv")

# Maps a chunk-size key to the interval text that is interpolated into the SQL query
interval_dict = dict([
    ("1hour", "1 HOURS"),
    ("1day", "1 DAYS"),
    ("1month", "1 MONTHS"),
])

# Pool window length in seconds (it is divided by 60 to obtain minutes when iterating)
pool_size = 600
day_in_seconds = 24 * 60 * 60
# Upper bound on the combined passenger count of two merged rides
max_passenger_count = 3

# The start time representing the start of the analysis
start_time = datetime(2019, 1, 1)
# Change the end time as required
end_time = start_time + timedelta(hours=1)

# Selects which location column is compared against the fixed zone:
# "pickup" -> the pickup column, "drop" -> the dropoff column
# NOTE(review): "lgd" presumably refers to the airport zone used as the filter - confirm
lgd_flag = dict(pickup='PULocationID', drop='DOLocationID')

In [3]:
def generate_data(start_time, data_size_duration):
    '''
    Fetch one chunk of trips from the database and attach each trip's interzonal distance.

    [start_time]: datetime object - start of the window passed to the database function
    [data_size_duration]: one of the keys of interval_dict (e.g. "1hour")

    Returns a DataFrame indexed by "id" with an extra "Distance" column.
    Raises KeyError when data_size_duration is not a key of interval_dict.
    '''
    query = \
        f"""select 
        id, 
        tpep_pickup_datetime, 
        tpep_dropoff_datetime, 
        passenger_count, 
        "PULocationID", 
        "DOLocationID"
        from nyc_taxi_schema.get_cust_between_timestamps_lgd('{start_time.strftime("%Y-%m-%d %H:%M:%S")}', '{interval_dict[data_size_duration]}');"""

    # Get the dataframe
    df_temp = pd.read_sql_query(query, nyc_database)

    # Add the distance to all the rows with a single positional lookup into the distance table.
    # Zone ids are 1-based, the table is 0-based. Unlike a row-wise apply, this also works
    # when the query returns no rows (apply(axis=1) on an empty frame yields a DataFrame,
    # which cannot be assigned to a single column).
    pickup_idx = df_temp["PULocationID"].to_numpy(dtype=int) - 1
    dropoff_idx = df_temp["DOLocationID"].to_numpy(dtype=int) - 1
    df_temp["Distance"] = interzonal_dist.to_numpy()[pickup_idx, dropoff_idx]

    return df_temp.set_index("id", drop=True)


In [4]:
# Testing for an hour of data
# "1hour" is a key of interval_dict. The resulting frame is bound to the module-level name
# df_generated, which date_iterator reads when filtering.
data_size_duration = "1hour" # This is to get the initial dataframe
df_generated = generate_data(start_time, data_size_duration)

In [5]:

def date_iterator(ts_start, ts_end, delta_in_minutes, flag, zone_id=138):
    '''
    Generator that yields, for each consecutive time window, the rows of df_generated whose
    pickup time falls in that window and whose selected location column equals zone_id.

    [ts_start]: datetime - start timestamp (inclusive)
    [ts_end]: datetime - end timestamp (exclusive)
    [delta_in_minutes]: int - The value specifies the timedelta for the poolsize; must be > 0
    [flag]: string: its value is either "pickup" or "drop" (a key of lgd_flag)
    [zone_id]: int - zone the selected location column must match (defaults to 138)

    Raises ValueError if delta_in_minutes is not positive, KeyError if flag is unknown.
    '''
    # A zero or negative step would never advance `current` past ts_end.
    if delta_in_minutes <= 0:
        raise ValueError("delta_in_minutes must be a positive number of minutes")

    location_column = lgd_flag[flag]
    delta = timedelta(minutes=delta_in_minutes)
    current = ts_start
    while current < ts_end:
        # Clip the last window so that no rows at or after ts_end are returned.
        window_end = min(current + delta, ts_end)
        pickup_times = df_generated['tpep_pickup_datetime']
        yield df_generated[
            (pickup_times >= current) &
            (pickup_times < window_end) &
            (df_generated[location_column] == zone_id)]

        current += delta
        

## Stage 2: Graph Construction

This stage is responsible for constructing graphs with NetworkX to model the relationships between passengers. 
The connected edges represent the rides that are merged. 

### Input Parameters: 
* Poolsize
* Weight calculating functions as arguments
 

In [8]:
# All the functions that calculate the weight

def check_passenger_count(pool, indexA, indexB, max_passenger_count):
    '''
    Tell whether two rides together stay within the passenger limit.

    [pool]: DataFrame with a "passenger_count" column
    [indexA], [indexB]: index labels of the two rides in pool
    [max_passenger_count]: largest combined passenger count that is allowed

    Returns a truthy value when the combined count does not exceed the limit.
    '''
    passengers_a = pool.loc[indexA, :]["passenger_count"]
    passengers_b = pool.loc[indexB, :]["passenger_count"]
    combined = passengers_a + passengers_b
    return combined <= max_passenger_count

def calc_distance_weight(row1, row2, flag, dist_matrix=None):
    '''
    Return the fraction of the two solo distances that is saved by merging the rides.

    [row1], [row2]: rows (or mappings) providing "Distance" and the compared location column
    [flag]: string - "pickup" compares the riders' "DOLocationID"; any other value compares
            their "PULocationID"
    [dist_matrix]: optional table supporting .iloc[i, j]; defaults to the module-level
                   interzonal_dist

    Returns (Da + Db - Dmin) / (Da + Db), or 0.0 when Da + Db is zero.
    '''
    if dist_matrix is None:
        dist_matrix = interzonal_dist

    Da = row1["Distance"]
    Db = row2["Distance"]
    column_name = "DOLocationID" if flag=="pickup" else "PULocationID"

    # Zone ids are 1-based, the table is indexed from 0.
    zone_a = row1[column_name] - 1
    zone_b = row2[column_name] - 1

    # Leg between the two riders' zones in each direction. The reverse leg must swap the
    # indices, otherwise both orderings use the same cell of an asymmetric table.
    Dab = dist_matrix.iloc[zone_a, zone_b]
    Dba = dist_matrix.iloc[zone_b, zone_a]

    total = Da + Db
    # Two zero-length solo trips leave nothing to save; avoid dividing by zero.
    if total == 0:
        return 0.0

    # NOTE(review): the same pairing (Da with Dab, Db with Dba) is used for both flags;
    # confirm that this is the intended route order for the "drop" case.
    Dmin = min(Da + Dab, Db + Dba)
    savings = total - Dmin
    return savings / total
    
def calc_time_weight(row1, row2, pool_size):
    '''
    Score how close two pickups are in time, relative to the pool window.

    [row1], [row2]: rows (or mappings) providing "tpep_pickup_datetime"
    [pool_size]: length of the pool window in seconds

    Returns (pool_size - gap) / pool_size where gap is the absolute pickup time difference
    in seconds: 1.0 for identical pickup times, lower the further apart they are.
    '''
    Ta = row1["tpep_pickup_datetime"]
    Tb = row2["tpep_pickup_datetime"]
    # total_seconds() keeps the days component; `.seconds` alone would wrap every 24 hours
    # and make pickups a day apart look close together.
    Tab = abs(Tb - Ta).total_seconds()
    return (pool_size - Tab) / pool_size

In [9]:
def calc_edge_weight(pool, indexA, indexB, distance_fn, time_fn, flag):
    '''
    Combine the distance score and the time score of two rides into one edge weight.

    [pool]: DataFrame holding the rides of the current window
    [indexA], [indexB]: index labels of the two rides in pool
    [distance_fn]: callable(row1, row2, flag) returning the distance score
    [time_fn]: callable(row1, row2, pool_size) returning the time score
    [flag]: string forwarded to distance_fn

    Returns the sum of both scores. The module-level pool_size is passed to time_fn.
    '''
    ride_a, ride_b = pool.loc[indexA, :], pool.loc[indexB, :]
    distance_score = distance_fn(ride_a, ride_b, flag)
    time_score = time_fn(ride_a, ride_b, pool_size)
    return distance_score + time_score

In [46]:
# For every pool window: build a graph whose nodes are rides and whose weighted edges join
# rides that may share a vehicle, then pick the merged pairs with a maximum-weight matching.
for i, df_filtered in enumerate(date_iterator(start_time, end_time, pool_size // 60, "pickup")):

    G = nx.Graph()
    index_list = df_filtered.index.tolist()
    G.add_nodes_from(index_list)
    for indexA, indexB in combinations(index_list, 2):
        # Rides whose combined passenger count exceeds the limit get no edge.
        if not check_passenger_count(df_filtered, indexA, indexB, max_passenger_count):
            continue
        G.add_edge(
            indexA,
            indexB,
            weight=calc_edge_weight(
                df_filtered, indexA, indexB, calc_distance_weight, calc_time_weight, "pickup"
            ),
        )

    # Set of (ride, ride) tuples chosen by the matching
    edge_set = nx.algorithms.matching.max_weight_matching(G)
    # Despite its name, this holds the set of all ride ids in the graph
    no_of_nodes = set(G.nodes)

    # Every ride id that appears in a matched pair. A comprehension keeps the loop variable
    # local, so the window counter `i` is not overwritten.
    pairs = {node for pair in edge_set for node in pair}

    # Rides left without a partner (previously read from an undefined name `b`)
    missing_val = no_of_nodes.difference(pairs)
    # set.union returns a new set, so the result has to be kept; unmatched rides are added
    # as bare ids next to the matched (ride, ride) tuples.
    edge_set = edge_set.union(missing_val)

    show_viz(edge_set, df_filtered)

## Stage 3: Visualization

This stage is responsible for gathering data from Stage 2 for visualization.

The idea for this phase is that Stage 2 calls this method at each iteration. Parameters are TBD. 
When this function is called, the merged data and the individual data are collated and stored as a DF/file, which can be used later to build graphs. 

### Input Parameters: 
* TBD
* TBD

In [43]:
def show_viz(edge_set, df_filtered):
    '''
    Placeholder for the visualization step; it currently does nothing and returns None.

    [edge_set]: result of the matching step for one pool window
    [df_filtered]: the rides of that pool window
    '''