In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm

In [2]:
# read file with information about washington intersection info

file_path = r"C:\project_files\dc_intersection\intersection_points_unique.csv"
df_int = pd.read_csv(file_path)
df_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8437 entries, 0 to 8436
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              8437 non-null   int64  
 1   unique_intersection_id  8437 non-null   int64  
 2   INTERSECTIONID          8437 non-null   int64  
 3   FULLINTERSECTION        8437 non-null   object 
 4   LONGITUDE               8437 non-null   float64
 5   LATITUDE                8437 non-null   float64
dtypes: float64(2), int64(3), object(1)
memory usage: 395.6+ KB


In [3]:
# function to attribute part of the day based on hour of the day.

def assign_part_of_day(x):
    if x == 8 or x == 9:
        return "morning"
    elif x == 10 or x == 11 or x == 12:
        return "late_morning"
    elif x == 13 or x == 14 or x == 15:
        return "afternoon"
    elif x == 16 or x == 17 or x == 18:
        return "evening"
    elif x == 19 or x == 20:
        return "late_evening"
    

# function to modify the trip dataframe by removing trips
# that fall outside the 8AM to 8PM time period
# and assign part of the day based on the hour of day.

def modify_dataframe(df):
    df['hour'] = pd.to_datetime(df['start_time'])
    df['hour'] = df['hour'].apply(lambda x: x.hour)
    df = df[(df['hour'] >= 8) & (df['hour'] <= 20)]
    df['part_of_day'] = df['hour'].apply(lambda x: assign_part_of_day(x))
    
    return df

In [4]:
# load all the trip dataset files in 2019

dataset_files = ['all_trips_2019_' + str(i) for i in range(10)]
file_path = r"D:\trip_datasets\dataset_modified" 

df_list = []

for file in dataset_files:
    df = pd.read_csv(file_path + "/" + file + "_mod.csv")
    df = df.reset_index(drop = True)
    df = modify_dataframe(df)
    df_list.append(df)

# printing the information a trip file contains
print(df_list[0].info())

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<class 'pandas.core.frame.DataFrame'>
Int64Index: 566210 entries, 0 to 648297
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  566210 non-null  int64  
 1   Unnamed: 0.1                566210 non-null  int64  
 2   trip_id                     566210 non-null  object 
 3   vehicle_id                  566210 non-null  object 
 4   vehicle_type                566210 non-null  object 
 5   start_lat                   566208 non-null  float64
 6   start_lon                   566208 non-null  float64
 7   end_lat                     566208 non-null  float64
 8   end_lon                     566208 non-null  float64
 9   start_time                  566210 non-null  object 
 10  end_time                    566210 non-null  object 
 11  trip_length                 566210 non-null  float64
 12  provider                    566210 non-null  object 
 13  start_min_dist

In [5]:
# function to add the number of pickups and number of dropoffs to each of the intersections. 


def main_function(part_of_day, df_list, df_int):
    
    
    if part_of_day == 'all':
        dict_pickups = defaultdict(lambda : 0)
        dict_dropoffs = defaultdict(lambda : 0)
        
        
        
        for df in df_list:
            
            df = df.reset_index(drop = True)
            
            for i in tqdm(range(len(df))):
                dict_pickups[df['start_nearest_intersection'][i]] += 1
                dict_dropoffs[df['end_nearest_intersection'][i]] += 1  
        
        df_int['all_dropoffs'] = df_int['unique_intersection_id'].apply(lambda x: dict_dropoffs[x])
        df_int['all_pickups'] = df_int['unique_intersection_id'].apply(lambda x: dict_pickups[x])
    
    else:
        dict_pickups = defaultdict(lambda :0)
        dict_dropoffs = defaultdict(lambda :0)
        
        for df in df_list:
            
            df = df[df['part_of_day'] == part_of_day]
            df = df.reset_index(drop = True)
            
            
            for i in tqdm(range(len(df))):
                dict_pickups[df['start_nearest_intersection'][i]] += 1
                dict_dropoffs[df['end_nearest_intersection'][i]] += 1
        
        df_int[part_of_day + '_dropoffs'] = df_int['unique_intersection_id'].apply(lambda x: dict_dropoffs[x])
        df_int[part_of_day + '_pickups'] = df_int['unique_intersection_id'].apply(lambda x: dict_pickups[x])
    
    
    return df_int

In [6]:
part_of_day_array = ['all', 'morning', 'late_morning', 'afternoon', 'evening', 'late_evening']

# assigning pickups and dropoffs for all different periods of the day
for part_of_day in part_of_day_array:
    df_int = main_function(part_of_day, df_list, df_int)

100%|███████████████████████████████████████████████████████████████████████| 566210/566210 [00:13<00:00, 40998.00it/s]
100%|███████████████████████████████████████████████████████████████████████| 550301/550301 [00:13<00:00, 41068.05it/s]
100%|███████████████████████████████████████████████████████████████████████| 529966/529966 [00:12<00:00, 41304.02it/s]
100%|███████████████████████████████████████████████████████████████████████| 519309/519309 [00:12<00:00, 40914.69it/s]
100%|███████████████████████████████████████████████████████████████████████| 522969/522969 [00:12<00:00, 41233.80it/s]
100%|███████████████████████████████████████████████████████████████████████| 552368/552368 [00:13<00:00, 41214.78it/s]
100%|███████████████████████████████████████████████████████████████████████| 588455/588455 [00:14<00:00, 41396.40it/s]
100%|███████████████████████████████████████████████████████████████████████| 567057/567057 [00:13<00:00, 41032.76it/s]
100%|███████████████████████████████████

In [7]:
df_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8437 entries, 0 to 8436
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              8437 non-null   int64  
 1   unique_intersection_id  8437 non-null   int64  
 2   INTERSECTIONID          8437 non-null   int64  
 3   FULLINTERSECTION        8437 non-null   object 
 4   LONGITUDE               8437 non-null   float64
 5   LATITUDE                8437 non-null   float64
 6   all_dropoffs            8437 non-null   int64  
 7   all_pickups             8437 non-null   int64  
 8   morning_dropoffs        8437 non-null   int64  
 9   morning_pickups         8437 non-null   int64  
 10  late_morning_dropoffs   8437 non-null   int64  
 11  late_morning_pickups    8437 non-null   int64  
 12  afternoon_dropoffs      8437 non-null   int64  
 13  afternoon_pickups       8437 non-null   int64  
 14  evening_dropoffs        8437 non-null   

In [8]:
df_int = df_int.sort_values(by = ['all_pickups'], ascending = False)
df_int = df_int.reset_index(drop = True)

df_int.to_csv("total_pickups_dropoffs_at_intersections.csv")