# Creating Clusters for Stochastic Optimization
---

Import packages

In [None]:
from pathlib import Path
import pickle
import pandas as pd
import geopandas as gpd

Import cleaned traces

In [None]:
# Folder that holds the cleaned trip exports used throughout this notebook
export_fp = Path.home() / 'Downloads/cleaned_trips'
#network_fp = Path.home() / "Downloads/cleaned_trips/networks/final_network.gpkg"

# The pickle stores a 2-tuple that unpacks into coords_dict and trips_df.
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
with open(export_fp / 'cleaned_traces.pkl', mode='rb') as fh:
    coords_dict, trips_df = pickle.load(fh)

Trip data

In [None]:
# trip.csv is read without a header row, so the column labels are supplied here
col_names = [
    'tripid', 'userid', 'trip_type', 'description',
    'starttime', 'endtime', 'num_points',
]
trip = pd.read_csv(export_fp / "trip.csv", header=None)
trip.columns = col_names

# discard the timing and point-count columns, keeping ids, type and description
trip = trip.drop(columns=['starttime', 'endtime', 'num_points'])

User data

In [None]:
# user.csv is read without a header row, so the column labels are assigned below
user = pd.read_csv(export_fp/"user.csv", header=None)
user_col = ['userid','created_date','device','email','age',
            'gender','income','ethnicity','homeZIP','schoolZip',
            'workZip','cyclingfreq','rider_history','rider_type','app_version']
user.columns = user_col
# drop the device, app-version and e-mail columns (each label listed exactly once)
user.drop(columns=['device','app_version','email'],inplace=True)

Add trip and user data to trips_df

In [None]:
# Attach trip attributes (matched on tripid), then rider attributes (matched on
# userid). Both joins are inner joins, so rows without a match are dropped.
trips_df = (
    pd.merge(trips_df, trip, how='inner', on='tripid')
    .merge(user, how='inner', on='userid')
)

In [None]:
# Collect each trip's first and last recorded position, keyed by tripid.
# One dict per output column so they can be mapped onto trips_df in one pass.
endpoints = {'start_lon': {}, 'start_lat': {}, 'end_lon': {}, 'end_lat': {}}
for tripid, coords in coords_dict.items():
    # a trace with no points has no endpoints; its trip is left as NaN below
    if coords.empty:
        continue

    # Positional row numbers of the earliest and latest timestamps.
    # (idxmin()/idxmax() return a scalar index label, which has no .map();
    # positional lookup also stays unambiguous if the index has duplicates.)
    first_pos = coords['datetime'].argmin()
    last_pos = coords['datetime'].argmax()

    #get starting location
    endpoints['start_lon'][tripid] = coords['lon'].iloc[first_pos]
    endpoints['start_lat'][tripid] = coords['lat'].iloc[first_pos]

    #get ending location
    endpoints['end_lon'][tripid] = coords['lon'].iloc[last_pos]
    endpoints['end_lat'][tripid] = coords['lat'].iloc[last_pos]

#assign to trips_df
# (.at only accepts scalar labels, so it cannot take a boolean mask; mapping
# tripid -> value fills each column at once, and trips with no trace get NaN)
for column, value_by_trip in endpoints.items():
    trips_df[column] = trips_df['tripid'].map(value_by_trip)

# find euclidean distance between start and end coord (for finding loop trips)
# Points are built as lon/lat (EPSG:4326) and reprojected to EPSG:2240, a
# projected CRS with foot units, so the distance is a planar distance in feet.
start_geo = gpd.points_from_xy(trips_df['start_lon'],trips_df['start_lat'],crs='epsg:4326').to_crs('epsg:2240')
end_geo = gpd.points_from_xy(trips_df['end_lon'],trips_df['end_lat'],crs='epsg:4326').to_crs('epsg:2240')
trips_df['euclidean_distance'] = start_geo.distance(end_geo)

In [None]:
#%% remove loop trips and exercise trips
# A trip whose start and end are within tolerance_ft of each other is treated
# as a loop; only trips strictly farther apart than that are kept.
tolerance_ft = 1000
trips_df = trips_df.loc[
    lambda df: (df['euclidean_distance'] > tolerance_ft)
    & (df['trip_type'] != 'Exercise')
]