In [1]:
# import general packages:
import numpy as np
import pandas as pd
from pandas import Timestamp
import os
import io
import datetime as dt
import time 
import dask.dataframe as dd
from datetime import datetime
import timestring
from IPython.core.display import display, HTML
from collections import defaultdict

# import modeling packages

# from sklearn.cluster import AffinityPropagation
# from sklearn.cluster import KMeans
# from sklearn import preprocessing, datasets
from sklearn.metrics import pairwise_distances_argmin
from scipy.spatial.distance import cdist,pdist
# from scipy import stats
# from scipy.sparse import *
from collections import Counter

import zipcodes

# import visualization packages:
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [2]:
import matplotlib

In [3]:
# Notebook display settings: show every DataFrame column and let cells use the
# full browser width.
pd.set_option('display.max_columns', None)
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from the private `IPython.core.display` module is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# standardize dataframe column names
def col_name(df):
    """
    Normalise the column names of ``df`` to one format.

    Every name is stripped of surrounding whitespace and lower-cased; hyphens,
    parentheses and double quotes are removed; spaces become underscores.

    param df:
        data frame whose ``columns`` attribute is rewritten in place
    return:
        the same data frame object, carrying the new column names
    """
    cleaned = df.columns.str.strip().str.lower()
    # regex=False makes every pattern literal text. '(' and ')' are regex
    # metacharacters, so relying on the pandas default (which changed between
    # versions) is fragile.
    # Order matters: hyphens are removed before spaces turn into underscores.
    for old, new in (('-', ''), (' ', '_'), ('(', ''), (')', ''), ('"', '')):
        cleaned = cleaned.str.replace(old, new, regex=False)
    df.columns = cleaned
    return df

In [5]:
# public source zipcode format standardization
def clean_zip(file_path = None):
    """
    Load the zip-code master file and standardise it.

    The CSV at ``file_path`` is read, its header is normalised with
    ``col_name`` and the ``zip_code`` column is converted to strings.
    Nothing is written to disk.

    parameter:
        file_path: location of the zip-code CSV file (required)

    return:
        data frame with normalised column names and a string ``zip_code`` column

    raises:
        ValueError: when ``file_path`` is not supplied
    """
    if file_path is None:
        raise ValueError("clean_zip() needs the path of the zip-code CSV file (file_path)")

    zipcode = col_name(pd.read_csv(file_path))
    # str() succeeds for any value, so the former try/except fallback to 0 could
    # never run; the conversion is applied directly.
    # NOTE(review): zip codes parsed as integers by read_csv have already lost
    # their leading zeros at this point - confirm whether padding is needed.
    zipcode['zip_code'] = zipcode['zip_code'].map(str)

    return zipcode

In [6]:
def haversine(lon1,lat1,lon2,lat2):
    """
    Calculate the great-circle distance between two points on the earth.

    All four coordinates are decimal degrees. Scalars as well as equally
    shaped NumPy arrays / pandas Series are accepted.

    return:
        distance in kilometers (earth radius taken as 6371 km)
    """
    earth_radius_km = 6371  # radius of earth in kilometers; use 3956 for miles

    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would leave arcsin undefined.
    a = np.clip(a, 0.0, 1.0)
    # np.arcsin replaces np.math.asin: the np.math alias no longer exists in
    # current NumPy and only ever handled scalars.
    c = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * c

In [7]:
def lon_lat_finding(x,selection = 'long'):
    """
    Look up one attribute of a zip code through the ``zipcodes`` package.

    param x:
        zip code to look up
    param selection:
        key to read from the first matching record ('long' by default;
        this notebook also calls it with 'lat')
    return:
        ``zipcodes.matching(x)[0][selection]``, or None when the lookup
        fails for any reason (invalid input, no match, unknown key)
    """
    try:
        return zipcodes.matching(x)[0][selection]
    except Exception:
        # Best-effort lookup: an unusable zip code yields None. `Exception`
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

In [8]:
def centroid_name (x):
    """
    Build the label "<first> & <second>" from a pair of values.

    param x:
        list, tuple or pandas Series; only its first two items are used
    return:
        string made of the two items joined by ' & '
    """
    # Rows handed over by DataFrame.apply(axis=1) are Series indexed by column
    # label, for which x[0] is a deprecated positional fallback; .iloc is
    # explicitly positional. Plain sequences have no .iloc and are used as-is.
    pair = getattr(x, 'iloc', x)
    return str(pair[0]) + ' & ' + str(pair[1])

In [9]:
def cog_one_wh(df, cog_rule = 'weight'):
    """
    Compute a single weighted centre of gravity over all origins in ``df``.

    Each row is weighted by ``<rule column> * port_weight_total`` and the
    centre is the weighted mean of ``origin_lon`` / ``origin_lat``.

    param df:
        data frame with ``origin_lon``, ``origin_lat``, ``port_weight_total``
        and the column selected by ``cog_rule``
    param cog_rule:
        'weight' -> ``weightlbs_sum``, 'cost' -> ``total_cost_sum``,
        'freq' -> ``distance_miles_count``
    return:
        (center_lon, center_lat), each rounded to 5 decimals
    raises:
        ValueError: when ``cog_rule`` is not one of the three known rules
    """
    rule_columns = {
        'weight': 'weightlbs_sum',
        'cost': 'total_cost_sum',
        'freq': 'distance_miles_count',
    }
    if cog_rule not in rule_columns:
        # Previously an unknown rule fell through every branch and ended in an
        # UnboundLocalError at the return statement.
        raise ValueError("cog_rule must be one of %s, got %r" % (sorted(rule_columns), cog_rule))

    # Row weights are computed once and shared by both coordinates.
    point_weight = df[rule_columns[cog_rule]] * df['port_weight_total']
    total_weight = sum(point_weight)

    center_lon = round(sum(point_weight * df['origin_lon']) / total_weight, 5)
    center_lat = round(sum(point_weight * df['origin_lat']) / total_weight, 5)

    return center_lon, center_lat

In [10]:
# Data locations. The defaults are the folders this notebook was originally run
# against; set the COG_INPUT_PATH / COG_OUTPUT_PATH environment variables to
# point the notebook at other folders without editing it.
input_path = os.environ.get('COG_INPUT_PATH', r'C:\Users\U279014\Documents\H_Drive\7.AA Models\15.5.COG_Project Consuela\data_input')
input_file = r'TMC.xls'

output_path = os.environ.get('COG_OUTPUT_PATH', r'C:\Users\U279014\Documents\H_Drive\7.AA Models\15.5.COG_Project Consuela\data_output')
output_file = r'TMC_output_flow_result.csv'    # consolidated flow result
output_file2 = r'TMC_preprocess_result.csv'    # geocoded shipment rows

In [11]:
# Read the raw TMC workbook and standardize its header with col_name.
tmc_workbook_path = os.path.join(input_path, input_file)
df_tmc_orig = col_name(pd.read_excel(tmc_workbook_path))

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [12]:
# Keep only the text before the first hyphen of each zip code
# (presumably the 5-digit part of a ZIP+4 value - confirm against the data).
for zip_column in ('origin_zip', 'destination_zip'):
    df_tmc_orig[zip_column] = df_tmc_orig[zip_column].str.split('-', expand = True)[0]

### Step 0 - Origin TMC Preprocess

In [13]:
# Geocode every shipment row and measure its origin -> destination distance.
# Each distinct zip code is looked up once and mapped back onto the rows,
# instead of running two lookups and building a pd.Series for every row.
coordinate_fields = (('lon', 'long'), ('lat', 'lat'))  # (column suffix, key passed to lon_lat_finding)
coordinate_columns = ['origin_lon', 'origin_lat', 'destination_lon', 'destination_lat']

geocoded_columns = {}
for side in ('origin', 'destination'):
    side_zips = df_tmc_orig[side + '_zip']
    for suffix, field in coordinate_fields:
        found = {z: lon_lat_finding(z, selection=field) for z in side_zips.dropna().unique()}
        geocoded_columns[side + '_' + suffix] = side_zips.map(found)

# Rows whose origin zip code could not be geocoded are dropped; rows with an
# unknown destination are kept and end up with a NaN distance.
df_tmc_preprocess = df_tmc_orig.assign(**geocoded_columns).dropna(subset=['origin_lon']).copy()
df_tmc_preprocess[coordinate_columns] = df_tmc_preprocess[coordinate_columns].apply(pd.to_numeric, errors='coerce')
# Row values are read by column label; positional x[0] on a labelled row is deprecated.
df_tmc_preprocess['distance'] = df_tmc_preprocess[coordinate_columns].apply(
    lambda row: haversine(row['origin_lon'], row['origin_lat'], row['destination_lon'], row['destination_lat']),
    axis=1)

In [14]:
# Persist the geocoded shipment rows, without the index column.
preprocess_csv_path = os.path.join(output_path, output_file2)
df_tmc_preprocess.to_csv(preprocess_csv_path, index = False)

In [15]:
# Exclude shipments whose distance_miles is zero.
# Note: this rebinds df_tmc_orig to the filtered frame.
df_tmc_orig = df_tmc_orig[df_tmc_orig.distance_miles != 0]

# Aggregate per lane (origin, destination, mode): sum and count of cost,
# weight and distance.
lane_keys = ['origin_zip','origin_state', 'destination_zip', 'destination_state', 'mode_description']
# The value columns are selected with a list: selecting several columns from a
# groupby with a bare tuple of names is no longer accepted by current pandas.
lane_values = ['total_cost', 'weightlbs', 'distance_miles']
df_agg = df_tmc_orig.groupby(lane_keys)[lane_values].agg(['sum', 'count'])
# Flatten the (column, statistic) MultiIndex into names such as total_cost_sum.
df_agg.columns = df_agg.columns.map('_'.join).str.strip('_')
df_agg = df_agg.reset_index()

  after removing the cwd from sys.path.


In [16]:
# Geocode the aggregated lanes and measure origin -> destination distance.
# Each distinct zip code is looked up once and mapped back onto the rows,
# instead of running two lookups and building a pd.Series for every row.
coordinate_fields = (('lon', 'long'), ('lat', 'lat'))  # (column suffix, key passed to lon_lat_finding)
coordinate_columns = ['origin_lon', 'origin_lat', 'destination_lon', 'destination_lat']

# df_agg gains the raw origin coordinates; assign() replaces the columns on a
# re-run rather than appending duplicates.
origin_zips = df_agg['origin_zip']
origin_columns = {}
for suffix, field in coordinate_fields:
    found = {z: lon_lat_finding(z, selection=field) for z in origin_zips.dropna().unique()}
    origin_columns['origin_' + suffix] = origin_zips.map(found)
df_agg = df_agg.assign(**origin_columns)

destination_zips = df_agg['destination_zip']
destination_columns = {}
for suffix, field in coordinate_fields:
    found = {z: lon_lat_finding(z, selection=field) for z in destination_zips.dropna().unique()}
    destination_columns['destination_' + suffix] = destination_zips.map(found)

# Lanes whose origin zip code could not be geocoded are dropped; lanes with an
# unknown destination are kept and end up with a NaN distance.
df_agg_full = df_agg.assign(**destination_columns).dropna(subset=['origin_lon']).copy()
df_agg_full[coordinate_columns] = df_agg_full[coordinate_columns].apply(pd.to_numeric, errors='coerce')

# Row values are read by column label; positional x[0] on a labelled row is deprecated.
df_agg_full['distance'] = df_agg_full[coordinate_columns].apply(
    lambda row: haversine(row['origin_lon'], row['origin_lat'], row['destination_lon'], row['destination_lat']),
    axis=1)

### Option 1 - Chicago Consolidation

In [109]:
# Origin states whose lanes are routed through the Chicago consolidation point.
# Earlier variants of the list are kept for reference:
# exclude_states = ['IN', 'MI', 'IA', 'WI', 'ND', 'IL', 'TX', 'AZ', 'WA', 'MO']
# exclude_states = ['IN','IA', 'WI', 'ND', 'IL','AZ', 'WA', 'MO']
exclude_states = ['IA', 'WI', 'ND', 'IL', 'TX', 'AZ', 'WA', 'MO']

from_excluded_state = df_agg_full.origin_state.isin(exclude_states)
df_agg_east = df_agg_full[~from_excluded_state]
# .copy() makes df_agg_west an independent frame, so the column assignments
# below do not write into a slice of df_agg_full (SettingWithCopyWarning).
df_agg_west = df_agg_full[from_excluded_state].copy()
# The "west" lanes get a fixed centre.
# NOTE(review): the longitude here is -87.6289, while the cell that builds
# dict_chicago_agg uses -87.6286 for the same point - confirm which is intended.
df_agg_west['one_center_lon'], df_agg_west['one_center_lat'] = -87.6289, 41.8781

In [110]:
# Collapse every "west" lane into one synthetic Chicago -> New Jersey lane.
chicago_total_columns = ['total_cost_sum', 'total_cost_count', 'weightlbs_sum', 'weightlbs_count',
                         'distance_miles_sum', 'distance_miles_count']

dict_chicago_agg = {'origin_zip': '60191', 'origin_state': 'IL', 'destination_zip': '07201', 'destination_state': 'NJ', 'mode_description': 'TL'}
# Only the six total columns are summed. Summing the whole frame first also
# "adds up" the text columns, which is wasted work and can fail on mixed values.
dict_chicago_agg.update(dict(df_agg_west[chicago_total_columns].sum()))
# Fixed coordinates and lane distance.
# NOTE(review): origin_lon is -87.6286 here but one_center_lon of the west
# lanes is set to -87.6289 - confirm which is intended. The distance of 1160
# is hard-coded; presumably kilometres like haversine() - verify.
dict_chicago_agg.update({'origin_lon': -87.6286, 'origin_lat': 41.8781, 'destination_lon': -74.1779, 'destination_lat': 40.6723, 'distance': 1160})
# A one-row frame built from a list of records keeps a proper dtype per column;
# going through a single Series and transposing turns every column into object.
df_init_chicago = pd.DataFrame([dict_chicago_agg])

In [111]:
# Synthetic Chicago lane first, followed by the untouched "east" lanes.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise the
# Chicago row's label can collide with a label carried over from df_agg_east.
df_consolidate = pd.concat([df_init_chicago, df_agg_east], ignore_index=True)

### Step 1 - Port cities selection

In [112]:
# Candidate ports: name -> [latitude, longitude, volume].
# The third value is stored later in '<port>_teq' columns; its unit is not
# stated in this notebook.
# Fixes relative to the earlier table:
#   * Norfolk: longitude sign corrected (76.2859 E -> 76.2859 W).
#   * Newark/New York: the previous pair (43.0467, -77.0953) lies in upstate
#     New York; it now uses the New Jersey coordinates this notebook already
#     uses for destination zip 07201.
#     NOTE(review): replace with the exact terminal location if one is preferred.
# NOTE(review): 'Willmington_DE' looks misspelled but is left unchanged because
# the key becomes part of output column names.
port_city_candidates = {'Newark/New York': [40.6723, -74.1779, 4196],
                        'Savannah': [32.0809, -81.0912, 2223],
                        'Norfolk': [36.8508, -76.2859, 1312],
                        'Charleston': [32.7765, -79.9311, 1210],
                        'Philadelphia': [39.9526, -75.1652, 526],
                        'Baltimore': [39.2904, -76.6122, 508],
                        'Willmington_DE': [39.7447, -75.5484, 196],
                        'Boston': [42.3601, -71.0589, 162],
                        'Wilmington_NC':[34.2104, -77.8868, 125],
                        'Chester_PA': [39.8496, -75.3557, 101]}

In [93]:
# Two-port scenario: name -> [latitude, longitude, volume].
# NOTE(review): when the notebook is run top to bottom this assignment replaces
# the ten-port dictionary defined in the previous cell - confirm which
# scenario is meant to feed the steps below.
port_city_candidates = {
    'Savannah': [32.0809, -81.0912, 2223],
    'Charleston': [32.7765, -79.9311, 1210],
}

In [94]:
# port_city_candidates = {'Savannah': [32.0809, -81.0912, 2223]}

In [113]:
# For every candidate port add two columns per lane:
#   origin_to_<port>_distance_inverse = 1000 / haversine(origin, port), rounded to 3 decimals
#   <port>_teq                        = the port's third table value
# and accumulate their product into port_weight_total.
df_consolidate['port_weight_total'] = 0
port_columns = []
for port_name, (port_lat, port_lon, port_teq) in port_city_candidates.items():
    inverse_col = 'origin_to_' + port_name.lower() + '_distance_inverse'
    teq_col = port_name.lower() + '_teq'

    # Row values are read by column label; positional x[0] on a labelled row
    # is deprecated. apply() runs immediately, so the loop variables captured
    # by the lambda hold the current port's values.
    df_consolidate[inverse_col] = df_consolidate[['origin_lon', 'origin_lat']].apply(
        lambda row: round(1000/(haversine(row['origin_lon'], row['origin_lat'], port_lon, port_lat)),3),
        axis=1)
    df_consolidate[teq_col] = port_teq
    df_consolidate['port_weight_total'] += df_consolidate[inverse_col] * df_consolidate[teq_col]
    port_columns.extend([inverse_col, teq_col])

### Step 2 - One warehouse allowed

In [114]:
# Single-warehouse centre of gravity over the consolidated lanes; the same
# centre is written onto every row. The "west" lanes, which already carry
# their own fixed centre, are appended afterwards.
rule = 'weight'
single_center_lon, single_center_lat = cog_one_wh(df_consolidate, cog_rule=rule)
df_consolidate['one_center_lon'] = single_center_lon
df_consolidate['one_center_lat'] = single_center_lat
df_consolidate = pd.concat([df_consolidate, df_agg_west])

In [115]:
# Labels for the origin point and for the assigned centre ("<lon> & <lat>").
# Rows are handed to centroid_name as plain lists so no positional lookup on a
# label-indexed Series is involved.
df_consolidate['origin_name'] = df_consolidate[['origin_lon', 'origin_lat']].apply(lambda row: centroid_name(row.tolist()), axis = 1)
df_consolidate['one_center_name'] = df_consolidate[['one_center_lon', 'one_center_lat']].apply(lambda row: centroid_name(row.tolist()), axis = 1)

# Distance from each origin to its centre, and that distance times the lane's
# shipment count.
df_consolidate['to_center_distance'] = df_consolidate[['origin_lon', 'origin_lat', 'one_center_lon', 'one_center_lat']].apply(
    lambda row: haversine(row['origin_lon'], row['origin_lat'], row['one_center_lon'], row['one_center_lat']),
    axis=1)
df_consolidate['agg_distance'] = df_consolidate.to_center_distance * df_consolidate.distance_miles_count
df_consolidate = df_consolidate.reset_index(drop = True)

# Lanes whose centre is the Chicago dock get an adjusted weight of 0; all other
# lanes keep weightlbs_sum. isin() compares against every value in
# chicago_dock_lon, so the result no longer depends on the truth value of an
# array comparison (which fails when the array is empty or has several values).
chicago_dock_lon = df_agg_west.one_center_lon.unique()
at_chicago_dock = df_consolidate['one_center_lon'].isin(chicago_dock_lon)
df_consolidate['adjusted_weight_sum'] = df_consolidate['weightlbs_sum'].where(~at_chicago_dock, 0)
df_consolidate.to_csv(os.path.join(output_path, output_file), index=False)

In [116]:
# Map of the centre assigned to each lane, drawn on OpenStreetMap tiles with
# all figure margins set to zero.
fig = px.scatter_mapbox(
    df_consolidate,
    lat="one_center_lat",
    lon="one_center_lon",
    hover_name="origin_zip",
    hover_data=["weightlbs_sum", "total_cost_sum"],
    color_discrete_sequence=["fuchsia"],
    zoom=3,
    height=1200,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [105]:
# One row per candidate port: the dictionary keys become the index and the
# three list items become positional columns 0, 1, 2.
ports_by_column = pd.DataFrame(port_city_candidates)
df_ports = ports_by_column.transpose()

In [106]:
# Name the three positional columns of the port table.
port_table_headers = ['port_lat', 'port_lon', 'teq_volume']
df_ports.columns = port_table_headers

In [43]:
# Write the port table to disk; the index (port names) is kept as the first column.
port_info_csv_path = os.path.join(output_path, 'port_info.csv')
df_ports.to_csv(port_info_csv_path)

In [44]:
# Show the port table as the cell output.
df_ports

Unnamed: 0,port_lat,port_lon,teq_volume
Savannah,32.0809,-81.0912,2223.0
