In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import Image

In [None]:
# Root folder under which the input data sets are expected.
file_path = '/kaggle/input'

import os

# Walk the whole input tree and print the full path of every file found.
for dirname, _, filenames in os.walk(file_path):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
bce_data = pd.read_csv('/kaggle/input/border-crossing-entry-data/Border_Crossing_Entry_Data.csv')

In [None]:
bce_data.shape

In [None]:
bce_data.info()

In [None]:
# Normalise every column name to lower case.
bce_data = bce_data.rename(columns = str.lower)

In [None]:
# Replace blanks in column names with underscores.
bce_data = bce_data.rename(columns = lambda name : name.replace(' ', '_'))

In [None]:
bce_data.head()

In [None]:
# Total number of NaN cells across all columns of the frame.
n_missing = bce_data.isna().sum().sum()
print('No. of missing values = {}'.format(n_missing))

In [None]:
def get_df_summary(df):
    '''Summarise a DataFrame column by column.

    Returns a DataFrame with one row per column of ``df`` and three columns:
    ``variable`` (the column name), ``unq_val_cnt`` (its number of unique values)
    and ``dtype`` (its data type), sorted by ``unq_val_cnt`` in descending order.
    '''

    # Unique-value count per column, as a two-column frame keyed by 'variable'.
    unique_counts = df.nunique().rename('unq_val_cnt').rename_axis('variable').reset_index()

    # Data type per column, keyed the same way so both frames can be joined.
    column_dtypes = df.dtypes.rename('dtype').rename_axis('variable').reset_index()

    summary = unique_counts.merge(column_dtypes, on = 'variable')

    return summary.sort_values(by = 'unq_val_cnt', ascending = False)

In [None]:
unq_val_cnt_df = get_df_summary(bce_data)

In [None]:
unq_val_cnt_df

In [None]:
bce_data['border'].value_counts()

In [None]:
bce_data['measure'].value_counts()

In [None]:
bce_data['state'].value_counts().sort_index()

**Variable : date**

In [None]:
bce_data['date'] = pd.to_datetime(bce_data['date'], format = '%m/%d/%Y %I:%M:%S %p')

In [None]:
bce_data.head()

**Variable : location**

In [None]:
bce_data['location_bkup'] = bce_data['location'].copy()

In [None]:
# Remove the surrounding "POINT (" ... ")" wrapper so that only the two
# space-separated numbers remain. Anchored regexes are used because
# str.lstrip / str.rstrip take a *set of characters* rather than a prefix/suffix,
# which strips too much as soon as a value starts or ends with one of those characters.
bce_data['location_bkup'] = (bce_data['location_bkup']
                             .str.replace(r'^POINT\s*\(', '', regex = True)
                             .str.replace(r'\)$', '', regex = True))

In [None]:
# Split the cleaned location text on the blank: the first piece is stored as
# longitude and the second as latitude, both converted to float.
tmp_df = (bce_data['location_bkup']
          .str.split(' ', expand = True)
          .rename(columns = {0:'longitude', 1:'latitude'})
          .astype({'longitude':'float', 'latitude':'float'}))

# Append the two coordinate columns to the main frame (rows align on the index).
bce_data = pd.concat([bce_data, tmp_df], axis = 1)

# The helper frame is no longer needed.
del tmp_df

In [None]:
bce_data.head()

In [None]:
# The raw location text and its working copy are no longer needed.
bce_data = bce_data.drop(columns = ['location', 'location_bkup'])

In [None]:
bce_data.head()

In [None]:
print('Years : {}'.format(bce_data['date'].dt.year.unique()))
print()
print('No. of years : {}'.format(bce_data['date'].dt.year.nunique()))

In [None]:
print('Months : {}'.format(sorted(bce_data['date'].dt.month.unique())))

In [None]:
print('Days : {}'.format(sorted(bce_data['date'].dt.day.unique())))

Since every value of the **date** variable falls on the 1st of its month, the day carries no information, so we can safely split this variable into two columns, i.e. **year** and **month**.

**Variable : year**

In [None]:
# Vectorised extraction of the calendar year via the datetime accessor
# (same accessor the exploration cells use) instead of a row-by-row apply.
bce_data['year'] = bce_data['date'].dt.year

**Variable : month**

In [None]:
# Map month number to month name.
month_dict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}

# Vectorised: take the month number through the datetime accessor and translate it
# with the dictionary in a single pass, instead of two row-by-row apply calls.
bce_data['month'] = bce_data['date'].dt.month.map(month_dict)

In [None]:
# bce_data['month'].value_counts(dropna = False)

In [None]:
bce_data.drop(columns = 'date', inplace = True)

In [None]:
# Number of distinct months present in the data for each year.
bce_data.groupby('year')['month'].nunique()

We have data for all twelve months of every year from 1996 to 2018; for 2019 we only have data up to March.

**Find out if more than one port code has same port names.**

In [None]:
tmp_df = bce_data[['port_name', 'port_code']].drop_duplicates()

In [None]:
tmp_df.groupby('port_name')['port_name'].filter(lambda x : len(x) > 1)

In [None]:
tmp_df.loc[tmp_df['port_name'] == 'Eastport', ]

The port name **Eastport** is associated with two port codes, **3302** and **103**. Let's check whether they belong to the same state or to different states in the U.S.

In [None]:
bce_data.loc[bce_data['port_name'] == 'Eastport', ].groupby(['state', 'port_name', 'port_code'])['state'].count()

There are two distinct ports with the same name, **Eastport**, located in two different U.S. states: **Idaho** and **Maine**.

In [None]:
print('No. of negative values in value field : {}'.format(bce_data['value'].lt(0).sum()))

**Check whether the same port_name appears in two or more different states.**

In [None]:
# One row per distinct (state, port_name) pair.
tmp_df = bce_data.groupby(['state', 'port_name'])['port_code'].count().reset_index().drop(columns = 'port_code')

# Group by port_name ONLY: a name that survives the filter occurs in more than one state.
# Grouping by (state, port_name) again would make every group exactly one row long,
# so the filter could never return anything.
tmp_df.groupby('port_name').filter(lambda x : len(x) > 1)

In [None]:
print('Given data set has data of {} years.'.format(bce_data['year'].nunique()))

In [None]:
# A coordinate is invalid when it lies below the lower bound OR above the upper bound.
# The two tests must be combined with `|`; combining them with `&` can never be true,
# because no number is smaller than the lower bound and larger than the upper bound at once.
invalid_latitude = bce_data['latitude'].lt(-90) | bce_data['latitude'].gt(90)
invalid_longitude = bce_data['longitude'].lt(-180) | bce_data['longitude'].gt(180)

print('# of obs. with invalid latitudinal values : {}'.format(invalid_latitude.sum()))
print('# of obs. with invalid longitudinal values : {}'.format(invalid_longitude.sum()))

# A count of 0 on both lines means no obs. has an out-of-range latitude or longitude.

**Which border provides connectivity to maximum number of states?**

In [None]:
# Number of distinct states recorded for each border.
tmp_df = (bce_data.groupby('border')['state']
          .nunique()
          .reset_index(name = 'count'))

# One bar per border; the bar height is the number of states.
px.bar(tmp_df,
       x = 'border',
       y = 'count',
       color = 'border',
       title = 'No. of States entry from each border',
       labels = {'border':'Border', 'count':'No. of States'},
       width = 800,
       height = 500)

**Which border provides entry to maximum number of ports?**

In [None]:
# Number of distinct port codes recorded for each border.
tmp_df = (bce_data.groupby('border')['port_code']
          .nunique()
          .reset_index(name = 'count'))

# One bar per border; the bar height is the number of ports.
px.bar(tmp_df,
       x = 'border',
       y = 'count',
       color = 'border',
       title = 'No. of Ports entry from each border',
       labels = {'border':'Border', 'count':'No. of Ports'},
       width = 800,
       height = 500)

**What's the frequency of border crossings year-wise at each border cross?**

In [None]:
# Total of 'value' per border and year.
tmp_df = bce_data.groupby(['border', 'year'], as_index = False)['value'].sum()

# One line per border across the years.
fig = px.line(tmp_df,
              x = 'year',
              y = 'value',
              color = 'border',
              title = 'Frequency of Border Crossings Year-wise',
              labels = {'border':'Border', 'value':'No. of border entries', 'year':'Year'},
              width = 800,
              height = 500)

# Show a marker on every yearly data point in addition to the connecting line.
fig.update_traces(mode='markers+lines')
fig

Observations:
- In 1999 and 2000 there was a lot of border-crossing activity on the US-Canada Border, which then declined gradually until 2011. In 2018, inbound crossings at the US-Canada Border increased again.
- Inbound crossings from Mexico into the US on the US-Mexico Border were always higher than the crossings on the US-Canada Border, even though the US-Canada Border provides entry to a larger number of ports than the US-Mexico Border.

In [None]:
bce_data.head()

**Which month of a year saw minimum and maximum number of border crossings?**

In [None]:
# Total of 'value' per border, year and month.
tmp_df = bce_data.groupby(['border', 'year', 'month'], as_index = False)['value'].sum()

# Row labels of the largest and the smallest monthly total within each (border, year).
yearly_values = tmp_df.groupby(['border', 'year'])['value']
max_indices = yearly_values.idxmax().values
min_indices = yearly_values.idxmin().values

# Tag those rows; every other row keeps NaN in 'value_type'.
tmp_df.loc[max_indices, 'value_type'] = 'max'
tmp_df.loc[min_indices, 'value_type'] = 'min'

# Keep only the tagged (max / min) rows.
tmp_df = tmp_df.dropna(subset = ['value_type'])

In [None]:
# tmp_df.head()

In [None]:
# Yearly totals are taken from the full data set. At this point tmp_df only holds the
# max and the min month of each (border, year), so summing tmp_df would yield max + min
# instead of the year's total and distort the 'prop' percentage derived from it.
yearly_totals = (bce_data.groupby(['border', 'year'])['value']
                 .sum()
                 .reset_index()
                 .rename(columns = {'value':'total_value'}))

tmp_df = tmp_df.merge(yearly_totals, on = ['border', 'year'])

In [None]:
# Percentage of 'total_value' represented by 'value', rounded to 2 decimals and
# rendered as text with a trailing ' %'.
percentage = tmp_df['value'] * 100 / tmp_df['total_value']
tmp_df['prop'] = percentage.round(2).astype('str') + ' %'

In [None]:
# tmp_df.head()

In [None]:
# Rows of the min/max table that belong to the US-Canada Border.
canada_rows = tmp_df.loc[tmp_df['border'] == 'US-Canada Border']

# One bar segment per tagged month and year, coloured and labelled by month;
# the 'prop' text is added to the hover box.
fig = px.bar(canada_rows,
             x = 'year',
             y = 'value',
             color = 'month',
             text = 'month',
             hover_data = ['prop'],
             title = 'Min-Max Border Crossing Frequencies - For US-Canada Border',
             labels = {'month':'Month', 'value':'No. of border entries', 'year':'Year Month', 'prop':'Proportion'})
fig.show()

<u>**Observations:**</u> w.r.t. **US-Canada Border**

1. **August** is the month with the maximum number of border crossings across the years 1996-2019.
2. **February** is the month with the minimum number of border crossings across the years 1996-2019.
3. In **December** we would expect a lot of vehicle movement into the USA around the Christmas and New Year celebrations, but the data does not support this expectation. Perhaps outbound vehicle movements outnumber inbound vehicle movements in that month.
4. Occasionally, we see a lot of inbound vehicle movements in **July** and **Aug**.

In [None]:
# Rows of the min/max table that belong to the US-Mexico Border.
mexico_rows = tmp_df.loc[tmp_df['border'] == 'US-Mexico Border']

# One bar segment per tagged month and year, coloured and labelled by month;
# the 'prop' text is added to the hover box.
fig = px.bar(mexico_rows,
             x = 'year',
             y = 'value',
             color = 'month',
             text = 'month',
             hover_data = ['prop'],
             title = 'Min-Max Border Crossing Frequencies - For US-Mexico Border',
             labels = {'month':'Month', 'value':'No. of border entries', 'year':'Year Month', 'prop':'Proportion'})
fig.show()

<u>**Observations:**</u> w.r.t. **US-Mexico Border**
1. A lot of border crossings took place in **December** across the years 1996-2019.
2. The years **1996, 2001, 2008, 2010, 2018 and 2019** are the occasional exceptions in which the maximum number of border crossings occurred in **March**.
3. **February** has been the month with the minimum number of border crossings across the years 1996-2019.

**What are the Top-5 ports to which US-Canada Border and US-Mexico Border provides connectivity?**

In [None]:
# Total of 'value' per port. port_code is part of the key because port_name alone is not
# unique: two different ports share the name 'Eastport' and would otherwise be summed
# together as if they were a single port.
tmp_df = bce_data.groupby(['border', 'port_code', 'port_name'])['value'].sum().reset_index()

# Rank the ports within each border, largest total first, and keep the five best.
tmp_df['rank'] = tmp_df.groupby(['border'])['value'].rank(ascending = False)
tmp_df = tmp_df.loc[tmp_df['rank'] <= 5]

In [None]:
# Select the ranked ports of the US-Canada Border.
filter_cond_1 = tmp_df['border'].eq('US-Canada Border')
canada_ports = tmp_df.loc[filter_cond_1]

# Port names ordered from the largest to the smallest total; used to order the bars.
port_name_list = canada_ports.sort_values(by = 'value', ascending = False)['port_name'].tolist()

fig = px.bar(canada_ports,
             x = 'port_name',
             y = 'value',
             title = 'Top-5 Border Crossing Ports - For US-Canada Border',
             labels = {'port_name':'Port Name', 'value':'No. of border entries'},
             category_orders = {'port_name':port_name_list},
             color_discrete_sequence = ['#11abab'])
fig.show()

In [None]:
# Select the ranked ports of the US-Mexico Border.
filter_cond_1 = tmp_df['border'].eq('US-Mexico Border')
mexico_ports = tmp_df.loc[filter_cond_1]

# Port names ordered from the largest to the smallest total; used to order the bars.
port_name_list = mexico_ports.sort_values(by = 'value', ascending = False)['port_name'].tolist()

fig = px.bar(mexico_ports,
             x = 'port_name',
             y = 'value',
             title = 'Top-5 Border Crossing Ports - For US-Mexico Border',
             labels = {'port_name':'Port Name', 'value':'No. of border entries'},
             category_orders = {'port_name':port_name_list},
             color_discrete_sequence = ['#dc2d55'])
fig.show()

Even though Canada is larger than Mexico in terms of land area, the number of border crossings from the Mexican side is much higher than the number of border crossings from the Canadian side.

**Which vehicle type has the highest border crossing frequency?**

In [None]:
# Total of 'value' per border and measure.
tmp_df = bce_data.groupby(['border', 'measure'], as_index = False)['value'].sum()

# Rank the measures within each border (largest total first) and keep ranks 1 to 5.
tmp_df['rank'] = tmp_df.groupby('border')['value'].rank(ascending = False)
tmp_df = tmp_df.loc[tmp_df['rank'].le(5)]
# tmp_df

In [None]:
# One bar per border, split into coloured segments by measure.
fig = px.bar(tmp_df,
             x = 'border',
             y = 'value',
             color = 'measure',
             title = 'Top-5 Border Crossing Vehicle Types',
             labels = {'border':'Border', 'measure':'Vehicle Type', 'value':'No. of border entries'})
fig.show()

**Observations**:
1. For **US-Canada Border** : Following are the Top-5 vehicle types for which maximum number of border crossings have been recorded.<br>
   - Personal Vehicle Passengers
   - Personal Vehicles
   - Trucks
   - Truck Containers Full
   - Bus Passengers
<br>
<br>
2. For **US-Mexico Border** : Following are the Top-5 vehicle types for which maximum number of border crossings have been recorded.<br> 
   - Personal Vehicle Passengers
   - Personal Vehicles
   - Pedestrians
   - Trucks
   - Truck Containers Full

In [None]:
# Total of 'value' per border, year and measure.
tmp_df = bce_data.groupby(['border', 'year', 'measure'], as_index = False)['value'].sum()

# Within every (border, year) keep only the measure ranked first by total.
tmp_df['rank'] = tmp_df.groupby(['border', 'year'])['value'].rank(ascending = False)
tmp_df = tmp_df.loc[tmp_df['rank'].eq(1)]

In [None]:
tmp_df['measure'].unique()

**Personal Vehicle Passengers** are the top-most consumers of border crossing for both US-Canada and US-Mexico borders.

In [None]:
# One line per border showing the yearly total of the top-ranked measure.
fig = px.line(tmp_df,
              x = 'year',
              y = 'value',
              color = 'border',
              title = 'Frequency of Border Crossings by "Personal Vehicle Passengers" Year-wise',
              labels = {'border':'Border', 'value':'No. of border entries', 'year':'Year'},
              width = 800,
              height = 500)

# Show a marker on every yearly data point in addition to the connecting line.
fig.update_traces(mode = 'markers + lines')
fig.show()

**Personal Vehicle Passengers** being the biggest contributors to the total border crossings for both US-Canada and US-Mexico borders throughout years 1996-2019, this plot is very much similar to **Frequency of Border Crossings Year-wise** plot.

Image("/kaggle/input/canada-us-mexico-map/images_1.jpg")

In [None]:
Image("/kaggle/input/canada-us-mexico-map/images_1.jpg")

<u>**Conclusions:**</u>

1. Owing to Canada sharing the longest land border with United States stretching **8,891 kilometres (5,525 mi)** when compared to land border shared by Mexico which is **3,141 km (1,952 mi)**, it's natural to expect United States to keep their doors more open for Canadians than Mexicans.
2. When we look at the number of sources in Canada and Mexico from which the vehicles/persons originate and the number of destinations through which the U.S. allows them to enter, it is tempting to assume that Canada is the main source of inbound vehicles, given the known fact that the total land area of Canada is greater than the total land area of Mexico. <br>
***Note : Also, we have to keep in mind that frequency of inbound vehicles to a particular destination not only depends on source where the vehicles are originating; it also depends on the destinations as well.*** 
3. In spite of all this, the given data set shows that ***Mexico has been the biggest contributor to the total number of inbound crossings when compared to Canada***.
4. **Personal Vehicle Passengers** and **Personal Vehicles** are the **Top-2 transportation mode types** over the entire period covered by the data (1996 to March 2019).
5. **August** and **December** are the months in which the most border crossings are observed on the **US-Canada Border** and the **US-Mexico Border**, respectively.

Please upvote if you like my work and this kernel. Thank you.