In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [76]:
# Load the dataset from the zipped CSV. `data` is kept as the untouched
# original (it is read again later to build the position -> port-name lookup)
# and `df` is the working copy that the following cells modify.
data = pd.read_csv("../data/DataWithLocationCleaned.zip")
df = data.copy()

In [77]:
# Parse the 'Date' column once, then derive the calendar month and year from it.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Month'] = parsed_dates.dt.month
df['Year'] = parsed_dates.dt.year
df.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position,Month,Year
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Full,133,"(48.905266, -95.314404)",6,2019
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Empty,298,"(48.905266, -95.314404)",6,2019
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicles,10383,"(48.905266, -95.314404)",6,2019
3,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459,"(48.905266, -95.314404)",6,2019
4,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2,"(48.905266, -95.314404)",6,2019


In [78]:
## Plot the map of the USA with importance of ports.
import plotly.express as px
import plotly.graph_objects as go

def getlat(x):
    """Return the latitude part of a position string such as "(48.9, -95.3)".

    The text before the first comma is taken and its first character (the
    opening parenthesis) is dropped. If what remains starts with a single
    quote, its first and last characters are dropped as well. The result is
    returned as a string, not a number.
    """
    head, _, _ = x.partition(',')
    value = head[1:]
    return value[1:-1] if value[0] == "'" else value

def getlong(x):
    """Return the longitude part of a position string such as "(48.9, -95.3)".

    The text after the first comma is taken. Its first character (the space
    after the comma) and its last character (the closing parenthesis) are
    dropped; if the second character is a single quote, one more character is
    dropped at each end. The result is returned as a string, not a number.
    """
    _, _, tail = x.partition(',')
    quoted = tail[1] == "'"
    return tail[2:-2] if quoted else tail[1:-1]

# Restrict the working frame to the measures that count people.
person_measures = ['Bus Passengers','Pedestrians','Personal Vehicle Passengers','Train Passengers']
df = df[df['Measure'].isin(person_measures)].reset_index(drop=True)

# Total 'Value' per distinct position string, with the coordinates split out.
persons_location = df.groupby('position')[['Value']].sum().reset_index()
persons_location['latitude'] = persons_location['position'].apply(getlat)
persons_location['longitude'] = persons_location['position'].apply(getlong)

# Lookup from position to port name(s), built from the unfiltered data.
ps = data[['Port Name','position']].drop_duplicates().set_index('position')


def _ports_at(position):
    """Comma-joined port name(s) found in `ps` for this position."""
    return ', '.join(ps.loc[position].values.flatten())


persons_location['Ports'] = persons_location['position'].apply(_ports_at)
millions = (persons_location['Value']/1e6).astype(str)
persons_location['text'] = persons_location['Ports'] + '<br>Crossings: ' + millions + ' million'

color = "crimson"
scale = 500000  # divisor applied to the totals to get the marker sizes

# One bubble per position; marker area is proportional to the total.
bubbles = go.Scattergeo(
    locationmode='USA-states',
    lon=persons_location['longitude'],
    lat=persons_location['latitude'],
    text=persons_location['text'],
    marker={
        'size': persons_location['Value']/scale,
        'color': color,
        'line_color': 'rgb(40,40,40)',
        'line_width': 0.5,
        'sizemode': 'area',
    },
)

fig = go.Figure()
fig.add_trace(bubbles)
fig.update_layout(
    title_text='US Borders, total inbound persons since 1996<br>(Click legend to toggle traces)',
    showlegend=False,
    geo={'scope': 'usa', 'landcolor': 'rgb(217, 217, 217)'},
)

fig.show()

In [97]:
# We create 3 zones from the filtered crossings.
# Zone 1: the listed ports on the left of the map (US-Mexico).
# Zone 2: every port on the US-Mexico border.
# Zone 3: only El Paso and Hidalgo.
zone1_ports = ['San Ysidro','Otay Mesa','Tecate','Calexico','Calexico East','San Luis','Andrade']
zone3_ports = ['El Paso','Hidalgo']
on_mexico_border = df['Border'] == 'US-Mexico Border'

Zone1 = df.loc[df['Port Name'].isin(zone1_ports)].reset_index(drop=True)
Zone2 = df.loc[on_mexico_border].reset_index(drop=True)
Zone3 = df.loc[df['Port Name'].isin(zone3_ports)].reset_index(drop=True)

In [98]:
# Number of rows in Zone 1
len(Zone1)

7623

In [99]:
# Number of rows in Zone 2
len(Zone2)

27304

In [100]:
# Number of rows in Zone 3
len(Zone3)

2221

In [135]:
def logisitic(X,y):
    """Fit a cross-validated logistic regression and print its train/test scores.

    X holds the features and y the class labels. The data is split 80/20 with
    a fixed seed (random_state=42), a LogisticRegressionCV (lbfgs solver,
    5 folds, up to 1000 iterations) is fitted on the training part, and the
    value of ``score`` on each part is printed. Nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000).fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print("train_score :",train_score,"test_score =",test_score)
    
def LogRegressionOneHot(z,f,i):
    """Bin ``z['Value']`` into quantile classes and fit a logistic regression on them.

    Parameters
    ----------
    z : pandas.DataFrame
        Must contain a 'Value' column and every column named in ``f``.
        It is not modified.
    f : list of str
        Feature column names. 'Measure', if present, is one-hot encoded;
        every other column is used as-is. The list is not modified.
    i : int
        Number of quantile bins requested from ``pd.qcut``. Because
        ``duplicates='drop'`` is used, fewer bins may actually be produced.

    Returns
    -------
    None
        The train/test scores are printed by ``logisitic``.
    """
    # Keep the bins in a local Series instead of writing a 'quant' column
    # into the caller's frame.
    quant = pd.qcut(z['Value'], q=i, duplicates='drop')
    y = LabelEncoder().fit_transform(quant)

    # Build a new list rather than calling f.remove(), which altered the
    # caller's list in place.
    features = [col for col in f if col != 'Measure']

    if 'Measure' in f:
        one_hot = OneHotEncoder(categories="auto")
        encoded = one_hot.fit_transform(z[["Measure"]]).toarray()
        # Reuse z's index so concat lines the rows up whatever index z has,
        # and use string column names: newer scikit-learn versions reject
        # frames whose column names mix str and int.
        dummy_names = ['Measure_' + str(cat) for cat in one_hot.categories_[0]]
        cat_to_onehot = pd.DataFrame(encoded, index=z.index, columns=dummy_names)
        X = pd.concat((z[features], cat_to_onehot), axis=1)
    else:
        X = z[features]

    # The original called `logRegressionOneHot`, a name that is not defined
    # anywhere (NameError on a fresh kernel); the fitting helper is `logisitic`.
    logisitic(X, y)

In [139]:
# Logistic regression on Zone1 with only Measure as a feature and 5 categories (intervals) for y
LogRegressionOneHot(z=Zone1, f=['Measure'], i=5)

train_score : 0.4850770744506396 test_score = 0.47475409836065574


In [140]:
# Same experiment with Port Code as the only feature
LogRegressionOneHot(z=Zone1, f=['Port Code'], i=5)

train_score : 0.20137750081994096 test_score = 0.19672131147540983


In [141]:
# Same experiment with Year as the only feature
LogRegressionOneHot(z=Zone1, f=['Year'], i=5)

train_score : 0.20137750081994096 test_score = 0.19672131147540983


As we have 5 intervals, a random guess has a 1/5 = 0.2 chance of predicting the right category.
So Year and Port Code, taken alone, do not help predict our dependent variable: their scores (≈ 0.20) are no better than chance.

In [145]:
# All four features together, still with 5 categories for y
LogRegressionOneHot(z=Zone1, f=['Measure', 'Port Code', 'Month', 'Year'], i=5)

train_score : 0.5126270908494588 test_score = 0.5140983606557377


In [148]:
# Sweep the requested number of categories for y from 2 to 14, using all four
# features, to see how the scores change with the number of classes.
for x in range(2, 15):
    LogRegressionOneHot(Zone1,['Measure','Port Code','Month','Year'],x)

train_score : 0.9135782223679895 test_score = 0.9186885245901639
train_score : 0.7702525418169892 test_score = 0.7645901639344262
train_score : 0.6566087241718597 test_score = 0.6675409836065573
train_score : 0.5126270908494588 test_score = 0.5140983606557377
train_score : 0.6190554280091833 test_score = 0.6347540983606558
train_score : 0.4178419153820925 test_score = 0.42360655737704916
train_score : 0.3935716628402755 test_score = 0.39934426229508196
train_score : 0.3633978353558544 test_score = 0.37245901639344264
train_score : 0.3581502131846507 test_score = 0.3534426229508197
train_score : 0.37143325680551 test_score = 0.3704918032786885
train_score : 0.3273204329288291 test_score = 0.33704918032786885
train_score : 0.33781567727123646 test_score = 0.3298360655737705
train_score : 0.30649393243686457 test_score = 0.2957377049180328


As we increase the number of categories for y, the test_score tends to decrease:
    with more categories, there is less chance of predicting the correct one.

In [None]:
# The year does not have much influence on our dependent variable.
# The main influence is the measure.
# The low test_score could be due to the small sample size (≈ 7000 rows).
# We will try to increase the sample by using all the US-Mexico border ports.

In [None]:
# Try different zones
# Try different regressions
# Try different categories (more categories with fewer samples in each)
# Compare models