In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [83]:
data = pd.read_csv("../data/DataWithLocationCleaned.csv.gz")
df = data.copy()

In [90]:
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month

In [91]:
df = df[df['Measure'].isin(['Bus Passengers','Pedestrians','Personal Vehicle Passengers','Train Passengers'])].reset_index().drop(columns=['index'])
df.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Year,Month,position
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459,2019,6,"(48.905266, -95.314404)"
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2,2019,6,"(48.905266, -95.314404)"
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Bus Passengers,63,2019,6,"(48.905266, -95.314404)"
3,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Personal Vehicle Passengers,7385,2019,6,"(48.7710371, -95.7697882)"
4,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Bus Passengers,118,2019,6,"(48.7710371, -95.7697882)"


In [92]:
## Plot the map of the USA with importance of ports.
import plotly.express as px
import plotly.graph_objects as go

def getlat(x):
    lat = x.partition(',')[0][1:]
    if lat[0] == "'":
        lat = lat[1:-1]
    return lat

def getlong(x):
    lat = x.partition(',')[2]
    if lat[1] == "'":
        lat = lat[2:-2]
    else:
        lat = lat[1:-1]
    return lat

df = df[df['Measure'].isin(['Bus Passengers','Pedestrians','Personal Vehicle Passengers','Train Passengers'])].reset_index().drop(columns=['index'])

persons_location = df[['position','Value']].groupby('position').sum().reset_index()
persons_location['latitude'] = persons_location['position'].apply(lambda x: getlat(x))
persons_location['longitude'] = persons_location['position'].apply(lambda x: getlong(x))

ps = data[['Port Name','position']].drop_duplicates().set_index('position')
persons_location['Ports'] = persons_location.position.apply(lambda x : ', '.join(ps.loc[x].values.flatten()))
persons_location['text'] = persons_location['Ports'] + '<br>Crossings: ' + (persons_location['Value']/1e6).astype(str)+' million'

color = "crimson"
scale = 500000

fig = go.Figure()
fig.add_trace(go.Scattergeo(
    locationmode = 'USA-states',
    lon = persons_location['longitude'],
    lat = persons_location['latitude'],
    text = persons_location['text'],
    marker = dict(
        size = persons_location['Value']/scale,
        color = color,
        line_color='rgb(40,40,40)',
        line_width=0.5,
        sizemode = 'area')))

fig.update_layout(
        title_text = 'US Borders, total inbound persons since 1996<br>(Click legend to toggle traces)',
        showlegend = False,
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )

fig.show()

In [93]:
def logistic(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    LR = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000)
    LR.fit(X_train,y_train)
    test_score = LR.score(X_test, y_test)
    train_score = LR.score(X_train,y_train)
    print("train_score :",train_score,"test_score =",test_score)
    return (train_score,test_score)

def encodeOneHot(z,f,i):
    test = z
    test['quant'] = pd.qcut(test['Value'],q = i, duplicates = 'drop')
    le = LabelEncoder()
    y = le.fit_transform(test["quant"])
    one_hot = OneHotEncoder(categories="auto")
    if ('Measure' in f):
        f.remove('Measure')
        cat_to_onehot = one_hot.fit_transform(test[["Measure"]]).toarray()
        cat_to_onehot = pd.DataFrame(cat_to_onehot)
        X = pd.concat((test[f],cat_to_onehot),axis=1)
        logistic(X,y)
    else :
        X = test[f]
        logistic(X,y)
        
def  computeLogistic(ports,f,i):
    Zone = df[df['Port Name'].isin(ports)].reset_index(drop=True)
    encodeOneHot(Zone,f,i)

In [94]:
# Logistic regression on Zone3 (El paso + Hidalgo) with all features and 8 categories(intervals) for y
computeLogistic(['El Paso','Hidalgo'],['Measure','Port Code','Month','Year'],8)

train_score : 0.7972972972972973 test_score = 0.8


In [95]:
computeLogistic(['El Paso'],['Measure','Port Code','Month','Year'],5)

train_score : 0.7073170731707317 test_score = 0.672566371681416


In [96]:
computeLogistic(['Hidalgo'],['Measure','Port Code','Month','Year'],5)

train_score : 0.8146453089244852 test_score = 0.7442922374429224


In [97]:
computeLogistic(['Hidalgo'],['Measure','Month','Year','Year'],5)

train_score : 0.7482837528604119 test_score = 0.7168949771689498


In [98]:
x = 2
train_err = []
test_err = []
while (x<15):
    print("for",x,"categories :")
    computeLogistic(['El Paso','Hidalgo'],['Measure','Port Code','Month','Year'],x)
    x = x + 1

for 2 categories :
train_score : 0.9921171171171171 test_score = 0.9910112359550561
for 3 categories :
train_score : 0.8693693693693694 test_score = 0.8651685393258427
for 4 categories :
train_score : 0.956081081081081 test_score = 0.950561797752809
for 5 categories :
train_score : 0.8265765765765766 test_score = 0.8224719101123595
for 6 categories :
train_score : 0.6773648648648649 test_score = 0.6921348314606741
for 7 categories :
train_score : 0.7511261261261262 test_score = 0.7730337078651686
for 8 categories :
train_score : 0.7972972972972973 test_score = 0.8
for 9 categories :
train_score : 0.6182432432432432 test_score = 0.6112359550561798
for 10 categories :
train_score : 0.5957207207207207 test_score = 0.5910112359550562
for 11 categories :
train_score : 0.5444819819819819 test_score = 0.5438202247191011
for 12 categories :
train_score : 0.5518018018018018 test_score = 0.5348314606741573
for 13 categories :
train_score : 0.5720720720720721 test_score = 0.5550561797752809
for 1

As we increase the number of categories for y, the test_score decrease :
    There is less luck to predict the correct category.
    

In [93]:
for port in ports:
    print(port)
    print
    computeLogistic([port],['Measure','Month','Year','Year'],8)

Warroad
train_score : 0.4046511627906977 test_score = 0.39814814814814814
Roseau
train_score : 0.7683073229291717 test_score = 0.8133971291866029
Detroit
train_score : 0.5601799775028121 test_score = 0.5739910313901345
Del Rio
train_score : 0.6179245283018868 test_score = 0.5943396226415094
Anacortes
train_score : 0.6714801444043321 test_score = 0.6906474820143885
Sumas
train_score : 0.3968957871396896 test_score = 0.336283185840708
Port Angeles
train_score : 0.7326388888888888 test_score = 0.7241379310344828
Point Roberts
train_score : 0.4954128440366973 test_score = 0.4931506849315068
Oroville
train_score : 0.6885057471264368 test_score = 0.6055045871559633
Nighthawk
train_score : 0.7463592233009708 test_score = 0.7378640776699029
Metaline Falls
train_score : 0.661271676300578 test_score = 0.6728110599078341
Lynden
train_score : 0.5103686635944701 test_score = 0.4541284403669725
Laurier
train_score : 0.4936998854524628 test_score = 0.4429223744292237
Frontier
train_score : 0.53555045

ValueError: n_splits=5 cannot be greater than the number of members in each class.