## Create variables

In [1]:
#some imports
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import datetime as dt 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import datetime as dt
sns.set()

In [3]:
#import dataset
data = pd.read_csv("../data/DataWithLocationCleaned.zip")
data.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Full,133,"(48.905266, -95.314404)"
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Empty,298,"(48.905266, -95.314404)"
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicles,10383,"(48.905266, -95.314404)"
3,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459,"(48.905266, -95.314404)"
4,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2,"(48.905266, -95.314404)"


In [4]:
data.dtypes

Port Name    object
State        object
Port Code     int64
Border       object
Date         object
Measure      object
Value         int64
position     object
dtype: object

In [5]:
data['Date'] = pd.to_datetime(data['Date'])

In [6]:
data.groupby("Measure")[['Value']].sum().reset_index()

Unnamed: 0,Measure,Value
0,Bus Passengers,143388212
1,Buses,8604318
2,Pedestrians,1056699751
3,Personal Vehicle Passengers,5504073464
4,Personal Vehicles,2584776249
5,Rail Containers Empty,21477185
6,Rail Containers Full,38891558
7,Train Passengers,6287351
8,Trains,912969
9,Truck Containers Empty,64852271


In [7]:
#Let's consider only the measure which indicate a traffic of person : 'Bus passengers', 'Pedestrians', 'Personal Vehicle Passengers' and 'Train Passengers'
persons = data[data['Measure'].isin(['Bus Passengers','Pedestrians','Personal Vehicle Passengers','Train Passengers'])].reset_index().drop(columns=['index'])
persons.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459,"(48.905266, -95.314404)"
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2,"(48.905266, -95.314404)"
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Bus Passengers,63,"(48.905266, -95.314404)"
3,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Personal Vehicle Passengers,7385,"(48.7710371, -95.7697882)"
4,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Bus Passengers,118,"(48.7710371, -95.7697882)"


In [8]:
#To regress on Date Values, we have to translate them in ordinal numbers.
persons['Ordinal_Date']=persons['Date'].map(dt.datetime.toordinal)
persons.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position,Ordinal_Date
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459,"(48.905266, -95.314404)",737211
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2,"(48.905266, -95.314404)",737211
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Bus Passengers,63,"(48.905266, -95.314404)",737211
3,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Personal Vehicle Passengers,7385,"(48.7710371, -95.7697882)",737211
4,Roseau,Minnesota,3426,US-Canada Border,2019-06-01,Bus Passengers,118,"(48.7710371, -95.7697882)",737211


In [9]:
persons['year'] = persons['Date'].dt.year
persons = persons.groupby(['year','Port Name','Measure'])['Value'].sum().reset_index()


In [10]:
persons

Unnamed: 0,year,Port Name,Measure,Value
0,1996,Alcan,Bus Passengers,22066
1,1996,Alcan,Pedestrians,7
2,1996,Alcan,Personal Vehicle Passengers,122310
3,1996,Alcan,Train Passengers,0
4,1996,Alexandria Bay,Bus Passengers,68098
5,1996,Alexandria Bay,Pedestrians,80
6,1996,Alexandria Bay,Personal Vehicle Passengers,1966213
7,1996,Alexandria Bay,Train Passengers,0
8,1996,Ambrose,Bus Passengers,10
9,1996,Ambrose,Pedestrians,0


In [11]:
dfDummies1 = pd.get_dummies(persons['Port Name'], prefix = 'category')
dfDummies2 = pd.get_dummies(persons['Measure'], prefix = 'category')

persons = pd.concat([persons, dfDummies1], axis=1)
persons = pd.concat([persons, dfDummies2], axis=1)

In [12]:
persons = persons.drop(['Port Name','Measure'], 1)

In [13]:
persons.head()

Unnamed: 0,year,Value,category_Alcan,category_Alexandria Bay,category_Ambrose,category_Anacortes,category_Andrade,category_Antler,category_Bar Harbor,category_Baudette,...,category_Warroad,category_Westhope,category_Whitetail,category_Whitlash,category_Wildhorse,category_Willow Creek,category_Bus Passengers,category_Pedestrians,category_Personal Vehicle Passengers,category_Train Passengers
0,1996,22066,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1996,7,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1996,122310,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1996,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1996,68098,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [14]:
x = b_cross_en[['year','State','Border']]
y = persons['Value']

NameError: name 'b_cross_en' is not defined

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state = 52)

NameError: name 'x' is not defined

In [16]:
mod = LinearRegression()

In [17]:
model = mod.fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [18]:
model.intercept_

NameError: name 'model' is not defined

In [19]:
model.coef_

NameError: name 'model' is not defined

In [20]:
y_predict = mod.predict(x_test)

NameError: name 'x_test' is not defined

In [21]:
r2_score(y_test, y_predict)

NameError: name 'y_test' is not defined

In [22]:
mean_squared_error(y_test, y_predict)

NameError: name 'y_test' is not defined

In [23]:
plt.scatter(x_train.iloc[:,0], y_train)
plt.scatter(x_test.iloc[:,0], y_test)

plt.xlabel("dsad")
plt.ylabel('Value')
plt.show()


NameError: name 'x_train' is not defined

In [24]:
print(model.predict(np.array([2019,0,0]).reshape(-1,3)))

NameError: name 'model' is not defined