# Cleaning Master

### Cleaning "Border_Crossing_Entry_Data.zip"

In [6]:
# load helpful packages 
import pandas as pd # data processing
import numpy as np # linear algebra

Let's first import the data and see what the dataset looks like.

In [7]:
# Read the raw border-crossing records straight from the zipped CSV.
# Each row holds one count ("Value") for one crossing method ("Measure").
# Alternative input, to be deleted: "../data/data_with_position.zip"
df = pd.read_csv("../data/Border_Crossing_Entry_Data.zip")
df

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value
0,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Full,133
1,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Empty,298
2,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Personal Vehicles,10383
3,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Personal Vehicle Passengers,19459
4,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Pedestrians,2
5,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Buses,3
6,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Bus Passengers,63
7,Roseau,Minnesota,3426,US-Canada Border,06/01/2019 12:00:00 AM,Trucks,290
8,Roseau,Minnesota,3426,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Full,110
9,Roseau,Minnesota,3426,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Empty,182


Now we look at the data structure

In [3]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Port Name    object
State        object
Port Code     int64
Border       object
Date         object
Measure      object
Value         int64
dtype: object

We see that some data types need reformatting: `Date` is stored as plain text (`object`) and should be converted to a datetime type.

In [8]:
# Convert 'Date' from text to datetime64.
# The raw strings look like "06/01/2019 12:00:00 AM"; passing the format explicitly
# removes any month/day guessing and avoids per-value format inference.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
# Re-display the dtypes to confirm that 'Date' is now datetime64[ns].
df.dtypes

Port Name            object
State                object
Port Code             int64
Border               object
Date         datetime64[ns]
Measure              object
Value                 int64
dtype: object

In [5]:
# TODO: check that 'Border' takes only 2 possible values
# TODO: check which years are included in the dataset

In [6]:
# Preview the first rows of the frame after the 'Date' conversion.
df.head()

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value
0,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Full,133
1,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Truck Containers Empty,298
2,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicles,10383
3,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Personal Vehicle Passengers,19459
4,Warroad,Minnesota,3423,US-Canada Border,2019-06-01,Pedestrians,2


In [9]:
# Flag every column that contains at least one missing value.
has_missing = df.isna().any()
has_missing

Port Name    False
State        False
Port Code    False
Border       False
Date         False
Measure      False
Value        False
dtype: bool

We don't have any missing values. Now let's check the number of distinct values in each column.

In [10]:
# Print the number of distinct values found in every column.
print('Attribute Values')
for column in df.columns:
    distinct = df[column].unique()
    print(column, len(distinct))

Attribute Values
Port Name 116
State 15
Port Code 117
Border 2
Date 282
Measure 12
Value 53725


In [11]:
# Compare how many distinct port names and port codes the dataset contains.
n_port_names = df['Port Name'].unique().size
n_port_codes = df['Port Code'].unique().size
print('port names : {}'.format(n_port_names))
print('port codes : {}'.format(n_port_codes))

port names : 116
port codes : 117


We see that there is one more port code than there are port names, which means two different ports share the same name. Let's find them.

In [12]:
# One row per distinct (code, name) pair ...
ports = df.loc[:, ['Port Code', 'Port Name']].drop_duplicates()
# ... then keep only the names that appear under more than one code.
shared_name = ports['Port Name'].duplicated(keep=False)
ports.loc[shared_name]

Unnamed: 0,Port Code,Port Name
525,3302,Eastport
562,103,Eastport


In [13]:
# Show one sample row per port code among the rows named 'Eastport'.
eastport_rows = df[df['Port Name'] == 'Eastport']
eastport_rows.drop_duplicates(subset=['Port Code'], keep='first')

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value
525,Eastport,Idaho,3302,US-Canada Border,2019-06-01,Bus Passengers,43
562,Eastport,Maine,103,US-Canada Border,2019-06-01,Bus Passengers,179


This happens because there are two ports with the same name in different states (Idaho and Maine). We fix it by giving each one a distinct port name.

In [14]:
# Give the two 'Eastport' ports distinct names, one per state.
# NOTE(review): 'MA' is the postal abbreviation of Massachusetts; Maine's is 'ME'.
# The label is left untouched here because the exported file carries it — confirm before renaming.
is_eastport = df['Port Name'] == 'Eastport'
for state, new_name in (('Idaho', 'Eastport_ID'), ('Maine', 'Eastport_MA')):
    df.loc[is_eastport & (df['State'] == state), 'Port Name'] = new_name
# Display one row per affected port code to verify the new names.
df.loc[df['Port Code'].isin([3302, 103])].drop_duplicates(['Port Code'], keep='first')

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value
525,Eastport_ID,Idaho,3302,US-Canada Border,2019-06-01,Bus Passengers,43
562,Eastport_MA,Maine,103,US-Canada Border,2019-06-01,Bus Passengers,179


In [15]:
# After the rename, the distinct-name and distinct-code counts are printed again for comparison.
name_count = df['Port Name'].unique().size
code_count = df['Port Code'].unique().size
print('port names : {}'.format(name_count))
print('port codes : {}'.format(code_count))

port names : 117
port codes : 117


## Adding Positions to "Border_Crossing_Entry_Data.zip"

In [17]:
from geopy.geocoders import Nominatim

ModuleNotFoundError: No module named 'geopy'

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `data` is not assigned in any visible cell of this notebook (earlier cells use `df`).
# Presumably it is the border-crossing frame loaded elsewhere — confirm and load it explicitly before this cell.
data.head()

We'll add a new column with the coordinates of the ports. These coordinates will allow us to plot each port on a map.

We create a new locator with geopy

In [None]:
# Nominatim geocoder created with only a user agent.
# NOTE(review): `locator` is not referenced by any later visible cell (they use `nom`) — confirm whether it is still needed.
locator = Nominatim(user_agent="myGeocoder")

In [None]:
# Geocoder instance used by `do_geocode`; created with an explicit timeout of 2
# (presumably seconds — confirm against the geopy documentation).
nom = Nominatim(user_agent="myGeocoder",timeout=2)

Create an array with the names of all the ports
and append ",USA" to every port name so that the geocoder looks for the place in the USA.

In [None]:
# Distinct port names, each suffixed with ",USA" to steer the lookup towards the United States.
port_names = data['Port Name'].unique()
lieux = port_names + ",USA"

Geocode every port and store the results in a dictionary.

In [None]:
# Filled below: maps each query string from `lieux` to the value returned by do_geocode.
coordoonates = {}

def do_geocode(address, max_retries=5, delay=1.0, geocoder=None):
    """Geocode ``address``, retrying a bounded number of times on failure.

    The lookup is retried with an exponentially growing pause. Once the retry
    budget is exhausted the last error is re-raised, so a persistent failure
    surfaces instead of recursing without limit.

    Parameters
    ----------
    address : str
        Free-text query passed to the geocoder's ``geocode`` method.
    max_retries : int, optional
        Number of retries allowed after the first failed attempt.
    delay : float, optional
        Seconds to wait before the first retry; doubled after every failure.
    geocoder : object, optional
        Any object exposing ``geocode(address)``. Defaults to the notebook-level ``nom``.

    Returns
    -------
    object
        Whatever ``geocoder.geocode(address)`` returns. Later cells treat
        ``None`` as "no result found".

    Raises
    ------
    Exception
        The last error raised by the geocoder when every attempt failed.
    """
    import time  # local import so the cell does not depend on the import cell

    if geocoder is None:
        geocoder = nom

    attempt = 0
    while True:
        try:
            return geocoder.geocode(address)
        except Exception:
            # `Exception` rather than a bare `except`, so a kernel interrupt
            # (KeyboardInterrupt) still stops the cell.
            if attempt >= max_retries:
                raise
            time.sleep(delay * 2 ** attempt)
            attempt += 1
    
# Look up every query string and keep the result under that same query string.
for query in lieux:
    result = do_geocode(query)
    coordoonates[query] = result

In [None]:
#print(coordoonates)

Build a new dictionary keyed by the exact port name (without ",USA" at the end) that holds the coordinates.

In [None]:
# Re-key the geocoding results by plain port name and reduce each hit to (latitude, longitude).
portAndCoordoonates = {}
for query, location in coordoonates.items():
    port = query[:-4]  # drop the trailing ",USA" (4 characters)
    if location is not None:
        portAndCoordoonates[port] = (location.latitude, location.longitude)
    else:
        # keep unresolved ports in the dictionary so they can be listed afterwards
        portAndCoordoonates[port] = None

Check for missing values in the port coordinates

In [None]:
# Collect the ports whose lookup produced no coordinates.
missings = [port for port, position in portAndCoordoonates.items() if position is None]
print(missings)

We see that these 5 ports have no coordinates. We'll add them manually.

In [None]:
# Manual fixes for the ports that the automatic lookup could not resolve.
# Every entry is stored as a (latitude, longitude) tuple of floats, the same
# shape as the automatically geocoded entries.
def _as_lat_lon(location):
    """Reduce a geocoder result to ``(latitude, longitude)``; pass ``None`` through."""
    if location is None:
        return None
    return (location.latitude, location.longitude)

portAndCoordoonates['Blaine'] = _as_lat_lon(do_geocode("Blaine, Washington"))
portAndCoordoonates['International Falls-Ranier'] = _as_lat_lon(do_geocode("International Falls,USA"))
portAndCoordoonates['Dalton Cache'] = _as_lat_lon(do_geocode("Pleasant Camp"))
# Hand-entered coordinates, as numbers rather than strings.
portAndCoordoonates['Van Buren'] = (47.16207, -67.94271)
# Turner is in Montana, i.e. west of Greenwich, so the longitude must be negative.
portAndCoordoonates['Turner'] = (48.84361, -108.40599)

Add the coordinates to the data as a new column

In [None]:
# Look up each row's port name in the dictionary; an unknown name raises KeyError.
positions = data['Port Name'].map(portAndCoordoonates.__getitem__)
data["position"] = positions

In [None]:
# Preview the frame, which now carries the "position" column.
data.head()

Check if data are still missing

In [None]:
# Number of missing values per column (a non-zero "position" means a port is still unresolved).
missing_per_column = data.isna().sum()
missing_per_column

Export the data with the position

In [None]:
# Write the frame with positions next to the other project data files.
# A project-relative path replaces the previous user-specific absolute path, so the
# notebook runs on any machine. `to_csv` returns None when given a path, so the
# result is not stored.
data.to_csv("../data/data_with_position.csv", index=False, header=True)

## Cleaning "data_with_position.zip"

The problem is now solved. Let's export the cleaned dataset:

In [16]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): in the visible cells `df` is loaded from Border_Crossing_Entry_Data.zip and has no
# position column, although the output file name mentions locations — confirm the intended input.
export_csv = df.to_csv(
    '../../DataWithLocationCleaned.csv',
    index=None,
    header=True,
)