In [22]:
import pandas as pd 
import numpy as np

Let's first load the dataset with the port positions added and take a look at what it contains.

In [7]:
# Read the zipped CSV that already carries the `position` column
# (pandas infers the compression from the ".zip" extension).
df = pd.read_csv(filepath_or_buffer="../data/data_with_position.zip")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position
0,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Full,133,"(48.905266, -95.314404)"
1,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Truck Containers Empty,298,"(48.905266, -95.314404)"
2,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Personal Vehicles,10383,"(48.905266, -95.314404)"
3,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Personal Vehicle Passengers,19459,"(48.905266, -95.314404)"
4,Warroad,Minnesota,3423,US-Canada Border,06/01/2019 12:00:00 AM,Pedestrians,2,"(48.905266, -95.314404)"


Now we look at the data structure

In [5]:
# Show the dtype pandas inferred for every column right after loading.
df.dtypes

Port Name    object
State        object
Port Code     int64
Border       object
Date         object
Measure      object
Value         int64
position     object
dtype: object

We see that some data types need reformatting: `Date` is stored as a plain string (`object`), so we convert it to a datetime.

In [20]:
# Convert 'Date' from strings such as "06/01/2019 12:00:00 AM" to datetime64.
# The format is spelled out (month first, 12-hour clock with AM/PM) so pandas
# does not have to guess between month-first and day-first, parsing is faster,
# and any row that does not match raises instead of being silently misread.
# Re-running the cell is harmless: an already-converted datetime column is
# returned unchanged.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Confirm that 'Date' is now datetime64[ns].
df.dtypes

Port Name            object
State                object
Port Code             int64
Border               object
Date         datetime64[ns]
Measure              object
Value                 int64
position             object
dtype: object

In [9]:
# Per-column flag: True if the column holds at least one missing value.
df.isna().any(axis=0)

Port Name    False
State        False
Port Code    False
Border       False
Date         False
Measure      False
Value        False
position     False
dtype: bool

We don't have any missing values. Now let's check the number of distinct values in each column.

In [11]:
# Print the number of distinct values held by each column, one column per
# line, under a small header. Missing values (if any) count as one value.
print('Attribute ' + 'Values')
for column in df.columns:
    print(column, df[column].nunique(dropna=False))

Attribute Values
Port Name 116
State 15
Port Code 117
Border 2
Date 282
Measure 12
Value 53725
position 116


In [12]:
# Compare how many distinct port names and port codes the data contains.
name_count = df['Port Name'].nunique(dropna=False)
code_count = df['Port Code'].nunique(dropna=False)
print('port names : {}'.format(name_count))
print('port codes : {}'.format(code_count))

port names : 116
port codes : 117


We see that there is one more port code than there are port names, so we have to find out why and fix it.

In [13]:
# One row per distinct (port code, port name) pair.
ports = df.loc[:, ['Port Code', 'Port Name']].drop_duplicates()

# Keep every pair whose name is used by more than one port code.
shared_name_mask = ports['Port Name'].duplicated(keep=False)
ports.loc[shared_name_mask]

Unnamed: 0,Port Code,Port Name
525,3302,Eastport
562,103,Eastport


In [14]:
# Show one sample row (the first encountered) for each port code named "Eastport".
df[df['Port Name'].eq('Eastport')].drop_duplicates(subset='Port Code', keep='first')

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position
525,Eastport,Idaho,3302,US-Canada Border,2019-06-01,Bus Passengers,43,"(40.8441033, -72.7200324802622)"
562,Eastport,Maine,103,US-Canada Border,2019-06-01,Bus Passengers,179,"(40.8441033, -72.7200324802622)"


This is because there are two ports with the same name in different states, so we will fix that by giving each of them a distinct port name.

In [15]:
# Give each "Eastport" a state-specific name so that port names become unique.
# NOTE(review): 'MA' is the USPS code for Massachusetts (Maine is 'ME'); the
# label is kept as-is because the renamed value ends up in the exported data
# and may be referenced elsewhere -- confirm before changing it.
for state, new_name in {'Idaho': 'Eastport_ID', 'Maine': 'Eastport_MA'}.items():
    df.loc[(df['Port Name'] == 'Eastport') & (df['State'] == state), 'Port Name'] = new_name

# Show one row per affected port code to check the new names.
df[df['Port Code'].isin([3302, 103])].drop_duplicates(subset='Port Code', keep='first')

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,position
525,Eastport_ID,Idaho,3302,US-Canada Border,2019-06-01,Bus Passengers,43,"(40.8441033, -72.7200324802622)"
562,Eastport_MA,Maine,103,US-Canada Border,2019-06-01,Bus Passengers,179,"(40.8441033, -72.7200324802622)"


In [17]:
# Re-count distinct port names and port codes after the rename.
print('port names : {}' .format(len(df['Port Name'].unique())))
print('port codes : {}' .format(len(df['Port Code'].unique())))

# Equal counts alone do not prove a one-to-one mapping, so check it explicitly:
# every distinct (code, name) pair must use its name and its code exactly once.
# The cell fails loudly if the cleaning step above ever stops being sufficient.
port_pairs = df[['Port Code', 'Port Name']].drop_duplicates()
assert port_pairs['Port Name'].is_unique, 'a port name is shared by several port codes'
assert port_pairs['Port Code'].is_unique, 'a port code maps to several port names'

port names : 117
port codes : 117


The problem is now solved. Let's export the cleaned dataset:

In [25]:
# Write the cleaned frame to CSV with a header row and without the row index.
# `to_csv` returns None when it is given a path, so the result is not bound to
# a variable; `index=False` is the documented boolean form of the option.
df.to_csv('../../DataWithLocationCleaned.csv', index=False, header=True)