This notebook looks among the Anchorages Data Set of the global fishing watch **No further Investigation Research and experimentation.**

In [1]:
import pandas as pd
import numpy as np

In [13]:
# Specify data types for columns with mixed types
dtype_spec = {
    'label': 'object',
    'sublabel': 'object',
    'iso3': 'object',
    'label_source': 'object',
    'distance_from_shore_m': 'float64',
    'drift_radius': 'float64',
    'at_dock': 'object',
    'dock': 'object'
}

# Load the data
anchorages19 = pd.read_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/named_anchorages_v1_20191205.csv', dtype=dtype_spec)
anchorages20 = pd.read_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/named_anchorages_v2_20201104.csv', dtype=dtype_spec)
anchorages22 = pd.read_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/named_anchorages_v2_20221206.csv', dtype=dtype_spec)


In [20]:
# List of anchorages dataframes
anchorages_dataframes = [anchorages19, anchorages20, anchorages22]

# Inspect each dataframe
for df, year in zip(anchorages_dataframes, ['2018', '2019', '2020', '2022']):
    print(f"\nAnchorages {year}:")
    print(df.head())
    print(f"\nMissing values in Anchorages {year}:")
    print(df.isnull().sum())
    print(f"\nData types in Anchorages {year}:")
    print(df.dtypes)



Anchorages 2018:
       s2id        lat        lon     label     label_source iso3  \
0  3e4e429b  26.914042  52.220320   SHARJAH  top_destination  IRN   
1  1a575de7  -7.715992  11.724560  BLOCK 17  top_destination  AGO   
2  3fcf5295  29.642077  48.696705  KAZ IRAQ  top_destination  KWT   
3  3fcf52bf  29.644148  48.701873  KAZ IRAQ  top_destination  KWT   
4  3fcf52bd  29.639744  48.701769  UMM QASR  top_destination  KWT   

   distance_from_shore_m  drift_radius   dock  
0                63000.0      0.056322  FALSE  
1               134000.0      0.111111  FALSE  
2                33000.0      0.162583  FALSE  
3                33000.0      0.161623  FALSE  
4                33000.0      0.149964  FALSE  

Missing values in Anchorages 2018:
s2id                     0
lat                      0
lon                      0
label                    0
label_source             0
iso3                     0
distance_from_shore_m    0
drift_radius             0
dock                     1


In [22]:
# Function to fill missing values and drop sublabel
def fill_missing_values_and_drop_sublabel(df):
    #df.drop(columns=['sublabel'], inplace=True)
    df['label'].fillna('Unknown', inplace=True)
    df['iso3'].fillna('Unknown', inplace=True)
    
    # Fill missing values if the column exists
    if 'distance_from_shore_m' in df.columns:
        df['distance_from_shore_m'].fillna(df['distance_from_shore_m'].median(), inplace=True)
    if 'drift_radius' in df.columns:
        df['drift_radius'].fillna(df['drift_radius'].median(), inplace=True)
    if 'dock' in df.columns:
        df['dock'].fillna(df['dock'].mode()[0], inplace=True)

# Apply the function to each dataframe
for df in [anchorages19, anchorages20, anchorages22]:
    fill_missing_values_and_drop_sublabel(df)

# Standardize the schema: Ensure all columns are consistent
for df in [anchorages19, anchorages20, anchorages22]:
    df.columns = ['s2id', 'lat', 'lon', 'label', 'label_source', 'iso3', 'distance_from_shore_m', 'drift_radius', 'dock']

# Merge all dataframes into one
df_anchorages_combined = pd.concat([anchorages19, anchorages20, anchorages22], ignore_index=True)

# Check the combined dataframe
print("\nCombined Anchorages DataFrame:")
print(df_anchorages_combined.info())
print(df_anchorages_combined.head())

# Save the cleaned and combined dataset
df_anchorages_combined.to_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/integrated_anchorages_data1.csv', index=False)
print("\nData saved to 'integrated_anchorages_data.csv'")



Combined Anchorages DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499505 entries, 0 to 499504
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   s2id                   499505 non-null  object 
 1   lat                    499505 non-null  float64
 2   lon                    499505 non-null  float64
 3   label                  499505 non-null  object 
 4   label_source           499505 non-null  object 
 5   iso3                   499505 non-null  object 
 6   distance_from_shore_m  499505 non-null  float64
 7   drift_radius           499505 non-null  float64
 8   dock                   499505 non-null  object 
dtypes: float64(4), object(5)
memory usage: 34.3+ MB
None
       s2id        lat        lon     label     label_source iso3  \
0  3e4e429b  26.914042  52.220320   SHARJAH  top_destination  IRN   
1  1a575de7  -7.715992  11.724560  BLOCK 17  top_destination  AGO   
2  3fcf529

In [12]:
# Inspect column names in each dataframe
print("Columns in Anchorages 2018:", anchorages18.columns)
print("Columns in Anchorages 2019:", anchorages19.columns)
print("Columns in Anchorages 2020:", anchorages20.columns)
print("Columns in Anchorages 2022:", anchorages22.columns)


Columns in Anchorages 2018: Index(['s2id', 'label', 'iso3', 'lat', 'lon', 'anchorage_group'], dtype='object')
Columns in Anchorages 2019: Index(['s2id', 'lat', 'lon', 'label', 'sublabel', 'label_source', 'iso3',
       'distance_from_shore_m', 'drift_radius', 'at_dock'],
      dtype='object')
Columns in Anchorages 2020: Index(['s2id', 'lat', 'lon', 'label', 'sublabel', 'label_source', 'iso3',
       'distance_from_shore_m', 'drift_radius', 'at_dock'],
      dtype='object')
Columns in Anchorages 2022: Index(['s2id', 'lat', 'lon', 'label', 'sublabel', 'label_source', 'iso3',
       'distance_from_shore_m', 'drift_radius', 'dock'],
      dtype='object')


In [5]:
df_anchorages.columns

Index(['s2id', 'label', 'sublabel', 'iso3', 'lat', 'lon', 'anchorage_group',
       'label_source', 'distance_from_shore_m', 'drift_radius', 'at_dock',
       'dock'],
      dtype='object')

In [6]:
df_anchorages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619253 entries, 0 to 619252
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   s2id                   619253 non-null  object 
 1   label                  619239 non-null  object 
 2   sublabel               175538 non-null  object 
 3   iso3                   619232 non-null  object 
 4   lat                    619253 non-null  float64
 5   lon                    619253 non-null  float64
 6   anchorage_group        119748 non-null  object 
 7   label_source           499505 non-null  object 
 8   distance_from_shore_m  499430 non-null  float64
 9   drift_radius           499021 non-null  float64
 10  at_dock                332987 non-null  object 
 11  dock                   166462 non-null  object 
dtypes: float64(4), object(8)
memory usage: 56.7+ MB


In [7]:
df_anchorages.isna().sum()

s2id                          0
label                        14
sublabel                 443715
iso3                         21
lat                           0
lon                           0
anchorage_group          499505
label_source             119748
distance_from_shore_m    119823
drift_radius             120232
at_dock                  286266
dock                     452791
dtype: int64

In [8]:
df_anchorages.shape

(619253, 12)

In [9]:
df_anchorages = df_anchorages.dropna(subset=['lat', 'lon'])

In [10]:
df_anchorages.isna().sum()

s2id                          0
label                        14
sublabel                 443715
iso3                         21
lat                           0
lon                           0
anchorage_group          499505
label_source             119748
distance_from_shore_m    119823
drift_radius             120232
at_dock                  286266
dock                     452791
dtype: int64

We will now save it to a csv file and use it later to merge it with ais 



In [12]:
# Last thing we will need to save the data into an csv file
df_anchorages.to_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/anchorages.csv',index=False)

In [11]:
df = pd.read_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/anchorages.csv')

  df = pd.read_csv('/Users/rodrigo/Desktop/BrainStation/Capstone/DATASETSDIF/Anchorages/anchorages.csv')


In [12]:
df.head()

Unnamed: 0,s2id,label,sublabel,iso3,lat,lon,anchorage_group,label_source,distance_from_shore_m,drift_radius,at_dock,dock
0,89c28329,0R,,USA,40.878103,-73.516031,5155,,,,,
1,89c7939b,0R,,USA,39.369316,-76.029973,5168,,,,,
2,89e826ed,0R,,USA,40.919673,-73.398545,5224,,,,,
3,89006359,0R,,USA,33.801497,-78.746408,89006359,,,,,
4,88fe7eb3,4E,,USA,32.754622,-80.011341,5056,,,,,
