#### Clean and normalize container CSVs from all years

In [2]:
import numpy as np
import pandas as pd
import os

# Lift the column display cap so wide DataFrames are shown with every column
pd.options.display.max_columns = None

In [3]:
# Load only the first 100 rows of the raw combined container file for a quick look.
# The path is assembled with os.path.join so the separator matches the host OS;
# on Windows this produces the same backslash-separated path as a hard-coded literal.
container_sample = pd.read_csv(
    os.path.join(
        "..",
        "data",
        "bronze_raw",
        "CBP AMS Shipping Data",
        "raw-layer",
        "ams_container_combined.csv",
    ),
    nrows=100,
)
# head(0) keeps zero rows, so the cell output is just the column names
container_sample.head(0)

Unnamed: 0,identifier,container_number,seal_number_1,seal_number_2,equipment_description_code,container_length,container_height,container_width,container_type,load_status,type_of_service,source_year


#### Create a new column for volume by multiplying the dimensions
##### The length, height, and width are measured in mm

In [None]:
# Raw combined container CSV to read, and the cleaned CSV to write.
# Paths are built with os.path.join so the separator matches the host OS;
# on Windows the resulting strings are identical to backslash-separated literals.
input_path = os.path.join(
    "..",
    "data",
    "bronze_raw",
    "CBP AMS Shipping Data",
    "raw-layer",
    "ams_container_combined.csv",
)
output_path = os.path.join("..", "data", "silver_cleaned", "ams_container_cleaned.csv")

# Number of rows pandas reads per chunk when streaming the CSV
chunk_size = 1_000_000

##### Create the new column `Volume (meters cubed)` by multiplying the dimensions (mm) and converting from cubic millimeters to cubic meters

In [20]:
# Stream the raw CSV chunk by chunk, add the volume column, and write the result out.
# NOTE(review): the notebook text says the dimensions are in mm, but sample rows such as
# 4000 x 906 x 800 look small for a shipping container in mm — confirm the units.
for chunk_number, frame in enumerate(pd.read_csv(input_path, chunksize=chunk_size)):
    is_first_chunk = chunk_number == 0

    # length * width * height, scaled down by 1000**3 and rounded to three decimals
    dimension_product = (
        frame["container_length"]
        .mul(frame["container_width"])
        .mul(frame["container_height"])
    )
    frame["Volume (meters cubed)"] = dimension_product.div(1000 ** 3).round(3)

    # The first chunk (re)creates the file with a header row; later chunks append data only
    write_mode = "w" if is_first_chunk else "a"
    frame.to_csv(output_path, mode=write_mode, index=False, header=is_first_chunk)

In [None]:
# Read back the first 100 rows of the cleaned file to spot-check the new volume column
sample_cleaned = pd.read_csv(output_path, nrows=100)
# Display the first ten rows
sample_cleaned.iloc[:10]

Unnamed: 0,identifier,container_number,seal_number_1,seal_number_2,equipment_description_code,container_length,container_height,container_width,container_type,load_status,type_of_service,source_year,Volume (meters cubed)
0,201801010,FCIU9250931,EMCCES9186,,Container,4000,906,800,,Loaded,Container Station,2018,2.899
1,201801011,EITU1595313,EMCCES9076,,Container,4000,906,802,4EB0,Loaded,Container Yard,2018,2.906
2,201801012,FCIU9250931,EMCCES9186,,Container,4000,906,800,,Loaded,Container Station,2018,2.899
3,201801013,BMOU5389685,EMCCES8776,,Container,0,0,0,,Loaded,,2018,0.0
4,201801014,EMCU5289450,EMCCES8446,,Container,4000,900,800,45R1,Loaded,Container Yard,2018,2.88
5,201801015,EMCU1425938,EMCCES9026,,40 ft. IL Container (Closed Top),0,0,0,,Loaded,,2018,0.0
6,201801016,IMTU9063752,EMCCES9066,,40 ft. IL Container (Closed Top),0,0,0,,Loaded,Container Station,2018,0.0
7,201801017,EMCU9900438,EMCBMW4905,,Container,0,0,0,,Loaded,Pier to Pier,2018,0.0
8,201801018,EMCU1425938,EMCCES9026,,40 ft. IL Container (Closed Top),0,0,0,,Loaded,,2018,0.0
9,201801019,EITU1276023,EMCCES9116,,Container,4000,900,800,45G0,Loaded,Container Yard,2018,2.88


#### Drop unneeded columns

In [1]:
# Columns to remove from the cleaned container file
drop_columns = [
    "seal_number_1",
    "seal_number_2",
    "equipment_description_code",
    "container_type",
    "load_status",
    "type_of_service",
]

# Paths are built with os.path.join so the separator matches the host OS;
# on Windows the resulting strings are identical to backslash-separated literals.
# The temp file sits in the same directory as the file it will later replace.
file_path = os.path.join("..", "data", "silver_cleaned", "ams_container_cleaned.csv")
temp_path = os.path.join("..", "data", "silver_cleaned", "silver_container_temp.csv")

# Number of rows pandas reads per chunk when streaming the CSV
chunk_size = 1_000_000

In [5]:
# Stream the cleaned CSV in chunks and rewrite it without the columns in drop_columns.
# The usecols callable tells the parser to skip the unwanted columns while reading, so
# they are not parsed into each chunk only to be dropped again. Names in drop_columns
# that are absent from the file never match anything, so no error is raised for them.
skipped_columns = frozenset(drop_columns)

first = True
for chunk in pd.read_csv(
    file_path,
    chunksize=chunk_size,
    usecols=lambda column: column not in skipped_columns,
):
    # The first chunk (re)creates the temp file with a header row; later chunks append data only
    chunk.to_csv(temp_path, mode="w" if first else "a", index=False, header=first)
    first = False

# Swap the slimmed-down temp file in place of the original cleaned file
os.replace(temp_path, file_path)

In [None]:
# Re-read the first 100 rows of the rewritten file to confirm the columns are gone
container_sample = pd.read_csv(file_path, nrows=100)
# Display the first five rows
container_sample.iloc[:5]

Unnamed: 0,identifier,container_number,container_length,container_height,container_width,source_year,Volume (meters cubed)
0,201801010,FCIU9250931,4000,906,800,2018,2.899
1,201801011,EITU1595313,4000,906,802,2018,2.906
2,201801012,FCIU9250931,4000,906,800,2018,2.899
3,201801013,BMOU5389685,0,0,0,2018,0.0
4,201801014,EMCU5289450,4000,900,800,2018,2.88
