In [1]:
import pandas as pd

In [3]:
import pandas as pd
import os
import glob
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

def process_json_file(json_file):
    """Load one JSON file and return its contents as a list of records.

    Args:
        json_file: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        ``[payload]`` when the top-level JSON value is an object, the payload
        itself when it is an array, and ``[]`` for any other top-level value.
        If opening or parsing the file raises, the error is printed and ``[]``
        is returned instead of propagating the exception.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except Exception as e:
        # Best-effort: report the bad file and keep going with no records.
        print(f"Error reading {json_file}: {e}")
        return []

    # Normalise the parsed value to a list of records.
    if isinstance(payload, dict):
        return [payload]
    if isinstance(payload, list):
        return payload
    return []

# Find all JSON files in data/ directory and subdirectories
all_json_files = glob.glob("data/**/*.json", recursive=True)
print(f"Found {len(all_json_files)} JSON files")

# Read every JSON file on a thread pool and collect the records in one flat list.
data_list = []
# Use at most 8 threads to avoid overwhelming the system. ThreadPoolExecutor
# raises ValueError when max_workers <= 0, so keep at least one worker even if
# the glob matched nothing; the run then reaches the "No data found" branch
# below instead of crashing here.
max_workers = max(1, min(8, len(all_json_files)))

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one task per file; the mapping keeps track of which file each future reads.
    future_to_file = {executor.submit(process_json_file, json_file): json_file 
                      for json_file in all_json_files}
    
    # Consume results in completion order so the progress bar advances as files finish.
    for future in tqdm(as_completed(future_to_file), 
                       total=len(all_json_files), 
                       desc="Processing JSON files", 
                       unit="file"):
        file_data = future.result()
        # process_json_file returns [] for unreadable or non-record files; skip those.
        if file_data:
            data_list.extend(file_data)

# Convert to DataFrame and save as CSV
if data_list:
    print("Converting to DataFrame and saving...")
    df = pd.DataFrame(data_list)
    output_file = "data/combined_vessels.csv"
    df.to_csv(output_file, index=False)
    print(f"Combined {len(data_list)} records into {output_file}")
    print(f"DataFrame shape: {df.shape}")
else:
    # Note: `df` is not created on this path, so later cells that use it need data to exist.
    print("No data found to combine")

Found 140627 JSON files


Processing JSON files: 100%|██████████| 140627/140627 [00:05<00:00, 24323.06file/s]


Converting to DataFrame and saving...
Combined 140627 records into data/combined_vessels.csv
DataFrame shape: (140627, 23)


In [4]:
# Display the combined DataFrame created by the cell above.
df

Unnamed: 0,IMO number,MMSI,Former names,Vessel type,Operating status,Flag,Length,Breadth,Year of build,Builder,...,Description,Vessel name,Gross tonnage (tons),Deadweight (tons),source_url,Classification society,Engine type,Engine model,Engine power,Draft
0,7426150,271000249,BURCIN KALKAVAN(2000)KAPTAN T.M.(1998)DURSUN R...,General cargo vessel,Active,Turkey,130m,20m,1975,"MIE SHIPYARD - YOKKAICHI, JAPAN",...,LIMA II is a General cargo vessel built in 197...,LIMA II,8033tons,12303tons,https://www.balticshipping.com/vessel/imo/7426150,,,,,
1,7330715,431401588,KAZU MARU NO.8(2013),Crane vessel,Active,Japan,,14m,1973,"MATSUBARA KOKI SHIPBUILDING - ONOMICHI, JAPAN",...,DONRICH is a Crane vessel built in 1973 by MAT...,DONRICH,446tons,1118tons,https://www.balticshipping.com/vessel/imo/7330715,,,,,
2,5353933,,TOPMAST NO.14(1949)EMPIRE FLORA(1948),Tug boat,Decommissioned or lost,Italy,37m,8m,1945,"COCHRANE SHIPBUILDERS - SELBY, U.K.",...,TAURUS is a Tug boat built in 1945 by COCHRANE...,TAURUS,299tons,,https://www.balticshipping.com/vessel/imo/5353933,REGISTRO ITALIANO NAVALE,,,,
3,6810718,,,Fishing vessel,Decommissioned or lost,Russian Federation,99m,14m,1963,"BLACK SEA SHIPYARD - NIKOLAYEV, UKRAINE",...,OKTYABRSK is a Fishing vessel built in 1963 by...,OKTYABRSK,3301tons,2506tons,https://www.balticshipping.com/vessel/imo/6810718,,,,,
4,9075670,477690400,OCEAN AMBER(2017)NOBLE UNION(2011)NUEVA UNION(...,Bulk carrier,Active,Hong Kong,224m,32m,1994,"HYUNDAI HEAVY INDUSTRIES CO. LTD. - ULSAN, SOU...",...,SOLOMON TRADER is a Bulk carrier built in 1994...,SOLOMON TRADER,38779tons,73592tons,https://www.balticshipping.com/vessel/imo/9075670,NIPPON KAIJI KYOKAI (NKK),MAN-B&W,6K67GF,12900 KW,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140622,9269635,263701290,,Passenger vessel,Active,Portugal,37m,10m,2002,"AUSTAL SHIPS - FREMANTLE, AUSTRALIA",...,CESARIO VERDE is a Passenger vessel built in 2...,CESARIO VERDE,342tons,,https://www.balticshipping.com/vessel/imo/9269635,,,,,
140623,7514012,,HOKUSHO MARU NO.61(1991)TOKAI MARU NO.38(1985)...,Fishing vessel,Decommissioned or lost,Japan,48m,8m,1975,"NIIGATA SHIPBUILDING & REPAIR - NIIGATA, JAPAN",...,HISAYOSHI MARU NO.58 is a Fishing vessel built...,HISAYOSHI MARU NO.58,299tons,,https://www.balticshipping.com/vessel/imo/7514012,,,,,
140624,5163819,,MELENGER(1962)YW 93,Chemical/Oil tanker,Decommissioned or lost,Bahamas,53m,9m,1944,CONSOLIDATED WESTERN STEEL SAN PEDRO - SAN PED...,...,IRVING HICKORY is a Chemical/Oil tanker built ...,IRVING HICKORY,614tons,1006tons,https://www.balticshipping.com/vessel/imo/5163819,,,,,
140625,9119098,548223000,,Bulk carrier,Decommissioned or lost,Philippines,199m,32m,1995,SANOYAS MIZUSHIMA WORKS & SHIPYARD - KURASHIKI...,...,WORLD SWAN is a Bulk carrier built in 1995 by ...,WORLD SWAN,39023tons,46799tons,https://www.balticshipping.com/vessel/imo/9119098,NIPPON KAIJI KYOKAI (NKK),,,,


In [7]:
# Show the "Flag" column of the combined DataFrame.
df["Flag"]

['Turkey',
 'Japan',
 'Italy',
 'Russian Federation',
 'Hong Kong',
 'United Kingdom (UK)',
 'Tuvalu',
 'Sierra Leone',
 'Canada',
 'Liberia',
 'Palau',
 'Japan',
 'Angola',
 'Singapore',
 'Panama',
 'Italy',
 'Peru',
 'Ecuador',
 'Indonesia',
 'Brazil',
 'Indonesia',
 'United Kingdom (UK)',
 'Panama',
 'Canada',
 'United States (USA)',
 'Netherlands',
 nan,
 'Russian Federation',
 'Panama',
 'Indonesia',
 'Japan',
 'Russian Federation',
 'Panama',
 'Germany',
 'France',
 'Marshall Islands',
 'Indonesia',
 'Panama',
 'Cyprus',
 'United States (USA)',
 nan,
 'United States (USA)',
 'Russian Federation',
 'Spain',
 'Honduras',
 'Vietnam',
 'India',
 'Russian Federation',
 'Equatorial Guinea',
 'Panama',
 'Russian Federation',
 nan,
 'Spain',
 'United Arab Emirates',
 'Russian Federation',
 'Indonesia',
 'Romania',
 'Denmark',
 nan,
 'Nigeria',
 'Iran',
 'United States (USA)',
 'Singapore',
 nan,
 'Palau',
 'Thailand',
 'Thailand',
 'Singapore',
 'Russian Federation',
 'Iran',
 'Panama',
