In [5]:
import json
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re
from collections import Counter

# -------------------------------------------------------------------
# Maritime Word Cloud Generation
# -------------------------------------------------------------------
def create_maritime_wordcloud(data_path, output_path='maritime_wordcloud.png'):
    """
    Create a word cloud PNG from a JSON corpus of maritime text.

    The JSON file is expected to hold a sequence of objects. Each object
    contributes the text stored under its 'input' key, or under 'content'
    when 'input' is absent (e.g. Wikipedia extracts or news articles).

    Args:
        data_path: Path to the JSON file containing the text records.
        output_path: Path where the PNG will be saved.

    Returns:
        The generated WordCloud object.

    Raises:
        KeyError: If a record has neither an 'input' nor a 'content' key.
        ValueError: If the corpus holds no text, or no word survives filtering.
    """

    print("Loading data...")
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_text = _collect_record_text(data)
    # Fail fast with a clear message instead of a late error from WordCloud.
    if not all_text.strip():
        raise ValueError(f"No text found in {data_path!r}; cannot build a word cloud.")

    custom_stopwords = set(STOPWORDS)
    custom_stopwords.update({
        'ship', 'vessel', 'also', 'used', 'may', 'one', 'two', 'first',
        'called', 'known', 'however', 'often', 'usually', 'many', 'use',
        'using', 'including', 'include', 'includes', 'since', 'although',
        'within', 'without', 'would', 'could', 'should', 'must', 'shall',
        'will', 'can', 'wikipedia', 'article', 'section', 'page', 'edit',
        'source', 'citation', 'needed', 'retrieved', 'references', 'external',
        'links', 'see', 'main', 'disambiguation', 'redirect'
    })

    # Domain terms that are always kept, bypassing the stopword/length filter.
    maritime_keywords = {
        'maritime', 'cargo', 'port', 'charter', 'charterparty', 'freight',
        'shipping', 'container', 'tanker', 'bulk', 'carrier', 'terminal',
        'berth', 'navigation', 'tonnage', 'dwt', 'teu', 'laytime', 'demurrage',
        'bill', 'lading', 'voyage', 'time', 'bareboat', 'shipowner', 'charterer',
        'marine', 'ocean', 'sea', 'naval', 'admiralty', 'insurance', 'salvage',
        'towage', 'pilotage', 'stevedoring', 'bunker', 'ballast', 'draft',
        'loading', 'discharge', 'stowage', 'manifest', 'customs', 'quarantine'
    }

    print("Processing text...")
    filtered_words = _filter_words(all_text, custom_stopwords, maritime_keywords)
    if not filtered_words:
        raise ValueError(
            f"No usable words left after filtering {data_path!r}; cannot build a word cloud."
        )
    processed_text = ' '.join(filtered_words)

    # Shared by the WordCloud config and the colour mapping so the two cannot drift.
    max_font_size = 180

    # -------------------------------------------------------------------
    # Word Cloud Config
    # -------------------------------------------------------------------
    wordcloud = WordCloud(
        width=3200,
        height=2000,
        background_color='white',
        max_words=150,
        relative_scaling=0.6,
        min_font_size=18,
        max_font_size=max_font_size,
        font_path=None,
        prefer_horizontal=0.7,
        color_func=_make_maritime_color_func(max_font_size),
        collocations=True,
        random_state=42,
        margin=20,
        contour_width=0,
        contour_color='steelblue'
    ).generate(processed_text)

    fig, ax = plt.subplots(figsize=(16, 10), dpi=200)
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        fig.tight_layout(pad=0)

        fig.savefig(output_path,
                    format='png',
                    dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.1,
                    facecolor='white',
                    edgecolor='none')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
    print(f"Word cloud saved to: {output_path}")

    return wordcloud


def _collect_record_text(records):
    """Join each record's 'input' text (or 'content' as a fallback) with spaces."""
    texts = []
    for index, item in enumerate(records):
        if 'input' in item:
            texts.append(item['input'])
        elif 'content' in item:
            texts.append(item['content'])
        else:
            raise KeyError(
                f"record {index} has neither an 'input' nor a 'content' key"
            )
    return ' '.join(texts)


def _filter_words(raw_text, stopwords, always_keep):
    """Lower-case and tokenise raw_text, returning the words worth plotting."""
    text = re.sub(r'\[\d+\]', '', raw_text)      # drop citation markers such as [12]
    text = re.sub(r'[^\w\s\'-]', ' ', text)      # punctuation -> space, keep ' and -

    kept = []
    # str.split() already collapses runs of whitespace, so no extra normalisation.
    for token in text.lower().split():
        # Quotes/hyphens on the edges would otherwise hide a word from the
        # keyword lookup and make it fail the isalpha() test.
        word = token.strip("'-")
        if word in always_keep:
            kept.append(word)
        # isalpha() already rules out purely numeric tokens.
        elif word not in stopwords and len(word) > 2 and word.isalpha():
            kept.append(word)
    return kept


def _make_maritime_color_func(max_font_size):
    """Build a WordCloud color_func mapping larger fonts to earlier palette entries."""
    colors = [
        (0, 60, 100),      # Navy blue
        (0, 86, 119),      # Dark teal
        (0, 119, 139),     # Ocean blue
        (0, 146, 166),     # Medium teal
        (28, 169, 201),    # Light ocean
        (72, 185, 213),    # Sky blue
        (25, 94, 131),     # Deep sea
        (40, 114, 151),    # Maritime blue
    ]

    def maritime_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        # Scale by the configured maximum so the fraction stays within [0, 1];
        # a negative index would silently wrap around to the end of the palette.
        fraction = 1 - font_size / max_font_size
        idx = int(fraction * len(colors))
        idx = max(0, min(idx, len(colors) - 1))
        return "rgb(%d, %d, %d)" % colors[idx]

    return maritime_color_func

In [7]:
# Render the word cloud for the combined news-articles JSON and keep the
# returned WordCloud object around for later cells.
data_path = "../data/newsarticles/news_articles_combined.json"
wordcloud = create_maritime_wordcloud(data_path, output_path="new_articles_wordcloud.png")

Loading data...
Processing text...
Word cloud saved to: new_articles_wordcloud.png
