# SUBMARINE CABLES PROJECT


--------

## Classification of articles extracted from SubTelForum into different categories:
### - Geopolitical
### - Human Activity
### - Environmental
### - Aging
### - Unknown (used when scraping or classification fails)

## Extracting relevant region names from given data and tags to know the candidate places

------

### The outputs of this code are two CSV files:
- "articles_classification_results.csv", which contains the classification label, its justification and the cable name
- "articles_classified_results_with_countries_cables.csv", which additionally contains the potential region names and cable names found in the tags

In [None]:
# --- Setup your OpenRouter API key ---
# Prefer the OPENROUTER_API_KEY environment variable so the secret never has
# to be pasted into (and accidentally committed with) this notebook.  The
# original placeholder stays as the fallback, so editing the literal by hand
# still works exactly as before.
import os

API_KEY = os.environ.get('OPENROUTER_API_KEY', 'put_the_api_key')
# Chat-completions endpoint that the classification requests are POSTed to.
API_URL = 'https://openrouter.ai/api/v1/chat/completions'

------

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import re
import time
import csv
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [None]:
# --- Step 1: Scrape article text from SubTelForum ---
def scrape_article_text(url):
    """Download *url* and return the text of its most text-heavy ``<div>``.

    For every ``<div>`` on the page, the text of all ``<p>`` elements found
    beneath it is joined with single spaces.  Candidates of 300 characters or
    fewer are discarded and the longest remaining one is returned.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The extracted text, or a string starting with ``"[Scrape Failed]"``
        when the request fails, the server answers with an HTTP error status,
        or no ``<div>`` holds enough paragraph text.  Any ``Exception`` raised
        along the way is caught and reported through that return value.
    """
    try:
        response = requests.get(url, timeout=10)
        # Without this check a 4xx/5xx error page would be parsed and handed
        # on as if it were the article body.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        candidates = []
        for div in soup.find_all("div"):
            text = " ".join(p.get_text(strip=True) for p in div.find_all("p"))
            if len(text) > 300:
                candidates.append(text)

        if not candidates:
            return "[Scrape Failed] No valid content found"

        # max() keeps the first of several equally long candidates, which is
        # the same pick a stable descending sort by length would make.
        return max(candidates, key=len)

    except Exception as e:
        return f"[Scrape Failed] {e}"

# --- Step 2: Classify using OpenRouter with structured output ---
def classify_with_openrouter(title, tags, article_text):
    """Ask the OpenRouter chat model to label one article.

    Only the first 3000 characters of ``article_text`` are sent.  ``title``
    and ``tags`` are accepted for the caller's convenience but are not part
    of the prompt.

    Args:
        title: Article title (unused in the request).
        tags: Article tags (unused in the request).
        article_text: Scraped article body.

    Returns:
        A ``(category, reason, cable_name)`` tuple of strings parsed from the
        model's ``###Classification###`` / ``###Reason###`` /
        ``###Cable name###`` lines.  A field that cannot be found falls back
        to ``"Unknown"`` (``"No reason provided"`` for the reason).  On a
        non-200 response, an empty ``choices`` list, or any exception raised
        while calling the API, the category and cable name are ``"Unknown"``
        and the reason carries the error description.
    """
    excerpt = article_text[:3000]

    prompt = f"""
        You are a classification assistant. You must return the output strictly in the following format:

        ###Classification###: <Geopolitical | Human Activity | Environmental | Aging>
        ###Reason###: <Clear and concise explanation of why the classification was chosen>
        ###Cable name###: <Cable name or names present in the text>

        Only use one of the four labels. Do not include any other information or summary. Focus on the **cause** of the incident.

        Classify the following article into one of these categories: 
        - Geopolitical: Issues related to international relations, military actions, and nation-state conflicts. 
        - Human Activity: Causes from accidents, fishing, ships, construction, etc.
        - Environmental: Natural events like earthquakes, storms, seabed movement, etc.
        - Aging: Old cables, degradation, or maintenance.

        Ignore mentions of countries unless they are the direct cause. Don't assume sabotage unless clearly proven. Also, try to infer cable names even if not explicitly labeled. Don't give extra information just use whatever is given.
        The main task is to understand the context, also consider important words, think and then classify, so please focus on that. The reason should be detailed.

        --- ARTICLE START ---
        {excerpt}
        --- ARTICLE END ---
    """

    payload = {
        "model": "deepseek/deepseek-chat-v3-0324",
        "messages": [{"role": "user", "content": prompt}],
    }
    request_headers = {
        'Authorization': f'Bearer {API_KEY}',
        'Content-Type': 'application/json'
    }

    try:
        response = requests.post(API_URL, json=payload, headers=request_headers, timeout=20)

        # Guard clauses: bail out early on anything that is not a usable answer.
        if response.status_code != 200:
            return "Unknown", f"API Error: {response.status_code} - {response.text}", "Unknown"

        choices = response.json().get('choices', [])
        if not choices:
            return "Unknown", "No classification choices.", "Unknown"

        message = choices[0].get('message', {}).get('content', '').strip()

        def pick(pattern, fallback):
            # Capture the rest of the line after the marker, or use the fallback.
            found = re.search(pattern, message)
            return found.group(1).strip() if found else fallback

        return (
            pick(r"###Classification###:\s*(.*)", "Unknown"),
            pick(r"###Reason###:\s*(.*)", "No reason provided"),
            pick(r"###Cable name###:\s*(.*)", "Unknown"),
        )

    except Exception as e:
        return "Unknown", f"Failed to classify due to OpenRouter API error: {e}", "Unknown"



# --- Step 3: Process articles from DataFrame and store row by row ---
def classify_articles_from_df(df, output_file):
    """Scrape and classify every article in *df*, appending one CSV row each.

    Rows are appended to ``output_file`` (fully quoted) as soon as they are
    produced; the header row is written only when the file does not exist yet.

    Args:
        df: DataFrame with one article per row.  The "Title", "Date", "Tags"
            and "URL" columns are read; a missing column is treated as an
            empty string.
        output_file: Path of the CSV file to append to.

    Returns:
        None.  Per-article failures are written as "Unknown" rows instead of
        being raised.
    """
    file_exists = os.path.isfile(output_file)
    total = len(df)

    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)

        if not file_exists:
            writer.writerow(["Title", "Date", "Tags", "URL", "Classification", "Justification", "Cable Name"])

        # Count positions ourselves: the DataFrame index need not be 0..n-1
        # (or even numeric), so it cannot be used for the progress message.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            print(f"\n Processing article {position} of {total}")

            # Bind the row fields before the try block so the error handler
            # below always has this row's values (never unbound or stale ones).
            title = row.get("Title", "")
            date = row.get("Date", "")
            tags = row.get("Tags", "")
            url = row.get("URL", "")

            try:
                article_text = scrape_article_text(url)
                # The scraper marks failures with a "[Scrape Failed]" prefix.
                # Checking the prefix (rather than searching for "failed"
                # anywhere) keeps articles that merely mention a failed cable.
                if not article_text or article_text.startswith("[Scrape Failed]"):
                    classification = "Unknown"
                    reason = "Failed to scrape article."
                    cable_name = "Unknown"
                else:
                    classification, reason, cable_name = classify_with_openrouter(title, tags, article_text)

                writer.writerow([title, date, tags, url, classification, reason, cable_name])
                print(f"[{position}] Category: {classification} | Cable: {cable_name}\n")
                time.sleep(2.0)  # pause between articles

            except Exception as e:
                print(f"Error on article {position}: {e}")
                writer.writerow([title, date, tags, url, "Unknown", f"Error occurred: {e}", "Unknown"])

            # Push the row to disk now so finished work survives an interruption.
            file.flush()

    print("Processing complete.")


# Input table: one article per row.  classify_articles_from_df reads its
# "Title", "Date", "Tags" and "URL" columns.
df = pd.read_csv("articles_cable_faults.csv")

# Destination CSV for the classification results.
output_file = "articles_classification_results.csv"

# Opening in 'w' mode discards any previous results and writes the header
# line up front.  classify_articles_from_df then finds an existing file,
# skips its own header row and only appends data rows.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    file.write("Title,Date,Tags,URL,Classification,Justification,Cable Name\n")

# Scrape + classify every article; rows are appended to output_file.
classify_articles_from_df(df, output_file)


In [None]:
# Reload the classification results CSV produced by the previous step.
df = pd.read_csv("articles_classification_results.csv")

# spaCy pipeline whose named entities (doc.ents) are inspected by
# extract_countries_and_cables.
nlp = spacy.load("en_core_web_sm")

# Reference set of country names consulted during country detection
# (kept as a set for constant-time membership tests).
COUNTRIES = {
    "United States", "China", "Russia", "India", "France",
    "Germany", "United Kingdom", "Japan", "Canada", "Australia",
    "Brazil", "South Africa", "Italy", "Spain", "Netherlands",
    "Norway", "Denmark", "Sweden", "Finland", "Iceland",
    "Greece", "Mexico", "Argentina", "Turkey", "Indonesia",
    "Malaysia", "Philippines", "Vietnam", "Thailand", "South Korea",
    "Saudi Arabia", "Egypt", "Pakistan", "Ukraine", "Poland",
    "Ireland", "New Zealand", "Singapore", "Portugal", "Bangladesh",
    "Chile", "Taiwan", "UAE", "Iran", "Iraq",
    "Afghanistan", "Belgium", "Switzerland", "Austria", "Czech Republic",
    "Romania", "Bulgaria", "Kazakhstan", "Belarus", "Colombia",
    "Peru", "Venezuela", "Nigeria", "Kenya", "Ethiopia",
    "Morocco", "Algeria", "Tunisia", "Libya", "Sudan",
    "Israel", "Bahrain", "Qatar", "Oman", "Lebanon",
    "Syria", "Jordan", "Kuwait",
    # Not a country: the Baltic region is deliberately treated like one.
    "BALTIC",
}

# Known submarine cable system names (expandable).
# extract_countries_and_cables tests entity text against this set with an
# exact, case-sensitive `in` check, so spelling and capitalization matter.
# NOTE(review): the entries stop at the TAT-* cables; names that would sort
# after "TAT" may simply not have been added yet — confirm and extend.
CABLE_NAMES = {
    "2Africa", "AAE-1", "AAG", "AC-1", "AC-2", "ACC-1", "ACE", "Aden-Djibouti", "ADRIA-1", "AEConnect", 
    "Africa-1", "AIS", "AJC", "Alaska Communications System", "ALETAR", "Alonso de Ojeda", "ALPAL-2", 
    "AMERICAS-1 NORTH", "AMERICAS-1 SOUTH", "AMERICAS-II", "Amitié", "AMX-1", "ANNIBAL", "ANTILLAS I", 
    "Antilles Crossing Phase 1", "ANZAC Cable System", "ANZCAN", "APC", "APCN", "APCN 2", "APHRODITE-1", 
    "APHRODITE-2", "APNG-1", "APNG-2", "APG", "Apollo", "ARCOS-1", "Arctic Fibre", "Arctic Link", "ARIANE-2", 
    "ASE", "ASEAN", "ASH", "Atlantica-1/GlobeNet", "ATLANTIS", "ATLANTIS-2", "ATLAS", "Atlas Offshore", 
    "Australia Singapore Cable", "Australia West Express", "BAHAMAS 2", "BALTICA", "Banjoewangi", "Barcelona", 
    "BARGEN", "BARSAV", "BCS", "BDSNi", "BERYTAR", "Bharat Lanka Cable System", "BICS", "BMP", "Botnia", 
    "BRCS", "BS", "BSCS", "BSFOCS", "BT-MT1", "BT-TE1", "BUS-1", "BDM", "BBG", "C-J FOSC", "CADMOS", "Canal Zone", 
    "CANTAT-1", "CANTAT-2", "CANTAT-3", "CANUS-1", "CARAC", "C-Lion1", "Cayman-Jamaica", "CELTIX CONNECT", 
    "CFX", "Challenger", "CIOS", "CIRCE NORTH", "CIRCE SOUTH", "CKC", "CNSFTC", "Colombia-Jamaica-Florida", 
    "COLUMBUS II", "COLUMBUS III", "Commonwealth Pacific Cable", "Concerto 1", "CS2", "CSCN", "Corfù–Bar", 
    "CORSAR", "Cuba-Venezuela", "CUCN", "Danica North", "Danica South", "DANICE", "Darwin", "Denmark-Norway 5", 
    "Denmark-Norway 6", "Denmark-Poland 2", "Denmark-Russia 1", "Denmark-Sweden 15", "Denmark-Sweden 16", 
    "Denmark-Sweden 17", "Denmark-Sweden 18", "DMCS", "Dunant", "DSCS", "EAC-C2C", "Eagle", "EASSy", "EC-1", 
    "ECFS", "ECSC", "EDF1", "EDF2", "EE-S1", "EESF-2", "EESF-3", "EIG", "EMOS 1", "ESAT 1", "ESAT 2", "Estlink-2",
    "Estepona–Tetuán", "EURAFRICA", "FALCON", "FARICE-1", "FARLAND", "FASTER", "FEC", "Fehmarn Belt", "Fibralink", 
    "FLAG FA-1", "FLAG FALCON", "FLAG FEA", "FLAG FNAL", "Florida-Jamaica", "FLAG FP-1", "FOG", "FOG2", 
    "France-Algeria", "France-Greece", "France-Morocco", "France-Tunisia", "G-P", "GCN", "Gemini", "Georgia-Russia", 
    "Germany-Denmark 1", "Germany-Denmark 2", "Germany-Sweden 4", "Germany-Sweden 5", "GLO-1", "GO-1", "Gondwana-1", 
    "Gotland-Ventspils", "GPT", "Grace Hopper", "Greenland Connect", "Gulf Bridge International", "GWEN", "HANNIBAL", 
    "HANTRU-1", "Havfrue", "Hawaiki", "Hawk", "HERMES-1", "HERMES-2", "Hibernia Atlantic", "HJK", "Honotua", 
    "HONTAI-2", "HSCS", "HUGO", "HUGO East", "I-ME-WE", "i2i", "India-UAE", "INDIGO-West", "IOCOM", "Ir-UK Seg A", 
    "Ir-UK Seg B", "Italy-Albania", "Italy-Croatia", "Italy-Greece", "Italy-Libya", "Italy-Malta", "Italy-Monaco", 
    "Italy-Tunisia", "ITUR", "Japan-US", "JAKABARE", "JASURAUS", "Jersey-Guernsey 4", "JKC", "JNAC", "JONAH", 
    "KAFOS", "KATTEGAT-1", "KATTEGAT-2", "KELTRA-2", "Key West-Havana 5", "Key West-Havana 6", "KDN-Reliance", 
    "KJCN", "Kuwait-Iran", "La Perouse-Nelson", "La Perouse-Wakapuaka", "LANIS-1", "LANIS-2", "LANIS-3", "LEV", 
    "Liberty", "LION", "LV-SE 1", "LSP", "MAC", "Main One", "SLT-Dhiraagu Cable System", "Malaysia-Thailand", 
    "MAREA", "MARS", "Marseille-Palermo", "MARTEL", "MAT-2", "MAYA-1", "MCN", "MCS", "MedNautilus", "MED Cable", 
    "METISS Cable", "MENA", "MIC-1", "Micronesia Cable System", "MINERVA", "Monet", "MOYLE NORTH", "MOYLE SOUTH", 
    "MT", "MTC", "NACS", "NAFSIKA", "NCP", "New Jersey-Bermuda", "New Zealand-Fiji", "NorSea Com 1", "NPC", "ODIN", 
    "Okinawa-Luzon-Hong Kong", "Oman Australia Cable", "Otranto-Corfù", "ORVAL", "PEACE Cable", "PAC", 
    "Pacific Caribbean Cable System", "PacRimEast", "PacRimWest", "PAN AM", "Pangea", "PC-1", "PEC", "PLCN", 
    "PPC-1", "Portugal-UK", "Project Express", "PTAT-1", "Qatar-UAE 1", "Qatar-UAE 2", "Quantum Cable", 
    "REMBRANDT-1", "RIOJA-1", "RJCN", "Russian Optical Trans-Arctic Submarine Cable System", "SAC", "SACS", 
    "SAFE", "SAm-1", "SAT-1", "SAT-2", "SAT-3/WASC", "SCAN", "Scandinavian Ring", "SEA-ME-WE 1", "SEA-ME-WE 2", 
    "SEA-ME-WE 3", "SEA-ME-WE 4", "SEA-ME-WE 5", "SEA-ME-WE 6", "Seabras-1", "SEACOM", "SG-SCS", "SHEFA-2", 
    "SIRIUS NORTH", "SIRIUS SOUTH", "SJC", "SMPR-1", "SOLAS", "Southern Caribbean Fiber", "Southern Cross", 
    "T-V-H", "TAGIDE-2", "TAIGU", "TAINO CARIB", "TampNet", "Tangerine", "TASMAN 1", "TASMAN 2", "TAT-1", 
    "TAT-2", "TAT-3", "TAT-4", "TAT-5", "TAT-6", "TAT-7", "TAT-8", "TAT-9"
}

def extract_countries_and_cables(tags):
    """Extract country/region names and cable names from a Tags value.

    The text is run through the spaCy pipeline ``nlp``.  An entity counts as
    a country when spaCy labels it ``GPE`` or when its text matches an entry
    of ``COUNTRIES`` regardless of case; otherwise it counts as a cable when
    its text is an exact member of ``CABLE_NAMES``.  "Baltic" is added
    whenever the raw text contains "Baltic" or "BALTIC".

    Args:
        tags: Text of the "Tags" column, or a missing value (None/NaN).

    Returns:
        A ``(countries, cables)`` tuple of strings.  Each element is a
        ", "-separated, alphabetically sorted list of the names found, or
        "Unknown" when nothing was found or ``tags`` is missing.
    """
    if pd.isna(tags):
        return "Unknown", "Unknown"

    # COUNTRIES holds mixed-case names ("China", "United States", ...), so
    # both sides must be upper-cased; upper-casing only the entity text could
    # never match anything except the all-caps entries.
    known_countries = {name.upper() for name in COUNTRIES}

    doc = nlp(tags)
    found_countries = set()
    found_cables = set()

    for ent in doc.ents:
        if ent.label_ == "GPE" or ent.text.upper() in known_countries:  # Country detection
            found_countries.add(ent.text)
        elif ent.text in CABLE_NAMES:  # Cable system detection
            found_cables.add(ent.text)

    if "Baltic" in tags or "BALTIC" in tags:
        found_countries.add("Baltic")

    # Set iteration order is hash-dependent and can differ between runs;
    # sorting keeps the generated CSV reproducible.
    countries = ", ".join(sorted(found_countries)) if found_countries else "Unknown"
    cables = ", ".join(sorted(found_cables)) if found_cables else "Unknown"
    return countries, cables

# Run the extractor over every Tags value.  Wrapping each returned tuple in a
# Series makes apply() produce a two-column frame, one column per element.
extracted_columns = df["Tags"].apply(
    lambda tag_text: pd.Series(extract_countries_and_cables(tag_text))
)
# NOTE(review): the CSV loaded into df appears to already carry a
# "Cable Name" column from the classification step; this assignment replaces
# it with the tag-derived value — confirm that overwrite is intended.
df[["Countries Involved", "Cable Name"]] = extracted_columns

# Persist the enriched table without the DataFrame index.
df.to_csv("articles_classified_results_with_countries_cables.csv", index=False)

print("Extraction completed! Data saved to articles_classified_results_with_countries_cables.csv")

Extraction completed! Data saved to articles_classified_results_with_countries_cables.csv


In [None]:
import pandas as pd

# Single source of truth for the input path.  The file used to be opened as
# "telegeography_data.csv" and then re-read as "Telegeography_Data.csv",
# which only works on case-insensitive filesystems.
INPUT_CSV = "telegeography_data.csv"

# Load the raw lines so the real header row can be located below any preamble
with open(INPUT_CSV, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Find the line where the header starts (based on column names)
header_line = next(
    (
        i
        for i, line in enumerate(lines)
        if "Cable" in line and "Region" in line and "Fault Location" in line
    ),
    None,
)

# If header line is found, reload the CSV starting at that line
if header_line is not None:
    # skiprows counts physical lines, which is what header_line is.  Passing
    # the index as header= would be off whenever blank lines precede the
    # header, because header= does not count blank lines by default.
    df = pd.read_csv(INPUT_CSV, skiprows=header_line)

    # Add the 'Final Inferred Category' column with all 'NA' values
    df['Final Inferred Category'] = 'NA'

    # Save the updated DataFrame to a new CSV file (the input is left untouched)
    df.to_csv("telegeography_data_updated.csv", index=False)
    print("Column 'Final Inferred Category' added with 'NA' values and cleaned data saved.")
else:
    print("Header row not found.")


Column 'Final Inferred Category' added with 'NA' values and cleaned data saved.
