In [1]:
import xml.etree.ElementTree as ET
import csv
import re
import os
from google.colab import files

# --- Instructions for the User ---
# 1. Run this cell.
# 2. A "Choose Files" button will appear.
# 3. Select and upload your three XML files: KO00001.xml, KO00002.xml, and KO00003.xml.
# 4. The script will process them and create a 'konkani_words.csv' file.
# 5. This new file should now open correctly with Devanagari script in programs like Excel.

def upload_files():
    """Function to handle file uploads in Colab."""
    print("Please upload your XML files (KO00001.xml, KO00002.xml, KO00003.xml)...")
    uploaded = files.upload()

    # Verify that files were uploaded
    if not uploaded:
        print("\nNo files were uploaded. Please run the cell again.")
        return []

    # Print the names of the uploaded files
    filenames = list(uploaded.keys())
    print(f"\nUploaded {len(filenames)} file(s): {', '.join(filenames)}")
    return filenames

def extract_words_from_xml(filename):
    """
    Parses a single Konkani XML file and extracts all words from <p> tags.

    Args:
        filename (str): The path to the XML file.

    Returns:
        list: A list of all extracted words.
    """
    words = []
    try:
        # Parse the XML file from the uploaded content
        tree = ET.parse(filename)
        root = tree.getroot()

        # The text is inside <Doc> -> <text> -> <body> -> <p>
        # We use a robust findall to get all <p> tags anywhere in the document
        paragraphs = root.findall('.//p')

        for p in paragraphs:
            if p.text:
                # Use regex to find all sequences of Devanagari characters.
                # This is more accurate than split() as it handles punctuation.
                # The Unicode range for Devanagari is U+0900 to U+097F.
                konkani_words = re.findall(r'[\u0900-\u097F]+', p.text)
                words.extend(konkani_words)

    except ET.ParseError as e:
        print(f"Error parsing {filename}: {e}")
    except FileNotFoundError:
        print(f"Error: The file {filename} was not found.")

    return words

def save_words_to_csv(word_list, output_filename):
    """
    Saves a list of words to a single-column CSV file with a UTF-8 BOM
    to ensure compatibility with programs like Microsoft Excel.

    Args:
        word_list (list): The list of words to save.
        output_filename (str): The name of the output CSV file.
    """
    try:
        # Using 'utf-8-sig' adds a Byte Order Mark (BOM) to the beginning of the file.
        # This is a signal for programs like Excel to correctly read the UTF-8 encoding.
        with open(output_filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            # Write the header
            writer.writerow(['word'])
            # Write each word on a new row
            for word in word_list:
                writer.writerow([word])
        print(f"\nSuccessfully saved {len(word_list)} words to {output_filename}")
        print(f"You can find '{output_filename}' in the Colab file browser on the left.")
    except Exception as e:
        print(f"An error occurred while writing to CSV: {e}")

# --- Main Execution ---
if __name__ == "__main__":
    # 1. Upload files
    xml_filenames = upload_files()

    if xml_filenames:
        all_konkani_words = []

        # 2. Process each uploaded file
        for filename in xml_filenames:
            print(f"Processing {filename}...")
            words_from_file = extract_words_from_xml(filename)
            all_konkani_words.extend(words_from_file)
            print(f"-> Extracted {len(words_from_file)} words.")

        # 3. Save the combined list to a CSV file
        if all_konkani_words:
            csv_output_filename = 'konkani_words.csv'
            save_words_to_csv(all_konkani_words, csv_output_filename)
        else:
            print("\nCould not extract any words. Please check the XML file structure.")

Please upload your XML files (KO00001.xml, KO00002.xml, KO00003.xml)...


Saving KO00001.xml to KO00001.xml
Saving KO00002.xml to KO00002.xml
Saving KO00003.xml to KO00003.xml
Saving KO00004.xml to KO00004.xml
Saving KO00005.xml to KO00005.xml

Uploaded 5 file(s): KO00001.xml, KO00002.xml, KO00003.xml, KO00004.xml, KO00005.xml
Processing KO00001.xml...
-> Extracted 2574 words.
Processing KO00002.xml...
-> Extracted 2439 words.
Processing KO00003.xml...
-> Extracted 3029 words.
Processing KO00004.xml...
-> Extracted 2752 words.
Processing KO00005.xml...
-> Extracted 4684 words.

Successfully saved 15478 words to konkani_words.csv
You can find 'konkani_words.csv' in the Colab file browser on the left.
