<a href="https://colab.research.google.com/github/romeroarcasandres/PED_lev-dist/blob/main/PED_lev_dist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PED Scoring Tool

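This notebook provides a tool for comparing two columns in a CSV file. It calculates a similarity score (0–100, where 100 means the texts are identical) for each row and for the whole document from the normalized Levenshtein distance, and generates both a modified CSV with the row scores and an HTML report showing the differences.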

## Setup and Installation

In [1]:
# Install required packages
!pip install python-Levenshtein diff-match-patch

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting diff-match-patch
  Downloading diff_match_patch-20241021-py3-none-any.whl.metadata (5.5 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading diff_match_patch-20241021-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31

## Import Libraries

In [2]:
import os
import pandas as pd
import difflib
import diff_match_patch as dmp_module
import Levenshtein
from google.colab import files

## Helper Functions

In [3]:
def calculate_levenshtein_score(str1, str2):
    """Return a percentage-like similarity score based on the Levenshtein distance.

    The raw edit distance is divided by the length of the longer string and the
    result is subtracted from 1, then scaled by 100. Identical strings therefore
    score 100, and the more edits are needed the closer the score gets to 0.
    Two empty inputs are treated as identical.
    """
    if not (str1 or str2):
        # Both sides are empty: report a perfect match instead of dividing by zero.
        return 100.0

    edit_distance = Levenshtein.distance(str1, str2)
    longest = max(len(str1), len(str2))
    return (1 - (edit_distance / longest)) * 100

def calculate_document_ped(col1_data, col2_data):
    """
    Compute the document-level PED score.

    Each column is flattened into one text: every cell is converted with str()
    (missing values become empty strings) and the cells are joined with single
    spaces. The two texts are then scored with calculate_levenshtein_score.
    """
    def _as_document(cells):
        # pd.notna is False for NaN/None, which then contribute an empty segment.
        segments = []
        for cell in cells:
            segments.append(str(cell) if pd.notna(cell) else "")
        return " ".join(segments)

    return calculate_levenshtein_score(_as_document(col1_data), _as_document(col2_data))

def calculate_weight(str1, str2):
    """Return the weight of a segment pair: the combined length of both strings."""
    combined_length = sum(len(segment) for segment in (str1, str2))
    return combined_length

# Function to generate HTML report of differences with row-level Levenshtein scores
def generate_html_report(dmp, filename, col1_data, col2_data, diffs_list, score_column, document_ped, report_name, header1, header2):
    """Write an HTML comparison report to disk and trigger its download in Colab.

    Args:
        dmp: diff_match_patch instance. Kept for interface compatibility; it is
            not used inside this function.
        filename: Name of the source CSV, shown in the report heading.
        col1_data: Cell texts of the first compared column, one per row.
        col2_data: Cell texts of the second compared column, one per row.
        diffs_list: Per-row diff markup. It is inserted verbatim, so it must
            already be valid HTML (the caller passes diff_prettyHtml output).
        score_column: Per-row numeric scores, rendered with two decimals.
        document_ped: Document-level score, rendered with two decimals.
        report_name: Output file name without the ".html" extension.
        header1: Table heading for the first text column.
        header2: Table heading for the second text column.

    Side effects:
        Writes "<report_name>.html" as UTF-8, prints a confirmation message and
        calls files.download on the written file.
    """
    # Local import so this block does not depend on the notebook's import cell.
    from html import escape

    html_report = [
        f'''
        <html>
        <head>
            <meta charset="utf-8">
            <style>
                body {{
                    font-family: Arial, sans-serif;
                    margin: 20px;
                }}
                .document-ped {{
                    font-size: 18px;
                    font-weight: bold;
                    margin-bottom: 20px;
                    padding: 15px;
                    background-color: #e8f4f8;
                    border-left: 4px solid #2196F3;
                    border-radius: 4px;
                }}
                table {{
                    width: 100%;
                    border-collapse: collapse;
                }}
                th, td {{
                    border: 1px solid #dddddd;
                    text-align: left;
                    padding: 8px;
                }}
                th {{
                    background-color: #f2f2f2;
                }}
                tr:nth-child(even) {{
                    background-color: #f9f9f9;
                }}
                pre {{
                    white-space: pre-wrap; /* Allows wrapping of long lines */
                    word-wrap: break-word; /* Breaks long lines within the 'pre' tag */
                }}
            </style>
        </head>
        <body>
            <h2>Comparison Report for: {escape(str(filename))}</h2>
            <div class="document-ped">
                PED of the document: {document_ped:.2f}
            </div>
            <table>
                <tr>
                    <th>Index</th>
                    <th>{escape(str(header1))}</th>
                    <th>{escape(str(header2))}</th>
                    <th>Differences</th>
                    <th>Score</th>
                </tr>
        '''
    ]

    rows = zip(col1_data, col2_data, diffs_list, score_column)
    for row_number, (data1, data2, diff_html, score) in enumerate(rows, start=1):
        # Raw cell text is escaped so characters such as "<" or "&" are shown
        # literally instead of being parsed as markup. diff_html is left as is
        # because it is already HTML.
        html_report.append(
            f'<tr><td>{row_number}</td>'
            f'<td>{escape(str(data1))}</td>'
            f'<td>{escape(str(data2))}</td>'
            f'<td><pre>{diff_html}</pre></td>'
            f'<td>{score:.2f}</td></tr>'
        )

    html_report.append('''
        </table>
        </body>
        </html>
    ''')

    report_path = f'{report_name}.html'
    # Write UTF-8 explicitly so the file matches the charset declared in <head>,
    # whatever the platform's default encoding is.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(html_report))

    print(f"HTML diff report generated: {report_path}")

    # Download the HTML report in Colab
    files.download(report_path)

## Main Comparison Function (Modified for Colab)

In [4]:
# Helper: load the CSV, trying candidate encodings in a meaningful order
def _read_csv_with_fallback(csv_path):
    """Return the CSV at csv_path as a DataFrame, or None if no encoding could decode it."""
    # A UTF-16 byte-order mark has to be detected up front: those two bytes are
    # also decodable (as garbage) by the single-byte encodings tried below.
    with open(csv_path, 'rb') as raw_file:
        starts_with_utf16_bom = raw_file.read(2) in (b'\xff\xfe', b'\xfe\xff')

    # latin-1 can decode any byte sequence, so it must come last; an encoding
    # listed after it would never be tried.
    encodings = ['utf-8', 'cp1252', 'latin-1']
    if starts_with_utf16_bom:
        encodings.insert(0, 'utf-16')

    for encoding in encodings:
        try:
            df = pd.read_csv(csv_path, encoding=encoding)
        except (UnicodeDecodeError, UnicodeError):
            continue
        print(f"Successfully loaded CSV with {encoding} encoding")
        return df
    return None


# Helper: ask for a 1-based column number until a valid one is entered
def _prompt_column_index(prompt, column_count):
    """Prompt until the user enters a number in 1..column_count; return it zero-based."""
    while True:
        answer = input(prompt).strip()
        try:
            number = int(answer)
        except ValueError:
            print(f"'{answer}' is not a number. Please enter a value between 1 and {column_count}.")
            continue
        if 1 <= number <= column_count:
            return number - 1
        # 0 and negative numbers are rejected explicitly: after subtracting 1
        # they would index from the end and silently pick the wrong column.
        print(f"{number} is out of range. Please enter a value between 1 and {column_count}.")


# Helper: column values as strings, with missing cells as empty strings
def _column_as_text(df, column):
    """Return the column's values as a list of str, mapping missing values to ""."""
    series = df[column]
    # The mask is taken from the original series because astype(str) turns NaN
    # into the literal text "nan", which can no longer be detected as missing.
    return series.astype(str).where(series.notna(), "").tolist()


# Main function for comparison - adapted for Colab environment
def compare_columns_in_csv():
    """Interactively compare two columns of an uploaded CSV file.

    Steps: upload a CSV, choose the candidate and reference columns by number,
    score every row with calculate_levenshtein_score, compute the
    document-level score, download the CSV with an added score column, and
    generate and download the HTML diff report.

    Returns:
        None. The function prints a message and returns early when no file was
        uploaded, or the file could not be decoded or has no columns.
    """
    # Upload a CSV file in Colab
    print("Please upload a CSV file.")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded. Exiting.")
        return

    # Only the first uploaded file is processed; it is read from disk by name.
    csv_file = next(iter(uploaded))

    df = _read_csv_with_fallback(csv_file)
    if df is None:
        print("Error: Could not read the CSV file with any known encoding.")
        return

    column_count = len(df.columns)
    if column_count == 0:
        print("Error: The CSV file has no columns.")
        return

    # Display available columns
    print("Available columns:")
    for position, col in enumerate(df.columns, start=1):
        print(f"{position}: {col}")

    # Prompt user to select columns using input() instead of tkinter
    col1_index = _prompt_column_index(
        "Select the first column index - the Candidate MTed text (e.g., 1, 2, ...): ", column_count)
    col2_index = _prompt_column_index(
        "Select the second column index - the Reference (e.g., 1, 2, ...): ", column_count)

    col1 = df.columns[col1_index]
    col2 = df.columns[col2_index]

    # Get the report name from the user
    report_name = input("Enter the name for the HTML report (without extension): ")

    # Initialize diff_match_patch and lists to store results
    dmp = dmp_module.diff_match_patch()
    diffs_list = []
    score_column = []

    col1_data = _column_as_text(df, col1)
    col2_data = _column_as_text(df, col2)

    for str1, str2 in zip(col1_data, col2_data):
        # Calculate score using the normalized raw Levenshtein distance approach
        score_column.append(calculate_levenshtein_score(str1, str2))

        # Generate HTML diff using diff_match_patch
        diffs = dmp.diff_main(str1, str2)
        dmp.diff_cleanupSemantic(diffs)  # modifies `diffs` in place
        diffs_list.append(dmp.diff_prettyHtml(diffs))

    # Calculate document-level PED score
    document_ped = calculate_document_ped(col1_data, col2_data)
    print(f"\nDocument-level PED Score: {document_ped:.2f}")

    # Add the row-level score to the dataframe as a new column
    df[f"Score ({col1} vs {col2})"] = score_column

    # Save and download the modified CSV
    output_csv = f"modified_{csv_file}"
    df.to_csv(output_csv, index=False)
    print(f"Modified CSV saved: {output_csv}")
    files.download(output_csv)

    # Generate the HTML report with differences and row-level scores
    generate_html_report(dmp, csv_file, col1_data, col2_data, diffs_list, score_column, document_ped, report_name, col1, col2)

## Execute the Script

In [5]:
# Run the comparison function. It is interactive: it asks for a CSV upload, the two
# column numbers to compare and a report name, then downloads the scored CSV and
# the HTML diff report.
compare_columns_in_csv()

Please upload a CSV file.


Saving modified_GoogleTranslate.csv to modified_GoogleTranslate.csv
Successfully loaded CSV with utf-8 encoding
Available columns:
1: EN
2: ES (Reference)
3: Fine-tuned
4: Google Translate
5: ChatGPT
6: Widn
7: Unnamed: 6
8: Unnamed: 7
9: Score (Google Translate vs ES (Reference))
10: Score (ES (Reference) vs Google Translate)
Select the first column index - the Candidate MTed text (e.g., 1, 2, ...): 4
Select the second column index - the Reference (e.g., 1, 2, ...): 2
Enter the name for the HTML report (without extension): report

Document-level PED Score: 73.49
Modified CSV saved: modified_modified_GoogleTranslate.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

HTML diff report generated: report.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Preview DataFrame (Optional)

In [None]:
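# Code to preview the results after processing.
# Note: `df` is local to compare_columns_in_csv(), so it is not available in this cell.
# To inspect the results, uncomment the line below and load the saved CSV instead,
# replacing the file name with the one printed after "Modified CSV saved:".
# pd.read_csv("modified_<your_file>.csv").head()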

## Interactive Report Display (Optional)

In [None]:
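# Display the HTML report in the notebook (requires the HTML file to exist).
# Note: `report_name` is local to compare_columns_in_csv(), so it is not available in
# this cell; replace 'report' below with the report name you entered.
# from IPython.display import HTML, display
# with open('report.html', 'r', encoding='utf-8') as f:
#     display(HTML(f.read()))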