# Process ΑΣΕΠ data

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the project package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import itables
itables.init_notebook_mode(all_interactive=False)
from ilsp_athenarc_asep_anonymizer.utils import parse_html_to_text, paragraph_to_doc_results
import regex as re
from ilsp_athenarc_asep_anonymizer.pii_anonymizer import PiiAnonymizer
# from pathlib import Path
# import spacy            
# import traceback
# import random
# from tabulate import tabulate
# import sys
# from tqdm import tqdm
# from flair.models import SequenceTagger

import io
import logging

# Log line layout: timestamp, level, message, then the emitting file and line.
# Stored as LOG_FORMAT (not `format`) so the builtin format() stays usable
# in the rest of the notebook.
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s [%(filename)s:%(lineno)d]'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

NL="\n"
TAB="\t"
anonymizer = PiiAnonymizer()

asep_helpdesk_sample_json_url = "http://nlp42.ilsp.gr/asep/helpdesk-01-01-2025%2031-03-2025.json"

# Download the helpdesk export. The records are taken from the "data" key of
# the third top-level element of the JSON document.
with urllib.request.urlopen(asep_helpdesk_sample_json_url) as url:
    asep_helpdesk_sample_json = json.load(url)
    asep_helpdesk_sample_data = asep_helpdesk_sample_json[2]["data"]

df = pd.DataFrame(asep_helpdesk_sample_data)

# Apply the parsing function to the 'response' column
logger.debug("Parsing 'response' column...")
df['response_str'] = df['response'].apply(parse_html_to_text)
logger.debug("'response_str' column created.")

# Apply the parsing function to the 'message' column
logger.debug("Parsing 'message' column...")
df['message_str'] = df['message'].apply(parse_html_to_text)
logger.debug("'message_str' column created.")

# DataFrame.info() writes its report to a stream and returns None, so passing
# its return value to the logger would only log "None". Capture the report in
# a buffer and log the text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
logger.info("DataFrame summary:\n%s", info_buffer.getvalue())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for the figures drawn from here on.
sns.set_style("whitegrid")

# --- Bar chart: number of rows per help category ---
# Rows without a help category are left out of the tally.
help_categories_counts = df['help_categories'].dropna().value_counts()

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    x=help_categories_counts.index,
    y=help_categories_counts.values,
    hue=help_categories_counts.values,  # bar colour follows the bar's count
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Help Categories')
ax.set_xlabel('Help Category')
ax.set_ylabel('Count')
# Slant the category labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# --- Bar chart: number of rows per parent category ---
# Rows without a parent category are left out of the tally.
parent_categories_counts = df['parent_categories'].dropna().value_counts()

fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    x=parent_categories_counts.index,
    y=parent_categories_counts.values,
    hue=parent_categories_counts.values,  # bar colour follows the bar's count
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Parent Categories')
ax.set_xlabel('Parent Category')
ax.set_ylabel('Count')
# Slant the category labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# How many cells to show per column; adjust as needed.
n_samples = 10
print(f"Sampling {n_samples} cells from each column:")

# Columns to inspect ('contest_value' and 'department_name' are currently left out).
columns_to_sample = [
    'help_categories',
    'parent_categories',
    'subject',
    'message_str',
    'response_str',
]

for column_name in columns_to_sample:
    print(f"\n--- Sampling from column: '{column_name}' ---")

    # Only non-null cells are eligible for sampling.
    non_null_cells = df[column_name].dropna()
    available = df[column_name].count()

    # Sampling without replacement cannot draw more cells than exist, so fall
    # back to every non-null cell when the column is too short.
    if available < n_samples:
        print(f"Column '{column_name}' has fewer than {n_samples} non-null values. Sampling all {available} non-null values.")
        sample_size = available
    else:
        sample_size = n_samples

    sampled_cells = non_null_cells.sample(n=sample_size, replace=False)

    # Show the drawn cells, followed by a separator sized to the column name.
    print(sampled_cells)
    print("-" * (len(column_name) + 30))

In [None]:
# df.sample(n=100)

In [None]:
# For every message whose raw HTML contains the probe string, print the HTML,
# its plain-text form, the anonymizer's per-paragraph results and the merged
# document-level results.
for html_content in df["message"]:
    # Missing messages are not strings, and `in` would raise TypeError on
    # them, so skip anything that is not text or does not contain the probe.
    if not isinstance(html_content, str) or "konstantonis" not in html_content:
        continue

    print("")
    print(f'Input html: {html_content}')
    text = parse_html_to_text(html_content)
    print("")
    print(f'Text with new lines: {text}')
    paragraph_results = anonymizer.anonymize(text)

    print("")
    print("### Results per para ###")
    for paragraph_result in paragraph_results:
        print(paragraph_result)

    print("")
    print("### Results for doc  ###")
    doc_results = paragraph_to_doc_results(paragraph_results)
    # ensure_ascii=False keeps non-ASCII text readable in the printed JSON.
    print(json.dumps(doc_results, ensure_ascii=False, indent=2))


