In [None]:
import pandas as pd
import numpy as np
import re
import unidecode
import ast
import nltk
import spacy
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
import os
import time
import unicodedata

In [None]:

# Add the per-user NLTK data directory to NLTK's search path, then look up the
# Punkt tokenizer data (nltk.data.find raises if the resource cannot be located).
nltk_user_dir = os.path.expanduser('~/nltk_data')
nltk.data.path.append(nltk_user_dir)
nltk.data.find('tokenizers/punkt')

# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()

# Shared resources used by the cells below: the spaCy pipeline that supplies
# word vectors and the raw dataset.
nlp = spacy.load("en_core_web_lg")
df = pd.read_csv("ml_insurance_challenge.csv", encoding="utf-8")

In [None]:
# Columns that every downstream step expects to be populated.
required_columns = ["description", "business_tags", "sector", "category", "niche"]

# Patterns used by `_clean_description`, compiled once instead of per row.
_URL_OR_HANDLE_RE = re.compile(r"http\S+|www\S+|@\S+")
_DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s.,!?]")
_WHITESPACE_RE = re.compile(r"\s+")


def _fill_missing(value):
    """Return 'missing_value' for NaN/None or blank cells, else `value` unchanged."""
    if pd.isna(value) or str(value).strip() == '':
        return 'missing_value'
    return value


def _clean_description(raw):
    """Normalise one description to plain ASCII text.

    Steps: transliterate to ASCII, drop URLs and @handles, keep only ASCII
    letters, digits, whitespace and ``.,!?``, then collapse whitespace.

    Whitespace is collapsed *last* so that removing a stand-alone symbol
    (e.g. ``"a _ b"``) cannot leave double, leading or trailing spaces behind.
    """
    text = unidecode.unidecode(str(raw).strip())
    text = _URL_OR_HANDLE_RE.sub("", text)
    text = _DISALLOWED_CHARS_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


# Series.map per column replaces DataFrame.applymap, which is deprecated since
# pandas 2.1; Series.map behaves the same on old and new pandas versions.
for _col in required_columns:
    df[_col] = df[_col].map(_fill_missing)

# Later cells address rows by position (0..len-1), so make the index match.
df.reset_index(drop=True, inplace=True)

processed_texts = [
    _clean_description(raw)
    for raw in tqdm(df["description"], desc="Preprocesare text pentru BERT")
]

In [None]:
# Per-column DBSCAN settings, keyed by column name. Each entry holds the
# `eps` and `min_samples` values handed to DBSCAN for that column.
clustering_params = {
    column_name: {"eps": eps, "min_samples": min_samples}
    for column_name, eps, min_samples in (
        ("description", 0.4, 5),
        ("business_tags", 0.5, 3),
        ("sector", 0.7, 2),
        ("category", 0.7, 2),
        ("niche", 0.7, 2),
    )
}

In [None]:
# Errors ast.literal_eval is documented to raise on malformed input. Catching
# exactly these (instead of a bare `except:`) keeps the best-effort fallback
# without also swallowing KeyboardInterrupt / SystemExit.
_LITERAL_EVAL_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

# The write-back of noise-filtered descriptions was commented out in the
# original cell, so descriptions are left untouched by default. Set to True to
# actually remove noise words from the `description` column.
APPLY_DESCRIPTION_NOISE_FILTER = False

# For every column: collect its vocabulary, embed each word with spaCy, cluster
# the embeddings with DBSCAN (cosine metric) and treat words labelled -1 as
# noise. Values containing noise words are then removed or replaced.
for column in required_columns:
    print(f"=== Procesare coloana: {column} ===")

    # --- 1. Build the vocabulary of whitespace-separated words -------------
    all_words = set()

    for val in tqdm(df[column].fillna("").astype(str), desc=f"Extragere cuvinte ({column})", unit=" randuri"):
        if column == "business_tags":
            # business_tags cells are expected to hold the repr of a list of tags.
            try:
                tags_list = ast.literal_eval(val)
            except _LITERAL_EVAL_ERRORS:
                # Not a Python literal: fall back to splitting the raw text.
                all_words.update(val.split())
            else:
                if isinstance(tags_list, list):
                    for t in tags_list:
                        # str() so a non-string tag (e.g. a number) cannot raise.
                        all_words.update(str(t).split())
        else:
            all_words.update(val.split())

    if not all_words:
        print(f"Coloana goala: {column}, se sare peste.")
        continue

    # Sorted so the word order (and the matching embedding rows) is the same on
    # every run; plain set iteration order varies with string hash randomisation.
    words_list = sorted(all_words)

    # --- 2. Embed every word ------------------------------------------------
    print(f"Generare embeddings pentru {len(words_list)} cuvinte...")
    # nlp.pipe processes the words in batches, yielding one Doc per word in order.
    embeddings_list = np.array([
        doc.vector
        for doc in tqdm(
            nlp.pipe(words_list),
            total=len(words_list),
            desc="Creare embeddings",
            unit=" cuvinte",
        )
    ])

    # --- 3. Cluster and collect the noise words -----------------------------
    eps_val = clustering_params[column]["eps"]
    min_samples_val = clustering_params[column]["min_samples"]

    print("Aplic DBSCAN pentru clustering...")
    db = DBSCAN(eps=eps_val, min_samples=min_samples_val, metric="cosine")
    labels = db.fit_predict(embeddings_list)

    # DBSCAN marks points that belong to no cluster with the label -1.
    noise_indices = [i for i, lab in enumerate(labels) if lab == -1]
    noise_words = {words_list[i] for i in noise_indices}

    print(f"Total cuvinte: {len(words_list)}, Noise detectat: {len(noise_words)}")

    # --- 4. Clean the column ------------------------------------------------
    print(f"Curațare {column}...")

    if column == "description":
        # Only touch descriptions when explicitly enabled (see flag above);
        # otherwise skip the per-row work entirely instead of computing a
        # filtered text that is thrown away.
        if APPLY_DESCRIPTION_NOISE_FILTER:
            for i in tqdm(range(len(df)), desc="Curatare description", unit=" randuri"):
                tokens = str(df.at[i, column]).split()
                df.at[i, column] = " ".join(tok for tok in tokens if tok not in noise_words)

    elif column == "business_tags":
        for i in tqdm(range(len(df)), desc="Curatare business_tags", unit=" randuri"):
            val = str(df.at[i, column])
            try:
                tags_list = ast.literal_eval(val)
            except _LITERAL_EVAL_ERRORS:
                tags_list = None

            if isinstance(tags_list, list):
                # Keep a tag only if none of its words was flagged as noise.
                df.at[i, column] = [
                    tg for tg in tags_list
                    if noise_words.isdisjoint(str(tg).split())
                ]
            else:
                # Unparseable or non-list value: no usable tags for this row.
                df.at[i, column] = []

    else:
        # Single-label columns: a label containing any noise word is replaced
        # by a per-column placeholder.
        for i in tqdm(range(len(df)), desc=f"Curatare {column}", unit=" randuri"):
            tokens = str(df.at[i, column]).split()
            if not noise_words.isdisjoint(tokens):
                df.at[i, column] = f"missing_{column}"

In [None]:
# Write the processed DataFrame to "cox.csv" (path relative to the working
# directory); index=False leaves the row index out of the file.
df.to_csv("cox.csv", index=False)