In [1]:
# ============================================================
# 1) Imports + constants
# ============================================================
import re, unicodedata, ast, warnings, nltk, pandas as pd
from collections import defaultdict, Counter
from transformers import pipeline
from tokenizers.decoders import WordPiece

warnings.filterwarnings("ignore", category=UserWarning)

# Detect Google Colab: the import below only succeeds when google.colab is
# installed; COLAB is read by the __main__ section to decide how the input
# file is obtained.
try:                           # Colab / Jupyter?
    from google.colab import files
    COLAB = True
except ImportError:
    COLAB = False

# ─── NLTK stop-words (Hebrew) ───────────────────────────────
# Download the stop-word corpus only when it is not already available locally.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

from nltk.corpus import stopwords
HEB_STOP = set(stopwords.words("hebrew"))
# Extension point for extra stop-words: the literal below is currently empty,
# so this call adds nothing to HEB_STOP.
HEB_STOP.update({

})

# Strings prepended to a name by generate_hebrew_variants; the empty string
# keeps the bare name itself in the result.
PREFIXES = ["", "ל", "ב", "מ", "כש", "ש", "ה", "כ", "ו"]


def generate_hebrew_variants(name: str):
    """Return the set of strings obtained by prepending each PREFIXES entry to *name*."""
    variants = set()
    for prefix in PREFIXES:
        variants.add(prefix + name)
    return variants

def improved_name_pattern(name: str):
    """Build a regex source string that matches *name* as a literal.

    The match is rejected when the character immediately before or after it
    is in the range א-ת, so the name is not found inside a longer Hebrew word.
    """
    hebrew_letter = "[א-ת]"
    literal = re.escape(name)
    not_preceded = "(?<!" + hebrew_letter + ")"
    not_followed = "(?!" + hebrew_letter + ")"
    return not_preceded + literal + not_followed

def overlap_range(s1, e1, s2, e2):
    """Return True when spans (s1, e1) and (s2, e2) overlap.

    Spans that merely touch (one ends exactly where the other starts) are
    reported as not overlapping.
    """
    first_ends_before_second = e1 <= s2
    first_starts_after_second = s1 >= e2
    if first_ends_before_second or first_starts_after_second:
        return False
    return True

def multiword_and_subwords(ent):
    """Expand an entity into itself plus one entity per space-separated word.

    Args:
        ent: dict with "start", "end", "word" and "entity_group" keys.

    Returns:
        A list whose first element is *ent* itself (same object). When
        ent["word"] contains a space, it is followed by one new dict per
        non-empty word, with offsets computed from ent["start"] on the
        assumption that ent["word"] is laid out exactly as in the source text.

    Empty pieces (produced by consecutive, leading or trailing spaces) are
    skipped so that no zero-length entity with word "" is emitted; the offset
    still advances past them.
    """
    expanded = [ent]
    if " " not in ent["word"]:
        return expanded

    cursor = ent["start"]
    for part in ent["word"].split(" "):
        if part:
            expanded.append({"start": cursor,
                             "end": cursor + len(part),
                             "word": part,
                             "entity_group": ent["entity_group"]})
        # +1 accounts for the single space consumed by split(" ").
        cursor += len(part) + 1
    return expanded

def remove_sub_entities(ents):
    """Drop every entity whose span lies inside an already-kept, longer-or-equal span.

    Entities are visited from longest to shortest (ties keep input order), so
    of two identical spans only the first one survives. The survivors are
    returned as a new list ordered by "start".
    """
    def span_length(ent):
        return ent["end"] - ent["start"]

    def is_inside(inner, outer):
        return outer["start"] <= inner["start"] and inner["end"] <= outer["end"]

    survivors = []
    for candidate in sorted(ents, key=span_length, reverse=True):
        for kept in survivors:
            if is_inside(candidate, kept):
                break
        else:
            survivors.append(candidate)

    survivors.sort(key=lambda ent: ent["start"])
    return survivors

def split_if_too_long(rows, max_chars=1000):
    """Group consecutive rows into batches whose joined text fits in *max_chars*.

    The size of a batch is the sum of len(row["content"]) plus one separator
    character between neighbouring rows. A row that alone exceeds the limit
    still gets a batch of its own. Row order is preserved.
    """
    batches = []
    current = []
    used = 0
    for row in rows:
        size = len(row["content"])
        if not current:
            # First row of the very first batch: no separator needed.
            current.append(row)
            used += size
        elif used + size + 1 > max_chars:
            # Adding this row (plus separator) would overflow: close the batch.
            batches.append(current)
            current = [row]
            used = size
        else:
            current.append(row)
            used += size + 1
    if current:
        batches.append(current)
    return batches

# ============================================================
# 2) data_processing helper
# ============================================================
class data_processing:
    """Parse a text chat export into a DataFrame and group messages into conversations.

    NOTE(review): the line format and the Hebrew placeholder strings look like
    a WhatsApp export; confirm against the actual data source.
    """

    # Columns produced by load_chat for every parsed message line.
    _COLUMNS = ["date", "time", "sender", "content"]
    # "<d.m.yyyy>, <h:mm> - <sender>: <message>"; compiled once instead of per line.
    _LINE_PAT = re.compile(
        r"(\d{1,2}\.\d{1,2}\.\d{4}),\s(\d{1,2}:\d{2})\s-\s([^:]+):\s(.+)")

    def __init__(self):
        # Populated by load_chat(); None until a file has been loaded.
        self.df = None

    @staticmethod
    def clean_hebrew_text(txt: str):
        """Normalise *txt* to NFC and strip non-BMP characters and placeholder strings.

        Removes every code point in U+10000-U+10FFFF, the U+200F mark, and the
        three literal Hebrew placeholder strings listed in the pattern below.
        """
        txt = unicodedata.normalize("NFC", txt)
        txt = re.sub(r"[\U00010000-\U0010FFFF]+", "", txt)
        txt = re.sub(r"\u200f|<המדיה לא נכללה>|<ההודעה נערכה>|הודעה זו נמחקה", "", txt)
        return txt

    def load_chat(self, fpath: str):
        """Read *fpath* (UTF-8) and store one row per line matching the message pattern.

        Lines that do not match the pattern are ignored, and messages whose
        cleaned content is empty are dropped. The resulting frame has the
        columns date, time, sender, content, id (1-based) and content_len.
        A file with no matching line yields an empty frame with those columns
        rather than raising.
        """
        print("Loading chat file …")
        rows = []
        with open(fpath, "r", encoding="utf-8") as f:
            for line in f:
                m = self._LINE_PAT.match(line)
                if m:
                    d, t, s, msg = m.groups()
                    rows.append({"date": d, "time": t, "sender": s,
                                 "content": self.clean_hebrew_text(msg)})
        # Explicit columns keep the frame well-formed even when rows is empty.
        df = pd.DataFrame(rows, columns=self._COLUMNS)
        df = df[df["content"].astype(bool)].reset_index(drop=True)
        df["id"] = df.index + 1
        df["content_len"] = df["content"].str.len()
        print(f"Loaded {len(df)} messages")
        self.df = df

    def add_message_id(self, time_gap=24):
        """Add a "message_id" column that increases after every gap longer than *time_gap* hours.

        The frame is sorted in place by timestamp. The sort is stable because
        timestamps only have minute resolution: messages sent within the same
        minute must keep their original file order.
        """
        df = self.df
        df["dt"] = pd.to_datetime(df["date"] + " " + df["time"],
                                  format="%d.%m.%Y %H:%M")
        df.sort_values("dt", inplace=True, kind="mergesort")
        # Seconds since the previous message; NaN for the first row compares False.
        gap_seconds = df["dt"].diff().dt.total_seconds()
        df["message_id"] = (gap_seconds > time_gap * 3600).cumsum() + 1
        df.drop(columns="dt", inplace=True)

    def get_df(self):
        """Return a copy of the loaded frame."""
        return self.df.copy()

# ============================================================
# 3) RollingAnonymizer
# ============================================================
class RollingAnonymizer:
    """First-pass anonymizer: masks NER person names and regex-detected identifiers.

    Messages are processed conversation by conversation ("message_id"), each
    split into sub-batches of at most max_chars characters that are fed to the
    NER pipeline as one joined string. While masking, the class accumulates
    how often each detected string was seen; after all conversations are
    processed, name_dict holds the strings that qualify as names.
    """

    # "משפחת <surname>" with an optional one-letter prefix; group 1 is the surname.
    FAMILY_PAT = re.compile(r"(?:[לכשבה]?משפחת)\s+([א-ת]{2,})")

    def __init__(self, max_chars=1000, prob_threshold=0.30):
        """Load the NER pipeline and reset all accumulators.

        Args:
            max_chars: upper bound on the joined length of one NER sub-batch.
            prob_threshold: minimum (PER detections / total occurrences) ratio
                for a string to enter the final name dictionary.
        """
        self.ner = pipeline("ner", model="dicta-il/dictabert-large-ner",
                            aggregation_strategy="simple")
        self.ner.tokenizer.backend_tokenizer.decoder = WordPiece()
        self.max_chars = max_chars
        self.prob_thr = prob_threshold
        self.name_dict_raw = set()
        self.freq_total = Counter()
        self.freq_per = Counter()
        self.name_dict_final = set()
        self.name_dict = set()
        # Full chat text used as the denominator in _finalize_name_dict;
        # initialised here so that method is safe before anonymize_chat runs.
        self._all_text = ""

    # ---------- helpers ----------
    @staticmethod
    def regex_anonymizer(text):
        """Return one entity dict per regex hit for phone, id, email, credit card and website.

        Each dict has "start", "end", "word" and an upper-cased "entity_group".
        NOTE(review): the phone pattern requires the literal digits 972, so
        numbers written without that prefix are not matched; confirm this is
        intended.
        """
        pats = {"phone": r"972[-\s]?0?(?:[23489]|5[0-9])[-\s]?\d{7}\b",
                "id": r"\b\d{9}\b",
                "email": r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
                "credit_card": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
                "website": r"(?:https?://\S+|www\.\S+)"}
        return [{"start": m.start(), "end": m.end(),
                 "word": m.group(), "entity_group": k.upper()}
                for k, pat in pats.items()
                for m in re.finditer(pat, text)]

    def _acc_name(self, name, is_per):
        """Record one detection of *name*; *is_per* marks it as a person detection."""
        self.freq_total[name] += 1
        if is_per:
            self.freq_per[name] += 1
        self.name_dict_raw.add(name)

    # ---------- core ----------
    def process_subbatch(self, rows):
        """Mask detected entities in one sub-batch of message rows.

        Args:
            rows: list of row dicts, each with at least "id" and "content".

        Returns:
            A DataFrame of the rows with three added columns:
            "censored_content" (entities replaced by '*' of equal length),
            "ner_output" (str() of the row's entity list) and
            "censored_words" (str() of the masked words).
        """
        df = pd.DataFrame(rows).copy()
        df["censored_content"] = df["content"]
        df["ner_output"] = "[]"
        df["censored_words"] = "[]"

        # Join the rows with single spaces and remember each row's
        # (id, start, end) span inside the joined string.
        contents = df["content"].tolist()
        row_ids = df["id"].tolist()
        joined = " ".join(contents)
        offsets, cur = [], 0
        for rid, text in zip(row_ids, contents):
            offsets.append((rid, cur, cur + len(text)))
            cur += len(text) + 1

        # NER: keep only PER entities.
        per = []
        for e in self.ner(joined):
            if e["entity_group"] != "PER":
                continue
            # Skip spans that end in the middle of a Hebrew word.
            if e["end"] < len(joined) and re.match(r"[א-ת]", joined[e["end"]]):
                continue
            # NOTE(review): sub-word offsets below are derived from e["word"];
            # they are exact only if the decoded word equals
            # joined[e["start"]:e["end"]] — confirm for this tokenizer.
            ent = {"start": e["start"], "end": e["end"],
                   "word": e["word"], "entity_group": "PER"}
            for ex in multiword_and_subwords(ent):
                per.append(ex)
                self._acc_name(ex["word"], True)

        # "משפחת X": treat the captured surname as a person entity.
        for fm in self.FAMILY_PAT.finditer(joined):
            surname = fm.group(1)
            per.append({"start": fm.start(1), "end": fm.end(1),
                        "word": surname, "entity_group": "PER"})
            self._acc_name(surname, True)

        reg = self.regex_anonymizer(joined)
        for r in reg:
            self._acc_name(r["word"], r["entity_group"] == "PER")

        # The first pass masks only the detected spans; other occurrences of
        # the same strings are not added here.
        ents = remove_sub_entities(per + reg)

        # Map each entity back to the row that fully contains it; entities
        # that straddle a row boundary match no row and are dropped.
        row_ents = defaultdict(list)
        for ent in ents:
            for rid, row_start, row_end in offsets:
                if row_start <= ent["start"] < ent["end"] <= row_end:
                    row_ents[rid].append(
                        {"start": ent["start"] - row_start,
                         "end": ent["end"] - row_start,
                         "word": ent["word"],
                         "entity_group": ent["entity_group"]})
                    break

        # Censor: the replacement has the same length as the span, so the
        # offsets of later entities stay valid without any shifting.
        # df was built from a list of records, so its index is 0..n-1.
        for i, (rid, txt) in enumerate(zip(row_ids, contents)):
            found = row_ents.get(rid)
            if not found:
                continue
            found.sort(key=lambda x: x["start"])
            for ent in found:
                s, e_ = ent["start"], ent["end"]
                txt = txt[:s] + "*" * (e_ - s) + txt[e_:]
            df.at[i, "censored_content"] = txt
            df.at[i, "ner_output"] = str(found)
            df.at[i, "censored_words"] = str([ent["word"] for ent in found])
        return df

    def _finalize_name_dict(self):
        """Fill name_dict_final / name_dict from the accumulated detections.

        A raw string is kept when it is not a stop-word, occurs at least once
        in the full chat text as a standalone match, and its ratio of PER
        detections to total occurrences reaches prob_thr.
        """
        for n in self.name_dict_raw:
            if n in HEB_STOP:
                continue

            # How many times does the string occur in the whole chat text?
            total_occ = len(re.findall(improved_name_pattern(n), self._all_text))
            if total_occ == 0:
                continue

            # Ratio of PER detections to all occurrences.
            if self.freq_per[n] / total_occ >= self.prob_thr:
                self.name_dict_final.add(n)

        self.name_dict = self.name_dict_final

    def process_message_id(self, df_grp):
        """Process one conversation: split it into sub-batches and concatenate the results."""
        rows = df_grp.to_dict("records")
        # Collect first and concatenate once instead of growing a frame in a loop.
        frames = [self.process_subbatch(sb)
                  for sb in split_if_too_long(rows, self.max_chars)]
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    def anonymize_chat(self, df):
        """Run the first pass over every conversation in *df* and build the name dictionary.

        Args:
            df: frame with at least "content" and "message_id" columns (plus
                whatever process_subbatch needs per row).

        Returns:
            The concatenated per-conversation results, ordered by message_id;
            an empty DataFrame when *df* has no rows.
        """
        # Keep the whole text in one string (denominator for the name ratio).
        self._all_text = " ".join(df["content"].tolist())

        frames = [self.process_message_id(df[df["message_id"] == mid])
                  for mid in sorted(df["message_id"].unique())]
        out = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        self._finalize_name_dict()
        return out


# ============================================================
# 4) second pass censor
# ============================================================
def second_pass_censor(df, name_dict):
    """Mask every remaining standalone occurrence of the names in *name_dict*.

    Args:
        df: frame with "censored_content" and "censored_words" columns, where
            "censored_words" holds the str() of a list of strings.
        name_dict: iterable of name strings to mask.

    Returns:
        A deep copy of *df* (all other columns, e.g. ner_output, are kept) in
        which each match is replaced by '*' of equal length and
        "censored_words" lists the first-pass words followed by the names
        masked here, without duplicates and in a deterministic order.
    """
    out = df.copy(deep=True)

    # Build the patterns once: longest names first so that a long name is
    # masked before any shorter name it contains; ties are broken
    # alphabetically so that runs are reproducible. Empty names are skipped
    # because they would match everywhere without masking anything.
    ordered_names = sorted((n for n in name_dict if n), key=lambda n: (-len(n), n))
    patterns = [(name, re.compile(improved_name_pattern(name)))
                for name in ordered_names]

    def mask(match):
        return "*" * len(match.group())

    for i, row in out.iterrows():
        txt = row["censored_content"]
        # dict.fromkeys de-duplicates while keeping the first-pass order.
        words = dict.fromkeys(ast.literal_eval(row["censored_words"]))
        for name, pat in patterns:
            txt, hits = pat.subn(mask, txt)
            if hits:
                words.setdefault(name)
        out.at[i, "censored_content"] = txt
        out.at[i, "censored_words"] = str(list(words))
    return out

# ============================================================
# 5) main wrapper
# ============================================================
# Script entry point: load a chat export, run both anonymization passes and
# write the result to an Excel file.
if __name__=="__main__":
    # In Colab the input file is uploaded interactively; otherwise a fixed
    # local file name is used.
    if COLAB:
        print("We're in Colab – upload a chat file (e.g. BUILDING.txt)")
        uploaded=files.upload()
        input_file=list(uploaded.keys())[0]
    else:
        input_file="BUILDING.txt"        # change if needed

    # Parse the export and group messages into conversations separated by
    # gaps of more than 24 hours.
    dp=data_processing()
    dp.load_chat(input_file)
    dp.add_message_id(time_gap=24)
    df=dp.get_df()

    # First pass masks detected spans and builds the name dictionary; the
    # second pass masks remaining occurrences of those names. A copy of the
    # dictionary is passed so the anonymizer's own set is not shared.
    anonymizer=RollingAnonymizer(max_chars=1000)
    df1=anonymizer.anonymize_chat(df)
    df2=second_pass_censor(df1,anonymizer.name_dict.copy())

    out_file="rolled_anonymized_BUILDING_noIndex.xlsx"
    df2.to_excel(out_file,index=False)
    print("Saved:",out_file)


We're in Colab – upload a chat file (e.g. BUILDING.txt)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving BUILDING.txt to BUILDING.txt
Loading chat file …
Loaded 1007 messages


config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


Saved: rolled_anonymized_BUILDING_noIndex.xlsx
