In [1]:
import pandas as pd
import os
import zipfile
import json
import re
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# base_dir = r"../data/TeleAntiFraud-28k/merged_result"
# rows = []

# for root, dirs, files in os.walk(base_dir):
#     for file in files:
#         if file == "config.json":
#             with open(os.path.join(root, file), "r") as f:
#                 data = json.load(f)
                
#             # Aggregate left/right messages
#             left_texts = [seg["content"] for seg in data["audio_segments"] if seg["role"] == "left"]
#             right_texts = [seg["content"] for seg in data["audio_segments"] if seg["role"] == "right"]

#             # Combine full conversation (in chronological order)
#             full_text = " ".join([seg["content"] for seg in data["audio_segments"]])

#             # Determine label (POS/NEG)
#             label = "POS" if "POS" in root else "NEG"

#             rows.append({
#                 "conversation_id": root.split(os.sep)[-1],
#                 "label": label,
#                 "left_text": " ".join(left_texts),
#                 "right_text": " ".join(right_texts),
#                 "full_text": full_text
#             })

# df = pd.DataFrame(rows)
# df.to_csv("../data/TeleAntiFraud/cleaned_conversations.csv", index=False, encoding="utf-8-sig")
# print(f"Extracted {len(df)} conversations.")


In [10]:
def detect_fraud_type(text):
    if pd.isna(text) or text.strip() == "":
        return None

    text = text.lower()

    # Define keyword patterns using chinese characters
    fraud_patterns = {
        "Investment Scam": r"(投资|理财|收益|基金|股票|回报|分红)",
        "Loan Scam": r"(贷款|借款|信用|征信|额度|分期)",
        "Telecom Scam": r"(电话|公安|银行|账户|冻结|验证码|转账)",
        "Impersonation Scam": r"(警察|客服|工作人员|冒充|身份)",
        "Lottery/Prize Scam": r"(中奖|礼品|领取|抽奖|奖励|活动)"
    }

    for fraud_type, pattern in fraud_patterns.items():
        if re.search(pattern, text):
            return fraud_type

    return "Other Scam"  # default for fraud but unknown type

In [14]:
df = pd.read_csv("../data/TeleAntiFraud/cleaned_conversations.csv")
df["fraud_type"] = df.apply(
    lambda row: detect_fraud_type(row["full_text"]) if row["label"] == "NEG" else None,
    axis=1
)

In [15]:
from deep_translator import GoogleTranslator # could also use other translators like deepl or openai, but google translator is the fastest
from tqdm import tqdm

translator = GoogleTranslator(source='zh-CN', target='en')

if "perp_text_en" not in df.columns:
    df["perp_text_en"] = ""
# df["victim_text_en"] = ""
# df["full_text_en"] = ""

for i in tqdm(range(len(df))):
    try:
        # Only translate fraud cases (NEG) and skip already-translated rows
        if df.loc[i, "label"] == "POS":  # could also use df.loc[i, "fraud_type"] == "Loan Scam" for just translating loan scam cases or comment out for all cases
            if pd.notna(df.loc[i, "left_text"]) and (df.loc[i, "perp_text_en"] == "" or pd.isna(df.loc[i, "perp_text_en"])):
                df.loc[i, "perp_text_en"] = translator.translate(df.loc[i, "left_text"])
            
            # Uncomment below to also translate victim or full text
            # if pd.notna(df.loc[i, "right_text"]) and (df.loc[i, "victim_text_en"] == "" or pd.isna(df.loc[i, "victim_text_en"])):
            #     df.loc[i, "victim_text_en"] = translator.translate(df.loc[i, "right_text"])
            #
            # if pd.notna(df.loc[i, "full_text"]) and (df.loc[i, "full_text_en"] == "" or pd.isna(df.loc[i, "full_text_en"])):
            #     df.loc[i, "full_text_en"] = translator.translate(df.loc[i, "full_text"])
    except Exception as e:
        print(f"Error at row {i}: {e}")
        continue

100%|██████████| 8902/8902 [1:00:17<00:00,  2.46it/s]


In [16]:
df.to_csv("../data/TeleAntiFraud/cleaned_conversations.csv", index=False, encoding="utf-8-sig")