In [1]:
# All imports and setup
import pandas as pd
from sqlalchemy import create_engine, text
from tqdm.notebook import tqdm
tqdm.pandas()


# Add project root to path 
import sys
from pathlib import Path
sys.path.append(str(Path("../").resolve()))



In [2]:
import src.preprocessing_utils as prep
from src.db_utils import get_db_credentials

In [11]:
# Connect to PostgreSQL via SQLAlchemy
creds = get_db_credentials()
conn_str = f"postgresql+psycopg2://{creds['user']}:{creds['password']}@{creds['host']}:{creds['port']}/{creds['db_name']}"
engine = create_engine(conn_str)

# Load tickets (skip demo for pipeline demo purposes)
query = "SELECT ticket_id, subject, body FROM tickets WHERE demo_flag = FALSE;"
tickets_df = pd.read_sql(query, engine)

tickets_df.head()

Unnamed: 0,ticket_id,subject,body
0,TKT-500000,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge..."
1,TKT-500001,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g..."
2,TKT-500002,Can’t pay online — keeps declining 😕,"Hi team, Hi, I'm Jamie Salonen. Not sure if th..."
3,TKT-500003,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble..."
4,TKT-500004,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on..."


In [None]:
#  Preprocessing serial : Detect language and clean text
def preprocess_ticket_row(row):
    """
    Apply language detection and cleaning for a single row.
    Translation, PII masking, keywords handled in batch later.
    """
    subject = row.get("subject", "")
    body = row.get("body", "")

    lang = prep.detect_language(body)
    subject_clean = prep.clean_text(subject)
    body_clean = prep.clean_text(body)

    return {
        "ticket_id": row["ticket_id"],
        "lang": lang,
        "subject_clean": subject_clean,
        "body_clean": body_clean
    }




In [None]:
# Apply row-wise detect language and cleaning
tickets_cleaned = tickets_df.progress_apply(preprocess_ticket_row, axis=1)
tickets_cleaned = pd.DataFrame(list(tickets_cleaned))
tickets_cleaned.head()

In [None]:
# Initialize columns for translated text
tickets_cleaned["subject_translated"] = ""
tickets_cleaned["body_translated"] = ""

# Batch translate by language 

# Only languages with translation models will be processed here
for lang in prep.TRANSLATION_MODELS.keys():
    # Get indices of rows with this language
    idx = tickets_cleaned[tickets_cleaned["lang"] == lang].index
    if len(idx) == 0:
        continue  # skip if no rows in this language

    # Prepare text batches
    subjects = tickets_cleaned.loc[idx, "subject_clean"].tolist()
    bodies = tickets_cleaned.loc[idx, "body_clean"].tolist()

    # Translate in batch using preprocessing.py functions
    tickets_cleaned.loc[idx, "subject_translated"] = prep.translate_texts_batch(subjects, lang)
    tickets_cleaned.loc[idx, "body_translated"] = prep.translate_texts_batch(bodies, lang)

    # Log batch info
    print(f"[INFO] Translated batch of {len(idx)} tickets for language '{lang}'")

# Fill English/unknown rows with original cleaned content 
mask = tickets_cleaned["subject_translated"] == ""
tickets_cleaned.loc[mask, "subject_translated"] = tickets_cleaned.loc[mask, "subject_clean"]
tickets_cleaned.loc[mask, "body_translated"] = tickets_cleaned.loc[mask, "body_clean"]

# Verify
tickets_cleaned.head()

In [None]:
# PII masking
tickets_cleaned["subject_masked"] = tickets_cleaned["subject_translated"].progress_apply(prep.mask_pii_en)
tickets_cleaned["body_masked"] = tickets_cleaned["body_translated"].progress_apply(prep.mask_pii_en)

# Verify
tickets_cleaned.head()


In [None]:
tickets_cleaned.head(20)

In [None]:
tickets_cleaned.to_csv("tickets_cleaned.csv", index=False)

In [3]:

# Load from CSV into DataFrame
tickets_cleaned = pd.read_csv("tickets_cleaned.csv")

# Preview
tickets_cleaned.head()


Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ..."


In [4]:
# Combine text columns into one for keyword extraction and later use
tickets_cleaned["combined_text"] = tickets_cleaned["subject_masked"] + "|| " + tickets_cleaned["body_masked"]

# Verify
tickets_cleaned.head()


Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| Hi team, ..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| Hello, Hi, I'm..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| Hey,..."


In [5]:
# Extract keywords from the combined text
keywords_list = prep.extract_keywords_batch(tickets_cleaned["combined_text"].tolist(), top_n=10)

# Store keywords as a comma-separated string
tickets_cleaned["keywords"] = [", ".join(kws) for kws in keywords_list]

# Verify
tickets_cleaned.head()


Extracting keywords:   2%|▎              | 1003/49800 [00:33<24:26, 33.28text/s]

[INFO] Processed 1000/49800 texts


Extracting keywords:   4%|▌              | 2005/49800 [01:02<23:25, 34.01text/s]

[INFO] Processed 2000/49800 texts


Extracting keywords:   6%|▉              | 3003/49800 [01:32<24:05, 32.37text/s]

[INFO] Processed 3000/49800 texts


Extracting keywords:   8%|█▏             | 4004/49800 [02:02<22:16, 34.27text/s]

[INFO] Processed 4000/49800 texts


Extracting keywords:  10%|█▌             | 5003/49800 [02:32<22:09, 33.69text/s]

[INFO] Processed 5000/49800 texts


Extracting keywords:  12%|█▊             | 6005/49800 [03:02<21:25, 34.08text/s]

[INFO] Processed 6000/49800 texts


Extracting keywords:  14%|██             | 7005/49800 [03:32<23:34, 30.26text/s]

[INFO] Processed 7000/49800 texts


Extracting keywords:  16%|██▍            | 8004/49800 [04:01<20:37, 33.79text/s]

[INFO] Processed 8000/49800 texts


Extracting keywords:  18%|██▋            | 9006/49800 [04:31<19:39, 34.58text/s]

[INFO] Processed 9000/49800 texts


Extracting keywords:  20%|██▊           | 10004/49800 [05:01<19:54, 33.31text/s]

[INFO] Processed 10000/49800 texts


Extracting keywords:  22%|███           | 11003/49800 [05:31<20:08, 32.11text/s]

[INFO] Processed 11000/49800 texts


Extracting keywords:  24%|███▍          | 12006/49800 [06:01<19:05, 33.01text/s]

[INFO] Processed 12000/49800 texts


Extracting keywords:  26%|███▋          | 13003/49800 [06:31<18:21, 33.42text/s]

[INFO] Processed 13000/49800 texts


Extracting keywords:  28%|███▉          | 14005/49800 [07:00<18:32, 32.18text/s]

[INFO] Processed 14000/49800 texts


Extracting keywords:  30%|████▏         | 15007/49800 [07:30<17:03, 34.01text/s]

[INFO] Processed 15000/49800 texts


Extracting keywords:  32%|████▍         | 16005/49800 [08:01<18:00, 31.29text/s]

[INFO] Processed 16000/49800 texts


Extracting keywords:  34%|████▊         | 17005/49800 [08:31<17:12, 31.76text/s]

[INFO] Processed 17000/49800 texts


Extracting keywords:  36%|█████         | 18003/49800 [09:01<16:25, 32.27text/s]

[INFO] Processed 18000/49800 texts


Extracting keywords:  38%|█████▎        | 19005/49800 [09:31<15:24, 33.30text/s]

[INFO] Processed 19000/49800 texts


Extracting keywords:  40%|█████▌        | 20004/49800 [10:00<14:46, 33.60text/s]

[INFO] Processed 20000/49800 texts


Extracting keywords:  42%|█████▉        | 21006/49800 [10:31<14:26, 33.22text/s]

[INFO] Processed 21000/49800 texts


Extracting keywords:  44%|██████▏       | 22004/49800 [11:00<13:29, 34.34text/s]

[INFO] Processed 22000/49800 texts


Extracting keywords:  46%|██████▍       | 23003/49800 [11:30<12:41, 35.18text/s]

[INFO] Processed 23000/49800 texts


Extracting keywords:  48%|██████▋       | 24004/49800 [12:00<12:31, 34.31text/s]

[INFO] Processed 24000/49800 texts


Extracting keywords:  50%|███████       | 25005/49800 [12:30<11:46, 35.10text/s]

[INFO] Processed 25000/49800 texts


Extracting keywords:  52%|███████▎      | 26006/49800 [12:59<11:16, 35.18text/s]

[INFO] Processed 26000/49800 texts


Extracting keywords:  54%|███████▌      | 27006/49800 [13:29<11:16, 33.69text/s]

[INFO] Processed 27000/49800 texts


Extracting keywords:  56%|███████▊      | 28003/49800 [13:59<10:49, 33.56text/s]

[INFO] Processed 28000/49800 texts


Extracting keywords:  58%|████████▏     | 29007/49800 [14:29<10:13, 33.87text/s]

[INFO] Processed 29000/49800 texts


Extracting keywords:  60%|████████▍     | 30002/49800 [14:59<10:35, 31.16text/s]

[INFO] Processed 30000/49800 texts


Extracting keywords:  62%|████████▋     | 31003/49800 [15:28<09:12, 34.05text/s]

[INFO] Processed 31000/49800 texts


Extracting keywords:  64%|████████▉     | 32004/49800 [15:59<09:26, 31.40text/s]

[INFO] Processed 32000/49800 texts


Extracting keywords:  66%|█████████▎    | 33005/49800 [16:28<07:47, 35.89text/s]

[INFO] Processed 33000/49800 texts


Extracting keywords:  68%|█████████▌    | 34006/49800 [16:58<07:41, 34.25text/s]

[INFO] Processed 34000/49800 texts


Extracting keywords:  70%|█████████▊    | 35003/49800 [17:27<07:39, 32.20text/s]

[INFO] Processed 35000/49800 texts


Extracting keywords:  72%|██████████    | 36005/49800 [17:57<07:04, 32.51text/s]

[INFO] Processed 36000/49800 texts


Extracting keywords:  74%|██████████▍   | 37004/49800 [18:27<06:40, 31.99text/s]

[INFO] Processed 37000/49800 texts


Extracting keywords:  76%|██████████▋   | 38006/49800 [18:57<04:58, 39.49text/s]

[INFO] Processed 38000/49800 texts


Extracting keywords:  78%|██████████▉   | 39005/49800 [19:27<05:02, 35.64text/s]

[INFO] Processed 39000/49800 texts


Extracting keywords:  80%|███████████▏  | 40007/49800 [19:57<04:26, 36.76text/s]

[INFO] Processed 40000/49800 texts


Extracting keywords:  82%|███████████▌  | 41004/49800 [20:26<04:02, 36.30text/s]

[INFO] Processed 41000/49800 texts


Extracting keywords:  84%|███████████▊  | 42006/49800 [20:56<03:46, 34.47text/s]

[INFO] Processed 42000/49800 texts


Extracting keywords:  86%|████████████  | 43002/49800 [21:26<03:28, 32.62text/s]

[INFO] Processed 43000/49800 texts


Extracting keywords:  88%|████████████▎ | 44003/49800 [21:55<02:55, 33.05text/s]

[INFO] Processed 44000/49800 texts


Extracting keywords:  90%|████████████▋ | 45003/49800 [22:25<02:19, 34.51text/s]

[INFO] Processed 45000/49800 texts


Extracting keywords:  92%|████████████▉ | 46002/49800 [22:55<01:51, 34.05text/s]

[INFO] Processed 46000/49800 texts


Extracting keywords:  94%|█████████████▏| 47005/49800 [23:25<01:28, 31.69text/s]

[INFO] Processed 47000/49800 texts


Extracting keywords:  96%|█████████████▍| 48003/49800 [23:54<00:55, 32.56text/s]

[INFO] Processed 48000/49800 texts


Extracting keywords:  98%|█████████████▊| 49005/49800 [24:24<00:24, 32.64text/s]

[INFO] Processed 49000/49800 texts


Extracting keywords: 100%|██████████████| 49800/49800 [24:48<00:00, 33.46text/s]


[INFO] Keyword extraction completed for 49800 texts


Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| Hi team, ...","pay online keeps declining, online keeps decli..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| Hello, Hi, I'm...","prio card missing block, prio card missing, hi..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| Hey,...","tokenization failed wallet, tokenization faile..."


In [None]:
# # Hello, Hi, I'm Sam Heikkinen. My card keeps getting errors when I try to pay. Balance should be fine, so I'm confused. I tried three times today, same result. Balance shkouldj be fine, so I'm confused.
# # Hallå, Hi, I'm Henrik Svensson. Jag har problem med card lost/stolen. Som jag nämnde tidigare, detta blockerar oss. Phone: +46 101 389 4079 IBAN: FI9106024211679678 Card: 4000 1234 5678 9010 Säg till om ni behöver skärmdumpar. Med vänlig hälsning.
# # Hei, Hi, I'm Sanna Mäkinen. Ongelma liittyy aiheeseen: API Rate Limit/Auth. 💳 IBAN: FI8663695284994003 As mentioned earlier, this is blocking our checkout. Ilmoittakaa jos tarvitsette kuvakaappauksia. Saldon pitäisi riittää, joten olen hämmentynyt. Ilmoittakaa jos tarvitsette kuvakaappauksia. Saldon pitäisi riittää, joten olen hämmentynyt.
# #Hej, Hi, I'm Matilda Virtanen. Jag har problem med dispute/chargeback. Testade också på en annan enhet. 🙏 Phone: +358 167 297 3630 Email: matilda.virtanen@example.com Card: 4970 **** **** 8430 Säg till om ni behöver skärmdumpar. Försökte tre gånger idag, samma resultat. Försökte tre gånger idag, samma resultat. Täckningen borde räcka, så jag är förvirrad. Tack!

# masked = preprocess_ticket("Hej, Hi, I'm Matilda Virtanen. Jag har problem med dispute/chargeback. Testade också på en annan enhet. 🙏 Phone: +358 167 297 3630 Email: matilda.virtanen@example.com Card: 4970 **** **** 8430 Säg till om ni behöver skärmdumpar. Försökte tre gånger idag, samma resultat. Försökte tre gånger idag, samma resultat. Täckningen borde räcka, så jag är förvirrad. Tack!")

In [6]:
tickets_cleaned.head()

Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| Hi team, ...","pay online keeps declining, online keeps decli..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| Hello, Hi, I'm...","prio card missing block, prio card missing, hi..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| Hey,...","tokenization failed wallet, tokenization faile..."


In [19]:
# Combine text columns into one for keyword extraction and later use
tickets_cleaned["combined_text"] = tickets_cleaned["subject_translated"] + "|| " + tickets_cleaned["body_masked"]

# Verify
tickets_cleaned.head(10)

Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| Hi team, ...","pay online keeps declining, online keeps decli..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| Hello, Hi, I'm...","prio card missing block, prio card missing, hi..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| Hey,...","tokenization failed wallet, tokenization faile..."
5,TKT-500005,en,"FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: [PERSON], prod failing","Hello, Hi, I'm [PERSON]. I'm having trouble wi...","FYI: Sandbox ok, prod failing|| Hello, Hi, I'm...","trouble integration sdk launch, trouble integr..."
6,TKT-500006,fi,PRIO: Miten kasvattaa API-kiintiötä?,"Hei, Hi, I'm Sanna Mäkinen. Ongelma liittyy ai...",PRIO: How to increase the API quota?,"Hi, Hi, I'm Sanna Mäkinen. The problem relates...",PRIO: How to increase the API quota?,"Hi, Hi, I'm [PERSON]. The problem relates to: ...","PRIO: How to increase the API quota?|| Hi, Hi,...","prio increase api quota, increase api quota hi..."
7,TKT-500007,en,Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm [PERSON]. My card keeps getting...","Declined transactions without reason?|| Hello,...","card keeps getting declined, getting declined ..."
8,TKT-500008,sv,Tillfällig höjning av e‑handelsgräns,"Hej där, Hi, I'm Sofia Ahonen. Mitt kort blir ...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm Sofia Ahonen. My card gets w...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm [PERSON]. My card gets wrong...",Temporary increase of e-commerce limit|| Hi th...,"temporary increase commerce limit, increase co..."
9,TKT-500009,en,Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm [PERSON]. Not sure if this is on my si...","Question about last month’s fees|| Hi, I'm [PE...","card 4970 7040 tested, tested device card 4970..."


In [21]:

import re

# Define greetings to remove (case-insensitive)
greetings_pattern = r'\b(hi|hello|hey)\b'

# Remove greetings from combined_text
tickets_cleaned["combined_text"] = tickets_cleaned["combined_text"].apply(
    lambda x: re.sub(greetings_pattern, '', x, flags=re.IGNORECASE)
)

# Remove extra whitespace after removal
tickets_cleaned["combined_text"] = tickets_cleaned["combined_text"].str.replace(r'\s+', ' ', regex=True).str.strip()

# Verify
tickets_cleaned.head(10)


Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| team, , I...","pay online keeps declining, online keeps decli..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| , , I'm [PERSO...","prio card missing block, prio card missing, hi..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| , , ...","tokenization failed wallet, tokenization faile..."
5,TKT-500005,en,"FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: [PERSON], prod failing","Hello, Hi, I'm [PERSON]. I'm having trouble wi...","FYI: Sandbox ok, prod failing|| , , I'm [PERSO...","trouble integration sdk launch, trouble integr..."
6,TKT-500006,fi,PRIO: Miten kasvattaa API-kiintiötä?,"Hei, Hi, I'm Sanna Mäkinen. Ongelma liittyy ai...",PRIO: How to increase the API quota?,"Hi, Hi, I'm Sanna Mäkinen. The problem relates...",PRIO: How to increase the API quota?,"Hi, Hi, I'm [PERSON]. The problem relates to: ...","PRIO: How to increase the API quota?|| , , I'm...","prio increase api quota, increase api quota hi..."
7,TKT-500007,en,Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm [PERSON]. My card keeps getting...","Declined transactions without reason?|| , , I'...","card keeps getting declined, getting declined ..."
8,TKT-500008,sv,Tillfällig höjning av e‑handelsgräns,"Hej där, Hi, I'm Sofia Ahonen. Mitt kort blir ...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm Sofia Ahonen. My card gets w...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm [PERSON]. My card gets wrong...",Temporary increase of e-commerce limit|| there...,"temporary increase commerce limit, increase co..."
9,TKT-500009,en,Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm [PERSON]. Not sure if this is on my si...","Question about last month’s fees|| , I'm [PERS...","card 4970 7040 tested, tested device card 4970..."


In [22]:
# Extract keywords from the combined text
keywords_list = prep.extract_keywords_batch(tickets_cleaned["combined_text"].tolist(), top_n=10)

# Store keywords as a comma-separated string
tickets_cleaned["keywords"] = [", ".join(kws) for kws in keywords_list]

# Verify
tickets_cleaned.head()

Extracting keywords:   2%|▎              | 1001/49800 [00:47<38:30, 21.12text/s]

[INFO] Processed 1000/49800 texts


Extracting keywords:   4%|▌              | 2003/49800 [01:50<57:45, 13.79text/s]

[INFO] Processed 2000/49800 texts


Extracting keywords:   6%|▉              | 3001/49800 [03:01<58:44, 13.28text/s]

[INFO] Processed 3000/49800 texts


Extracting keywords:   8%|█▏             | 4003/49800 [04:13<50:29, 15.12text/s]

[INFO] Processed 4000/49800 texts


Extracting keywords:  10%|█▌             | 5001/49800 [05:24<52:25, 14.24text/s]

[INFO] Processed 5000/49800 texts


Extracting keywords:  12%|█▊             | 6001/49800 [06:36<54:40, 13.35text/s]

[INFO] Processed 6000/49800 texts


Extracting keywords:  14%|██             | 7001/49800 [07:48<49:31, 14.40text/s]

[INFO] Processed 7000/49800 texts


Extracting keywords:  16%|██▍            | 8002/49800 [08:58<45:09, 15.43text/s]

[INFO] Processed 8000/49800 texts


Extracting keywords:  18%|██▋            | 9002/49800 [10:08<44:03, 15.43text/s]

[INFO] Processed 9000/49800 texts


Extracting keywords:  20%|██▊           | 10002/49800 [11:18<43:32, 15.23text/s]

[INFO] Processed 10000/49800 texts


Extracting keywords:  22%|███           | 11001/49800 [12:29<45:20, 14.26text/s]

[INFO] Processed 11000/49800 texts


Extracting keywords:  24%|███▎          | 12002/49800 [13:40<33:07, 19.01text/s]

[INFO] Processed 12000/49800 texts


Extracting keywords:  26%|███▋          | 13004/49800 [14:50<26:02, 23.54text/s]

[INFO] Processed 13000/49800 texts


Extracting keywords:  28%|███▉          | 14004/49800 [15:38<30:39, 19.46text/s]

[INFO] Processed 14000/49800 texts


Extracting keywords:  30%|████▏         | 15001/49800 [16:26<37:31, 15.45text/s]

[INFO] Processed 15000/49800 texts


Extracting keywords:  32%|████▍         | 16001/49800 [17:37<43:16, 13.02text/s]

[INFO] Processed 16000/49800 texts


Extracting keywords:  34%|████▊         | 17001/49800 [18:48<44:59, 12.15text/s]

[INFO] Processed 17000/49800 texts


Extracting keywords:  36%|█████         | 18003/49800 [19:38<29:22, 18.04text/s]

[INFO] Processed 18000/49800 texts


Extracting keywords:  38%|█████▎        | 19004/49800 [20:26<24:14, 21.17text/s]

[INFO] Processed 19000/49800 texts


Extracting keywords:  40%|█████▌        | 20002/49800 [21:22<38:53, 12.77text/s]

[INFO] Processed 20000/49800 texts


Extracting keywords:  42%|█████▉        | 21002/49800 [22:33<33:02, 14.53text/s]

[INFO] Processed 21000/49800 texts


Extracting keywords:  44%|██████▏       | 22002/49800 [23:44<32:56, 14.06text/s]

[INFO] Processed 22000/49800 texts


Extracting keywords:  46%|██████▍       | 23001/49800 [24:54<29:26, 15.17text/s]

[INFO] Processed 23000/49800 texts


Extracting keywords:  48%|██████▋       | 24001/49800 [26:06<28:31, 15.07text/s]

[INFO] Processed 24000/49800 texts


Extracting keywords:  50%|███████       | 25003/49800 [27:20<27:48, 14.86text/s]

[INFO] Processed 25000/49800 texts


Extracting keywords:  52%|███████▎      | 26002/49800 [28:33<30:32, 12.99text/s]

[INFO] Processed 26000/49800 texts


Extracting keywords:  54%|███████▌      | 27002/49800 [29:47<28:28, 13.35text/s]

[INFO] Processed 27000/49800 texts


Extracting keywords:  56%|███████▊      | 28002/49800 [31:00<24:06, 15.07text/s]

[INFO] Processed 28000/49800 texts


Extracting keywords:  58%|████████▏     | 29002/49800 [32:13<22:51, 15.16text/s]

[INFO] Processed 29000/49800 texts


Extracting keywords:  60%|████████▍     | 30000/49800 [33:26<24:31, 13.46text/s]

[INFO] Processed 30000/49800 texts


Extracting keywords:  62%|████████▋     | 31002/49800 [34:40<22:51, 13.71text/s]

[INFO] Processed 31000/49800 texts


Extracting keywords:  64%|████████▉     | 32003/49800 [35:54<22:05, 13.43text/s]

[INFO] Processed 32000/49800 texts


Extracting keywords:  66%|█████████▎    | 33002/49800 [37:06<22:25, 12.49text/s]

[INFO] Processed 33000/49800 texts


Extracting keywords:  68%|█████████▌    | 34002/49800 [38:17<18:17, 14.39text/s]

[INFO] Processed 34000/49800 texts


Extracting keywords:  70%|█████████▊    | 35001/49800 [39:27<17:24, 14.16text/s]

[INFO] Processed 35000/49800 texts


Extracting keywords:  72%|██████████    | 36001/49800 [40:37<16:53, 13.61text/s]

[INFO] Processed 36000/49800 texts


Extracting keywords:  74%|██████████▍   | 37001/49800 [41:49<16:39, 12.80text/s]

[INFO] Processed 37000/49800 texts


Extracting keywords:  76%|██████████▋   | 38003/49800 [42:59<12:04, 16.28text/s]

[INFO] Processed 38000/49800 texts


Extracting keywords:  78%|██████████▉   | 39003/49800 [44:10<12:01, 14.97text/s]

[INFO] Processed 39000/49800 texts


Extracting keywords:  80%|███████████▏  | 40002/49800 [45:21<11:30, 14.18text/s]

[INFO] Processed 40000/49800 texts


Extracting keywords:  82%|███████████▌  | 41001/49800 [46:32<11:58, 12.25text/s]

[INFO] Processed 41000/49800 texts


Extracting keywords:  84%|███████████▊  | 42001/49800 [47:43<10:13, 12.70text/s]

[INFO] Processed 42000/49800 texts


Extracting keywords:  86%|████████████  | 43001/49800 [48:56<08:02, 14.08text/s]

[INFO] Processed 43000/49800 texts


Extracting keywords:  88%|████████████▎ | 44001/49800 [50:06<06:16, 15.39text/s]

[INFO] Processed 44000/49800 texts


Extracting keywords:  90%|████████████▋ | 45002/49800 [51:16<05:33, 14.37text/s]

[INFO] Processed 45000/49800 texts


Extracting keywords:  92%|████████████▉ | 46002/49800 [52:27<04:46, 13.27text/s]

[INFO] Processed 46000/49800 texts


Extracting keywords:  94%|█████████████▏| 47001/49800 [53:39<03:22, 13.83text/s]

[INFO] Processed 47000/49800 texts


Extracting keywords:  96%|█████████████▍| 48001/49800 [54:52<02:25, 12.34text/s]

[INFO] Processed 48000/49800 texts


Extracting keywords:  98%|█████████████▊| 49002/49800 [56:06<00:56, 14.04text/s]

[INFO] Processed 49000/49800 texts


Extracting keywords: 100%|██████████████| 49800/49800 [57:05<00:00, 14.54text/s]

[INFO] Keyword extraction completed for 49800 texts





Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| team, , I...","pay online keeps declining, pay online keeps, ..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| , , I'm [PERSO...","prio card missing block, prio card missing, bl..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| , , ...","tokenization failed wallet setup, tokenization..."


In [7]:

max_words = tickets_cleaned["combined_text"].apply(lambda x: len(str(x).split())).max()
print("Maximum word count:", max_words)


Maximum word count: 554


In [23]:
tickets_cleaned.to_csv("tickets_cleaned.csv", index=False)

In [24]:
# Load from CSV into DataFrame
tickets_cleaned = pd.read_csv("tickets_cleaned.csv")

# Preview
tickets_cleaned.head(10)

Unnamed: 0,ticket_id,lang,subject_clean,body_clean,subject_translated,body_translated,subject_masked,body_masked,combined_text,keywords
0,TKT-500000,en,URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm Sam Heikkinen. My card keeps ge...",URGENT: Hitting rate limits on /transactions (...,"Hello, Hi, I'm [PERSON]. My card keeps getting...",URGENT: Hitting rate limits on /transactions (...,"rate limits transactions error, limits transac..."
1,TKT-500001,en,Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm Alex Mäkinen. My card keeps g...",Question about KYC — documents keep getting re...,"Hi team, Hi, I'm [PERSON]. My card keeps getti...",Question about KYC — documents keep getting re...,"kyc documents getting rejected, kyc documents ..."
2,TKT-500002,en,Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm Jamie Salonen. Not sure if th...",Can’t pay online — keeps declining,"Hi team, Hi, I'm [PERSON]. Not sure if this is...","Can’t pay online — keeps declining|| team, , I...","pay online keeps declining, pay online keeps, ..."
3,TKT-500003,sv,"PRIO: Kort saknas, spärra nu","Hallå, Hi, I'm Henrik Svensson. Jag har proble...","PRIO: Card missing, block now","Hello, Hi, I'm Henrik Svensson. I have problem...","PRIO: Card missing, block now","Hello, Hi, I'm [PERSON]. I have problems with ...","PRIO: Card missing, block now|| , , I'm [PERSO...","prio card missing block, prio card missing, bl..."
4,TKT-500004,en,Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm Sa Ahonen. Not sure if this is on...",Tokenization failed during wallet setup,"Hey, Hi, I'm [PERSON]. Not sure if this is on ...","Tokenization failed during wallet setup|| , , ...","tokenization failed wallet setup, tokenization..."
5,TKT-500005,en,"FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: Sandbox ok, prod failing","Hello, Hi, I'm Jordan Mäkinen. I'm having trou...","FYI: [PERSON], prod failing","Hello, Hi, I'm [PERSON]. I'm having trouble wi...","FYI: Sandbox ok, prod failing|| , , I'm [PERSO...","trouble integration sdk launch, trouble integr..."
6,TKT-500006,fi,PRIO: Miten kasvattaa API-kiintiötä?,"Hei, Hi, I'm Sanna Mäkinen. Ongelma liittyy ai...",PRIO: How to increase the API quota?,"Hi, Hi, I'm Sanna Mäkinen. The problem relates...",PRIO: How to increase the API quota?,"Hi, Hi, I'm [PERSON]. The problem relates to: ...","PRIO: How to increase the API quota?|| , , I'm...","prio increase api quota, increase api quota pr..."
7,TKT-500007,en,Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm Drew Laine. My card keeps getti...",Declined transactions without reason?,"Hello, Hi, I'm [PERSON]. My card keeps getting...","Declined transactions without reason?|| , , I'...","card keeps getting declined, declined transact..."
8,TKT-500008,sv,Tillfällig höjning av e‑handelsgräns,"Hej där, Hi, I'm Sofia Ahonen. Mitt kort blir ...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm Sofia Ahonen. My card gets w...",Temporary increase of e-commerce limit,"Hi there, Hi, I'm [PERSON]. My card gets wrong...",Temporary increase of e-commerce limit|| there...,"commerce limit card gets, increase commerce li..."
9,TKT-500009,en,Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm Alex Ahonen. Not sure if this is on my...",Question about last month’s fees,"Hi, I'm [PERSON]. Not sure if this is on my si...","Question about last month’s fees|| , I'm [PERS...","fees sure tested device, card 4970 7040 tested..."


In [25]:
# Write DataFrame to Postgres
tickets_cleaned.to_sql(
    name="ticket_preprocessed",   
    con=engine,
    if_exists="replace",           
    index=False,                   
    dtype=None                     
)

print("Data successfully written to 'ticket_preprocessed' table.")

Data successfully written to 'ticket_preprocessed' table.
