In [8]:
import pandas as pd 
import numpy as np 
import os

In [9]:
# Load every translated batch CSV found in `data_dir` and stack them into one frame.
data_dir = "translations/v2"

# os.listdir returns entries in arbitrary order and includes non-CSV entries,
# so sort for a reproducible row order and keep only the .csv files.
batch_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))

# Read all batches first and concatenate once, rather than re-copying df1 on every iteration.
batch_frames = [pd.read_csv(os.path.join(data_dir, batch_file)) for batch_file in batch_files]

# pd.concat raises on an empty list; fall back to an empty frame in that case.
df1 = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   combined  2000 non-null   object
 1   language  2000 non-null   object
 2   queue     2000 non-null   object
dtypes: object(3)
memory usage: 47.0+ KB


In [21]:
# Strategy: stack the English tickets on top of the translated rows (df1),
# then drop any rows that end up duplicated.
df = pd.read_csv('data/aa_dataset-tickets-multi-lang-5-2-50-version.csv')

# English-only view of the raw dataset
df_eng = df.loc[df['language'] == "en"].copy()

# Keep English rows whose subject is neither null nor an empty string
has_subject = df_eng['subject'].notnull() & (df_eng['subject'] != '')
df_chunk = df_eng.loc[has_subject].copy()

# Model input text is "<subject> [SEP] <body>"
df_chunk['combined'] = df_chunk['subject'] + ' [SEP] ' + df_chunk['body']
df_chunk = df_chunk[['combined', 'language', 'queue']]

print("Duplicated rows in df1:", df1.duplicated().sum())

# drop_duplicates keeps the first occurrence, so the index has gaps afterwards
df_combined = pd.concat([df_chunk, df1], axis=0, ignore_index=True).drop_duplicates()

df_combined.info()

Duplicated rows in df1: 1
<class 'pandas.core.frame.DataFrame'>
Index: 15230 entries, 0 to 15730
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   combined  15230 non-null  object
 1   language  15230 non-null  object
 2   queue     15230 non-null  object
dtypes: object(3)
memory usage: 475.9+ KB


In [22]:
# Print how many English tickets fall in each queue, in order of first appearance.
for queue_name in df_eng['queue'].unique():
    n_rows = int((df_eng['queue'] == queue_name).sum())
    print(f"{queue_name}: {n_rows}")

Technical Support: 4737
Returns and Exchanges: 820
Billing and Payments: 1595
Sales and Pre-Sales: 513
Service Outages and Maintenance: 664
Product Support: 3073
IT Support: 1942
Customer Service: 2410
Human Resources: 348
General Inquiry: 236


In [None]:
# Queues whose English rows are down-sampled.
queues = ['Technical Support', 'Billing and Payments', 'Product Support', 'IT Support', 'Customer Service']
# Fraction of each listed queue's English rows to drop.
drop_frac = 0.4
mask = (df_combined['queue'].isin(queues)) & (df_combined['language'] == 'en')

# Get indices to drop: sample 40% of the English rows from each queue separately,
# so every listed queue is reduced by the same proportion.
drop_indices = []
for q in queues:
    in_queue = mask & (df_combined['queue'] == q)
    idx = df_combined[in_queue].sample(frac=drop_frac, random_state=42).index
    drop_indices.extend(idx)

# Drop the selected rows and renumber the index from 0
df_reduced = df_combined.drop(index=drop_indices).reset_index(drop=True)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10597 entries, 0 to 10596
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   combined  10597 non-null  object
 1   language  10597 non-null  object
 2   queue     10597 non-null  object
dtypes: object(3)
memory usage: 248.5+ KB


In [27]:
# Print the number of rows per language in the reduced dataset.
for lang_code in df_reduced['language'].unique():
    n_rows = int((df_reduced['language'] == lang_code).sum())
    print(f"{lang_code}: {n_rows}")

en: 9098
yor: 500
ibo: 500
hau: 499


In [28]:
# Write the reduced dataset to disk; index=False keeps the row index out of the CSV.
df_reduced.to_csv('data/final_dataset.csv', index=False)

In [None]:
# Convert every raw translated batch from wide format (one column per target
# language) to long format (one row per text, with columns combined/language/queue).
batch_dir = "translations"
columns = ["yor_Latn", "ibo_Latn", "hau_Latn"]

# Only pick up the raw batch files. Outputs written by this notebook share the
# "translated_batch" prefix but no longer contain the translation columns, so
# re-running the cell must not feed them back in.
batch_files = sorted(
    f for f in os.listdir(batch_dir)
    if f.startswith("translated_batch")
    and f.endswith(".csv")
    and not f.endswith(("_v2.csv", "_combined.csv"))
)

for batch_file in batch_files:
    batch_df = pd.read_csv(os.path.join(batch_dir, batch_file))

    # Start with the original rows minus the translation columns.
    pieces = [batch_df.drop(columns=columns)]

    # Add one block of rows per translation column. The language code is the
    # part of the column name before the underscore ("yor_Latn" -> "yor").
    for col in columns:
        pieces.append(
            pd.DataFrame(
                {
                    "combined": batch_df[col],
                    "language": col.split("_")[0],
                    "queue": batch_df["queue"],
                }
            )
        )

    long_df = pd.concat(pieces, axis=0, ignore_index=True)

    # Name the output after its input file so each batch gets its own file
    # (e.g. translated_batch0.csv -> translated_batch0_v2.csv).
    # NOTE(review): the loading cell reads from "translations/v2" — confirm
    # whether these files are meant to be written or moved there.
    batch_name = os.path.splitext(batch_file)[0]
    long_df.to_csv(os.path.join(batch_dir, f"{batch_name}_v2.csv"), index=False)
    

In [17]:
# Load the first raw translated batch and check how many rows are Technical Support.
df = pd.read_csv("translations/translated_batch0.csv")
n_tech_support = int((df["queue"] == "Technical Support").sum())
print(n_tech_support)
df.head()

50


Unnamed: 0,combined,language,queue,yor_Latn,ibo_Latn,hau_Latn
0,Assistance with IFTTT Integration [SEP] Lookin...,en,Technical Support,Ìrànlọ́wọ́ nínú ìdarapọ̀ IFTTT [SEP] Ṣíwá àwọn...,Enyemaka na IFTTT Integration [SEP] Ị na-achọ ...,Taimako tare da IFTTT Haɗuwa [SEP] Neman cikak...
1,Incident of Medical Data Security [SEP] Custom...,en,Technical Support,Ìṣẹ̀lẹ̀ Ìṣirò Ìsọfúnni Ìsọfúnni Ìsọfúnni Ìsọfú...,Ihe omume nke Medical Data Security [SEP] Nkwa...,Rikicin Tsaron Bayanai na Likita [SEP] Tallafi...
2,Guidance on Integrating Microsoft Azure with M...,en,Technical Support,Ìdarí lórí kíkó Microsoft Azure jọ sí Monday.c...,Nduzi na ijikọta Microsoft Azure na Monday.com...,Jagora kan Haɗa Microsoft Azure tare da Ayyuka...
3,System Performance Problems [SEP] The company ...,en,Technical Support,Awọn iṣoro iṣẹ eto [SEP] Ile-iṣẹ naa pade awọn...,Nsogbu arụmọrụ usoro [SEP] Ụlọ ọrụ ahụ zutere ...,Matsalolin Ayyukan Tsarin [SEP] Kamfanin ya sa...
4,Detected Unauthorized Access Attempt [SEP] The...,en,Technical Support,A ti rí ìfilọ́ sí ìfilọ́ sí ìfilọ́ sí ìfilọ́ s...,Achọpụtala Mgbalị Access A Na-anaghị Enye Iwu ...,An gano Yunkurin Samun izini mara izini [SEP] ...


Remove the translation columns from the df.
Create a new df.
Add the Yoruba, Igbo and Hausa data to the new df.
Concat both dfs.

In [22]:
# Reshape a single batch from wide (one column per language) to long format.
df = pd.read_csv("translations/translated_batch0.csv")

columns = ["yor_Latn", "ibo_Latn", "hau_Latn"]

# Set the translations aside, then strip them from the base frame
ff = df[columns].copy()
df = df.drop(columns=columns)

# Original rows first, followed by one block of rows per translation column
pieces = [df]
for col in columns:
    lang_code = col.split("_")[0]  # "yor_Latn" -> "yor"
    pieces.append(
        pd.DataFrame({"combined": ff[col], "language": lang_code, "queue": df["queue"]})
    )

df = pd.concat(pieces, axis=0, ignore_index=True)

df.tail(10)

Unnamed: 0,combined,language,queue
390,Matsalar da ke tattare da Bayanan Bayanai [SEP...,hau,Returns and Exchanges
391,Sanarwar Lokaci na Tsarin [SEP] An sami katsew...,hau,Returns and Exchanges
392,Sauran ra'ayoyin dawowar saka hannun jari [SEP...,hau,Returns and Exchanges
393,Taimako da ake buƙata don batun Haɗuwa [SEP] A...,hau,Returns and Exchanges
394,Bayanin Tsaro na Likita [SEP] Shin zai yiwu a ...,hau,Returns and Exchanges
395,Tsaron Bayanai a Kula da Lafiya [SEP] Za ka iy...,hau,Returns and Exchanges
396,Taimako Game da Kamfanin Talla [SEP] Kamfanin ...,hau,Returns and Exchanges
397,Rahoton dawowar saka hannun jari Rashin daidai...,hau,Returns and Exchanges
398,Bambanci a cikin tsinkayar saka hannun jari sa...,hau,Returns and Exchanges
399,Tallafawa Batutuwa na Fasaha [SEP] Kayan aikin...,hau,Returns and Exchanges


In [None]:
# Save the long-format batch. index=False keeps the row index out of the file,
# so re-reading it does not produce a spurious "Unnamed: 0" column.
df.to_csv("translations/translated_batch0_combined.csv", index=False)

In [23]:
# Summarize df: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   combined  400 non-null    object
 1   language  400 non-null    object
 2   queue     400 non-null    object
dtypes: object(3)
memory usage: 9.5+ KB


In [24]:
# Preview the first five rows of df.
df.head()

Unnamed: 0,combined,language,queue
0,Assistance with IFTTT Integration [SEP] Lookin...,en,Technical Support
1,Incident of Medical Data Security [SEP] Custom...,en,Technical Support
2,Guidance on Integrating Microsoft Azure with M...,en,Technical Support
3,System Performance Problems [SEP] The company ...,en,Technical Support
4,Detected Unauthorized Access Attempt [SEP] The...,en,Technical Support


In [25]:
# Number of rows whose language code is "ibo"
print(int((df["language"] == "ibo").sum()))

100


In [26]:
# Number of rows in the "Technical Support" queue, across all languages
print(int((df["queue"] == "Technical Support").sum()))

200
