In [None]:
from pandas import read_excel, read_parquet


# Local workbook holding the translated `en_*` columns.
filename = "../../data/input/wti_raw.xlsx"
ds_translated = read_excel(filename)

# Original dataset; the hf:// path is resolved remotely, so this read needs network access.
ds_original = read_parquet(
    "hf://datasets/airesearch/WangchanThaiInstruct_7.24/data/train-00000-of-00001.parquet"
)

# Keep only rows whose Task_type is one of these values. `@tasks` lets pandas look the
# list up itself instead of pasting its Python repr into the expression, which would
# break as soon as a task name contained a quote character.
tasks = ["Multiple choice", "Closed QA", "Summarization"]
ds_original = ds_original.query("Task_type in @tasks")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Inner-join the original rows with the translated columns on the shared "ID" key.
df = ds_original.merge(right=ds_translated, how="inner", on="ID")

# Report (rows, columns) for both inputs and for the joined frame.
for label, frame in [
    ("ds_original:", ds_original),
    ("ds_translated:", ds_translated),
    ("df:", df),
]:
    print(label, frame.shape)

ds_original: (2346, 9)
ds_translated: (2346, 4)
df: (2346, 12)


In [5]:
# Display the column labels of the merged frame.
df.columns

Index(['ID', 'Domain', 'Instruction', 'Input', 'Output', 'Thai_Specfic',
       'Tags', 'Task_type', 'License', 'en_instruction', 'en_input',
       'en_output'],
      dtype='object')

In [None]:
# Save the merged frame as an Excel workbook; index=False leaves the row index out of the file.
df.to_excel("../../data/input/wti_semi_raw.xlsx", index=False)

In [None]:
from pandas import read_excel


# Load the semi-raw workbook into `df`.
df = read_excel("../../data/input/wti_semi_raw.xlsx")

In [2]:
# Preview the first 20 rows of the three `en_*` columns.
df[["en_instruction", "en_input", "en_output"]].head(20)

Unnamed: 0,en_instruction,en_input,en_output
0,What are the top 5 most popular funds that peo...,"In the past week, from June 11 to June 17, 202...",What are the top 5 most popular funds searched...
1,"Please summarize the article ""Concerns of QT C...","At the end of last month, Christopher Waller, ...","At the end of last month, Christopher Waller, ..."
2,"Sure, here is the translation:\n\nThai: ช่วยสร...",The Situation of the Market in Terms of Growth...,"The Growth Stock and Value Stock markets, from..."
3,"Please summarize the article ""Growth and Value...",It is believed that many people have heard of ...,**English:**\n\nThe Growth and Value Rotation ...
4,"Please summarize the article titled ""What Happ...",**What Happened to LUNA Coin? Why Did the Pric...,"Since May 7, 2022, the UST coin has de-pegged ..."
5,Please summarize the article and explain what ...,1. Do Kwan is a 31-year-old South Korean techn...,Do Kwon is a 31-year-old South Korean technolo...
6,"Sure, here is the translation of the provided ...",Why are private equity stocks off the market w...,Private equity stocks off the market are worth...
7,Which of the following is not a reason why the...,"A. Since 1955, the United States has never had...",The correct answer is (J) because the inflatio...
8,"Sure, here is the translation:\n\nThai: จงสรุป...","SMIC Grows Threefold Despite U.S. Ban\n\n""SMIC...","""SMIC"" or Semiconductor Manufacturing Internat..."
9,What is a yield curve?,English: The yield curve has three types:\n1. ...,There are 3 types:\n1. Normal curve: The yield...


In [3]:
col = "en_instruction"

# Two sample cells from that column, selected by row label.
test_1, test_2 = (df.loc[row_label, col] for row_label in (11, 14))

# Print both samples with a "=====" rule between them.
print(test_1, test_2, sep="\n=====\n")

Sure, here is the translation:

"Please summarize the article 'Crown Token and Last Idol' with a price surge of over 664%. NFTs to watch in 2022!!"
=====
Sure, here is the translation:

Thai: ช่วยสรุปบทความ หุ้นปูติน-รัสเซีย VS หุ้นโลก

English: Please summarize the article "Putin-Russia Stocks VS Global Stocks".


In [4]:
import re

# An "English:" label at the start of a line, optionally wrapped in markdown bold
# (e.g. "**English:**"), together with the whitespace that follows it.
_ENGLISH_LABEL = re.compile(
    r"^[ \t]*\**[ \t]*english[ \t]*\**[ \t]*:[ \t]*\**\s*",
    re.IGNORECASE | re.MULTILINE,
)


def clean_text(text: str) -> str:
    """Strip translation boilerplate from a single cell value.

    The first matching pattern wins:

    1. ``English: <text>`` -- keep what follows the first line-leading
       "English:" label (the label may be wrapped in ``**``).
    2. ``...here is the translation...\\n\\n"<text>"`` -- keep what follows the
       first ``\\n\\n"`` and drop the closing double quote if there is one.
    3. ``"<text>"`` -- drop one pair of wrapping double quotes.

    Afterwards every ``**`` (markdown bold marker) is removed.

    Args:
        text: Cell value to clean. Non-string values (for example the NaN that
            pandas uses for empty cells) are returned unchanged.

    Returns:
        The cleaned string, or the original value when it is not a string.
    """
    # Explicit guard instead of a bare ``except``: missing cells pass through
    # untouched while genuine programming errors are no longer swallowed.
    if not isinstance(text, str):
        return text

    label = _ENGLISH_LABEL.search(text)

    if label is not None:
        # Pattern 1: only a real "English:" label counts, so a sentence that merely
        # mentions the word "English" is no longer cut off in the middle.
        text = text[label.end():]
    elif "here is the translation" in text.lower() and '\n\n"' in text:
        # Pattern 2: the offset is taken from `text` itself because lower-casing can
        # change the length of some Unicode strings.
        start = text.find('\n\n"') + len('\n\n"')
        text = text[start:]
        trimmed = text.rstrip()
        # Remove the closing quote only when it is really there; never chop an
        # arbitrary final character.
        if trimmed.endswith('"'):
            text = trimmed[:-1]
    elif len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        # Pattern 3: both ends must be double quotes.
        text = text[1:-1]

    return text.replace("**", "")

In [8]:
# Run the cleaner on both samples and print the results with a "=====" rule between them.
print(*(clean_text(sample) for sample in (test_1, test_2)), sep="\n=====\n")

Please summarize the article 'Crown Token and Last Idol' with a price surge of over 664%. NFTs to watch in 2022!!
=====
Please summarize the article "Putin-Russia Stocks VS Global Stocks".


In [9]:
# Clean each `en_*` column cell by cell and write the result back over that column.
for column_name in ("en_instruction", "en_input", "en_output"):
    cleaned = df[column_name].apply(clean_text)
    df[column_name] = cleaned

In [None]:
# Save the cleaned frame as a new workbook; index=False leaves the row index out of the file.
df.to_excel("../../data/input/wti_clean.xlsx", index=False)

## After cleaning manually

In [1]:
from pandas import read_excel


# Load the cleaned workbook into `df`.
df = read_excel("../../data/input/wti_clean.xlsx")

In [2]:
translated_columns = ["en_instruction", "en_input", "en_output"]

# One "<row label> <column>" entry per cell that still contains the word "translation".
# `value != value` is true only for NaN, so missing cells are skipped before the
# substring test.
cases = [
    f"{row_label} {column}"
    for row_label, row in df.iterrows()
    for column in translated_columns
    if not (row[column] != row[column]) and "translation" in row[column]
]
len(cases)

23

In [118]:
# Pick one flagged cell by its position in `cases`; edit `i` by hand to step through them.
i = 22

# Each entry has the form "<row label> <column>".
idx, c = cases[i].split()
idx = int(idx)  # split() yields strings; convert the row label back to an int for .loc
print(c)
df.loc[idx, c]  # last expression, so the notebook displays the cell's text

en_input


"Amendment of the Plan\nSupreme Court Decision No. 5818/2546 (Source: Rehabilitation Book by Ajarn Auan)\nIn this case, there were 5 remaining creditors eligible to receive debt repayment in the business rehabilitation. The rehabilitation plan proposed to fully repay 3 creditors. For creditor T. Company (Creditor Group 4), which is the parent company of the debtor, and the objector (Creditor No. 9), the plan stipulated that these two creditors would receive only 15% of their debt repayment. Only the objector opposed the plan, citing unfair treatment as their debt was reduced by 85% and they would receive only 15% repayment.\nThe plan preparer filed a petition to amend the plan, increasing the debt repayment to 45%. Although this amendment was not legally compliant, it demonstrated that the objector indeed received unfair treatment regarding the insufficient amount of debt repayment according to the plan. However, on the day of plan consideration, the objector submitted a statement conf

In [110]:
# Display the current value of `idx`.
idx

2194

In [104]:
# n = number of characters in this sign-off sentence, including its two leading newlines.
n = len("\n\nI hope these translations are helpful! If you need any further assistance, feel free to ask.")

In [105]:
# Remove the sign-off sentence from the end of the selected cell, but only when the cell
# really ends with it. A blind `[:-n]` slice would cut real text if this cell were re-run
# or executed while (idx, c) points at a different cell.
sign_off = "\n\nI hope these translations are helpful! If you need any further assistance, feel free to ask."
selected = df.loc[idx, c]
if isinstance(selected, str) and selected.endswith(sign_off):
    df.loc[idx, c] = selected[: -len(sign_off)]

In [65]:
# Markdown horizontal-rule separators, as they appear at the start of a cell and
# before a trailing section.
leading_rule = "\n---\n\n"
trailing_rule = "\n\n---\n\n"

for case in cases:
    row_label, column = case.split()
    row_label = int(row_label)
    text = df.loc[row_label, column]
    if not isinstance(text, str):
        continue

    # Drop the rule only when it really is the prefix. Slicing off len(rule) characters
    # whenever the rule occurs *anywhere* would delete the first characters of real text.
    if text.startswith(leading_rule):
        text = text[len(leading_rule):]

    # Then drop a later rule together with everything after it. Working on the updated
    # `text` matters: the trailing rule contains the leading rule as a substring, so two
    # independent checks against the original value would overwrite each other.
    cut = text.find(trailing_rule)
    if cut != -1:
        text = text[:cut]

    df.loc[row_label, column] = text

In [None]:
# Write the edited frame back over the workbook this section loaded it from (row index omitted).
df.to_excel("../../data/input/wti_clean.xlsx", index=False)