In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, XGLMTokenizerFast, XGLMConfig#, LlamaTokenizer, LlamaForCausalLM
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [2]:
train_df =  pd.read_json("/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/train_ted_en-ja")
train_df

Unnamed: 0,talk_id,doc
0,129,"{'en': ['What I'm going to show you first, as ..."
1,769,{'en': ['I'd like to share with you a discover...
2,1160,{'en': ['This is a representation of your brai...
3,1161,"{'en': ['I'm Jessi, and this is my suitcase.',..."
4,779,{'en': ['Everybody talks about happiness these...
5,1165,"{'en': ['In 2007, I decided that we needed to..."
6,1166,{'en': ['I want you to imagine two couples in...
7,783,{'en': ['If I can leave you with one big idea ...
8,785,{'en': ['I grew up on a steady diet of science...
9,531,{'en': ['Last year at TED I gave an introducti...


In [3]:
val_df = pd.read_json("/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/val_ted_en-ja")
val_df

Unnamed: 0,talk_id,doc
0,1666,"{'en': ['When I was 11, I remember waking up ..."
1,1548,"{'en': ['the Oxford English Dictionary is ""sno..."
2,1553,{'en': ['There are a lot of ways the people ar...
3,1685,"{'en': ['I live in South Central.', 'This is S..."
4,1694,"{'en': ['When I was little, I thought my count..."
5,1699,{'en': ['This is where I live. I live in Kenya...
6,1592,"{'en': ['Five years ago, I experienced a bit ..."
7,1600,"{'en': ['Today I have just one request.', 'Ple..."
8,1617,"{'en': ['Everything I do, and everything I do ..."
9,1634,{'en': ['Photography has been my passion ever...


In [4]:
test_df = pd.read_json("/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/test_ted_en-ja")
test_df

Unnamed: 0,talk_id,doc
0,1922,"{'en': ['Intelligence -- what is it?', 'If we ..."
1,2183,{'en': ['Today I'm going to speak to you abou...
2,1932,{'en': ['I'm going to talk to you tonight abo...
3,1939,"{'en': ['I had brain surgery 18 years ago, an..."
4,1954,{'en': ['In many patriarchal societies and tri...
5,1443,{'en': ['I'd like to invite you to close your ...
6,1829,"{'en': ['So I'm going to talk about trust, an..."
7,1961,{'en': ['The world makes you something that yo...
8,1835,"{'en': ['So, we used to solve big problems.', ..."
9,2102,"{'en': ['This is my niece, Stella.', 'She's ju..."


In [5]:
lang_list = ["ja", "de", "fr", "zh", "ko", "ar"]
lang_to_train_df = {}
lang_to_val_df = {}
lang_to_test_df = {}

for lang in lang_list:
    train_df = pd.read_json(f"/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/train_ted_en-{lang}")
    val_df = pd.read_json(f"/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/val_ted_en-{lang}")
    test_df = pd.read_json(f"/home/sumire/thesis/LLM_Contextual_Prompt_MT/data/iwslt_hf/test_ted_en-{lang}")
    lang_to_train_df[lang] = train_df
    lang_to_val_df[lang] = val_df
    lang_to_test_df[lang] = test_df
    print (lang)
    test_num_sents = 0
    for doc in lang_to_test_df[lang]["doc"]:
        num_sent = len(doc["en"])
        test_num_sents += num_sent
    print (test_num_sents)

ja
2479
de
2385
fr
2516
zh
2502
ko
2470
ar
2506


In [17]:
tokenizer = AutoTokenizer.from_pretrained("upstage/Llama-2-70b-instruct-v2")
data_statistics = []

for lang in lang_list:
    print (lang)
    train_num_docs = len(lang_to_train_df[lang])
    val_num_docs = len(lang_to_val_df[lang])
    test_num_docs = len(lang_to_test_df[lang])

    train_num_sents = 0
    train_all_num_tokens_src = 0
    train_all_num_tokens_tgt = 0

    train_num_tokens_list_src = []
    train_num_tokens_list_tgt = []
    
    
    for doc in lang_to_train_df[lang]["doc"]:
        sent = []
        num_sent = len(doc["en"])
        train_num_sents += num_sent
        for src_or_tgt in ["en", lang]:
            for sent in doc[src_or_tgt]:
                num_tokens = len(tokenizer(sent).input_ids)
                if src_or_tgt == "en":
                    train_all_num_tokens_src += num_tokens
                else:
                    train_all_num_tokens_tgt += num_tokens
                if src_or_tgt == "en":
                    train_num_tokens_list_src.append(num_tokens)
                else:
                    train_num_tokens_list_tgt.append(num_tokens)
        
    train_avg_num_tokens_src = train_all_num_tokens_src/train_num_sents
    train_avg_num_tokens_tgt = train_all_num_tokens_tgt/train_num_sents
    train_max_tokens_per_sent_src = max(train_num_tokens_list_src)
    train_max_tokens_per_sent_tgt = max(train_num_tokens_list_tgt)

    val_num_sents = 0    
    val_all_num_tokens_src = 0
    val_all_num_tokens_tgt = 0
    val_num_tokens_list_src = []
    val_num_tokens_list_tgt = []
    
    for doc in lang_to_val_df[lang]["doc"]:
        sent = []
        num_sent = len(doc["en"])
        val_num_sents += num_sent
        for src_or_tgt in ["en", lang]:
            for sent in doc[src_or_tgt]:
                num_tokens = len(tokenizer(sent).input_ids)
                if src_or_tgt == "en":
                    val_all_num_tokens_src += num_tokens
                else:
                    val_all_num_tokens_tgt += num_tokens
                if src_or_tgt == "en":
                    val_num_tokens_list_src.append(num_tokens)
                else:
                    val_num_tokens_list_tgt.append(num_tokens)
    val_avg_num_tokens_src = val_all_num_tokens_src/val_num_sents        
    val_avg_num_tokens_tgt = val_all_num_tokens_tgt/val_num_sents
    val_max_tokens_per_sent_src = max(val_num_tokens_list_src)
    val_max_tokens_per_sent_tgt = max(val_num_tokens_list_tgt)

    test_num_sents = 0    
    test_all_num_tokens_src = 0
    test_all_num_tokens_tgt = 0
    test_num_tokens_list_src = []
    test_num_tokens_list_tgt = []
    for doc in lang_to_test_df[lang]["doc"]:
        sent = []
        num_sent = len(doc["en"])
        test_num_sents += num_sent
        for src_or_tgt in ["en", lang]:
            for sent in doc[src_or_tgt]:
        #for sent in doc["en"]:
                num_tokens = len(tokenizer(sent).input_ids)
                if src_or_tgt == "en":
                    test_all_num_tokens_src += num_tokens
                else:
                    test_all_num_tokens_tgt += num_tokens
                if src_or_tgt == "en":
                    test_num_tokens_list_src.append(num_tokens)
                else:
                    test_num_tokens_list_tgt.append(num_tokens)
                #test_num_tokens_list.append(num_tokens)
    test_avg_num_tokens_src = test_all_num_tokens_src/test_num_sents
    test_avg_num_tokens_tgt = test_all_num_tokens_tgt/test_num_sents
    test_max_tokens_per_sent_src = max(test_num_tokens_list_src)
    test_max_tokens_per_sent_tgt = max(test_num_tokens_list_tgt)
    
    
    stat_src = pd.DataFrame(
        {
            "num_doc":[train_num_docs, val_num_docs, test_num_docs],
            "num_sent": [train_num_sents, val_num_sents, test_num_sents],
            "max_tokens_per_sent_src":[train_max_tokens_per_sent_src, val_max_tokens_per_sent_src, test_max_tokens_per_sent_src],
            "avg_tokens_per_sent_src":[train_avg_num_tokens_src, val_avg_num_tokens_src, test_avg_num_tokens_src],
        }, 
        index=["train", "val", "test"],
        
    )
    stat_src = stat_src.style.set_caption(f"{lang} Data Statistics")
    data_statistics.append(stat_src)
    print (stat_src)
    
    
    stat_tgt = pd.DataFrame(
        {
            "num_doc":[train_num_docs, val_num_docs, test_num_docs],
            "num_sent": [train_num_sents, val_num_sents, test_num_sents],
            "max_tokens_per_sent_tgt":[train_max_tokens_per_sent_tgt, val_max_tokens_per_sent_tgt, test_max_tokens_per_sent_tgt],
            "avg_tokens_per_sent_tgt":[train_avg_num_tokens_tgt, val_avg_num_tokens_tgt, test_avg_num_tokens_tgt],
        }, 
        index=["train", "val", "test"],
        
    )
    stat_tgt = stat_tgt.style.set_caption(f"{lang} Data Statistics")
    data_statistics.append(stat_tgt)
    print (stat_tgt)
    
          

ja
<pandas.io.formats.style.Styler object at 0x7fbfe7ac9fd0>
<pandas.io.formats.style.Styler object at 0x7fbfe7c88fd0>
de
<pandas.io.formats.style.Styler object at 0x7fbfe7eccc70>
<pandas.io.formats.style.Styler object at 0x7fbfe7a00f70>
fr
<pandas.io.formats.style.Styler object at 0x7fc02c0b3f10>
<pandas.io.formats.style.Styler object at 0x7fbfe7d11fd0>
zh
<pandas.io.formats.style.Styler object at 0x7fbfe7b92fd0>
<pandas.io.formats.style.Styler object at 0x7fbfe7e259d0>
ko
<pandas.io.formats.style.Styler object at 0x7fbfe7c3ffd0>
<pandas.io.formats.style.Styler object at 0x7fbfe7c72b20>
ar
<pandas.io.formats.style.Styler object at 0x7fbfe7c728e0>
<pandas.io.formats.style.Styler object at 0x7fbfe7b069d0>


In [18]:
for i, lang in zip(data_statistics, lang_list):
    display(i)

Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_src,avg_tokens_per_sent_src
train,50,5507,136,24.802978
val,15,964,119,26.272822
test,27,2479,114,24.971359


Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_tgt,avg_tokens_per_sent_tgt
train,50,5507,295,45.992373
val,15,964,307,52.845436
test,27,2479,364,52.684954


Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_src,avg_tokens_per_sent_src
train,50,5589,129,24.061728
val,15,975,135,24.895385
test,27,2385,119,23.159329


Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_tgt,avg_tokens_per_sent_tgt
train,50,5589,146,30.177849
val,15,975,168,31.21641
test,27,2385,135,29.679245


Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_src,avg_tokens_per_sent_src
train,50,5592,129,24.827611
val,15,979,106,26.054137
test,27,2516,114,24.834658


Unnamed: 0,num_doc,num_sent,max_tokens_per_sent_tgt,avg_tokens_per_sent_tgt
train,50,5592,153,31.302754
val,15,979,140,33.357508
test,27,2516,138,31.614865
