In [1]:
from datasets import load_dataset
from random import shuffle

def load_data(data_dir, file):
    return load_dataset(data_dir, data_files=file)   

def getLanguageData(raw_data, lang):
    idx_list = [idx for idx, val in enumerate(raw_data) if val['language'] == lang]

    #shuffle the data
    shuffle(idx_list)
    
    # data = [{
    #     'text': raw_data[idx]['text'],
    #     'label': raw_data[idx]['normalized label'] 
    #     }
    #     for idx in idx_list]

    input_text = [raw_data[idx]['text'] for idx in idx_list]
    label = [raw_data[idx]['normalized label'] for idx in idx_list]
    
    assert len(input_text) == len(label)

    return input_text, label

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def k_folds(k, text, label):
    text_folds = [[] for i in range(k)]
    label_folds = [[] for i in range(k)]
    n_language = len(text)
    
    print('Folding...')
    for lang_indx in range(n_language):
        lang_text = text[lang_indx]
        lang_label = label[lang_indx]
        
        size = len(lang_text)//k
        split=[]
        for i in range(k):
            split.append(i*size)
        if split[-1]!=len(lang_text):
            split.append(len(lang_text))
#         print(split)
            
        for i in range(k):
            text_folds[i].extend(lang_text[split[i]:split[i+1]])
            label_folds[i].extend(lang_label[split[i]:split[i+1]])
            
    for i in range(k):
        print('Fold',i+1,':',len(text_folds[i]),'data')
            
    return text_folds,label_folds

In [3]:
def k_fold_split(k, text_folds,label_folds):
    train_text = []
    train_label = []
    validate_text = []
    validate_label = []
    
    for i in range(k):
        validate_text.append(text_folds[i])
        validate_label.append(label_folds[i])
        
        text =[]
        label=[]
        for j in range(k):
            if(i!=j):
                text.extend(text_folds[i])
                label.extend(label_folds[i])
                
        train_text.append(text)
        train_label.append(label)
        
        print("Split",i+1,"- Training Data:",len(train_text[i]),"- Validation Data:",len(validate_text[i]))
        
    return train_text,train_label,validate_text,validate_label
        

In [4]:
raw_data = load_data("data", "train_copy.csv")['train']
k=2

# #English
# en_text, en_label = getLanguageData(raw_data, 'English')
# #French
# fr_text, fr_label = getLanguageData(raw_data, 'French')
# text_folds,label_folds = k_folds(10, {0:en_text,1:fr_text}, {0:en_label,1:fr_label})
# train_text,train_label,validate_text,validate_label = k_fold_split(10, text_folds,label_folds)

#English
print('English')
en_text, en_label = getLanguageData(raw_data, 'English')
en_text_folds,en_label_folds = k_folds(k, [en_text], [en_label])
en_train_text,en_train_label,en_validate_text,en_validate_label = k_fold_split(k, en_text_folds,en_label_folds)

#French
print('\nFrench')
fr_text, fr_label = getLanguageData(raw_data, 'French')
fr_text_folds,fr_label_folds = k_folds(k, [fr_text], [fr_label])
fr_train_text,fr_train_label,fr_validate_text,fr_validate_label = k_fold_split(k, fr_text_folds,fr_label_folds)
# print(text_folds,label_folds)

Using custom data configuration data-1ba08ea01a127387
Found cached dataset csv (C:/Users/avdhi/.cache/huggingface/datasets/csv/data-1ba08ea01a127387/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 498.79it/s]


English
Folding...
Fold 1 : 793 data
Fold 2 : 794 data
Split 1 - Training Data: 793 - Validation Data: 793
Split 2 - Training Data: 794 - Validation Data: 794

French
Folding...
Fold 1 : 794 data
Fold 2 : 794 data
Split 1 - Training Data: 794 - Validation Data: 794
Split 2 - Training Data: 794 - Validation Data: 794
