In [2]:
stt_pecha_tools = "01_stt_pecha_tools.tsv"
prodigy = "02_prodigy.tsv"
mv = "03_mv_saymore.tsv"

In [3]:
import pandas as pd
pecha_tools_df = pd.read_csv(stt_pecha_tools, sep='\t')
prodigy_df = pd.read_csv(prodigy, sep='\t')
mv_df = pd.read_csv(mv, sep='\t')

In [4]:
df = pd.concat([pecha_tools_df, prodigy_df, mv_df], ignore_index=True)

# clean the combined tsv

In [5]:
import re
text = "ཧ་ཧ་ཧ་ཧ་ཧ་འེ་"
text = re.sub(r'ཧ་ཧ་(ཧ་)+', r'ཧ་ཧ་ཧ་', text)
print(text)

ཧ་ཧ་ཧ་འེ་


In [6]:
import re
def clean_transcription(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.strip()

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+།", "།", text)

    text = re.sub(r"ཧཧཧ+", "ཧཧཧ", text)
    text = re.sub(r'འེ་འེ་(འེ་)+', r'འེ་འེ་འེ་', text)
    text = re.sub(r'ཧ་ཧ་(ཧ་)+', r'ཧ་ཧ་ཧ་', text)

    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\/\{\}\(\)\༽\》\༼\《\༅\༄\༈\༑\༠]'
    
    text = re.sub(chars_to_ignore_regex, '', text)+" "
    return text

def check_if_regex(text):
    text = str(text)
    regex = re.compile(r'[a-zA-Z]+')
    match = re.search(regex, text)
    
    # Check if a match was found.
    if match:
        return True
    else:
        return False

In [7]:
df.shape

(804636, 6)

In [8]:
df = df[df['uni'].str.len() > 0]

In [9]:
df = df[df['uni'].str.len() < 994]

In [10]:
df = df[~df['uni'].apply(check_if_regex)]

In [11]:
df.shape

(804477, 6)

In [12]:
df['uni'] = df['uni'].map(clean_transcription)

In [13]:
df['url'] = df['url'].map(lambda x : x.replace('#','%23'))

In [14]:
df = df[df['file_name'] != 'STT_AB00148_0687_2124469_to_2126579']

In [15]:
df.drop_duplicates(subset='file_name', keep="first", inplace=True)

In [16]:
df.to_csv('04_combine_all.tsv', sep='\t', index=False)

In [25]:
df[['file_name','grade']].groupby('grade').count()

Unnamed: 0_level_0,file_name
grade,Unnamed: 1_level_1
2,446275
3,358199


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Splitting the data
train, test_temp = train_test_split(df, test_size=0.1, random_state=42, stratify=df['dept'])
val, test = train_test_split(test_temp, test_size=0.5, random_state=42, stratify=test_temp['dept'])         

In [20]:
len(train), len(val), len(test), len(train)+len(val)+len(test), len(df)

(724026, 40224, 40224, 804474, 804474)

In [39]:
train.to_csv('train.tsv', sep='\t', index=False)
val.to_csv(  'val.tsv', sep='\t', index=False)
test.to_csv( 'test.tsv', sep='\t', index=False)

In [40]:
df['uni'].str.contains('ཧཧཧ').value_counts()

uni
False    642034
True        559
Name: count, dtype: int64

In [35]:
import pandas as pd

df = pd.read_csv('04_combine_all.tsv', sep='\t')

In [36]:
import os
def getTimeSpan(filename):
    filename = os.path.splitext(os.path.basename(filename))[0]
    try:
        if "_to_" in filename:
            start, end = filename.split("_to_")
            start = start.split("_")[-1]
            end = end.split("_")[0]
            end = float(end)
            start = float(start)
            return abs((end - start)/1000)
        else:
            start, end = filename.split("-")
            start = start.split("_")[-1]
            end = end.split("_")[0]
            end = float(end)
            start = float(start)
            return abs(end - start)
    except Exception as err:
        print(f"filename is:'{filename}'. Could not parse to get time span.")
        return 0


In [37]:
df['time_span'] = df['file_name'].map(getTimeSpan)

In [38]:
df.groupby('dept')['time_span'].sum()/60/60

dept
STT_AB     98.450803
STT_CS     76.800410
STT_MV    414.516183
STT_NS    127.994451
STT_TT    186.008286
Name: time_span, dtype: float64

In [39]:
df['time_span'].sum()/60/60

903.7701333333339

In [40]:
# df.sort_values(by='time_span', ascending=False, inplace=True)
df = df.sample(frac = 1)

In [41]:
df.iloc[0:100,[0, 1, 3, 5]].to_csv("random_100.csv", index=False)
df.iloc[0:100,[0, 1, 3, 5]]

Unnamed: 0,file_name,uni,url,grade
549000,STT_AB00213_0110_566083_to_574018,དེ་ལྟར་མོ་རང་གིས་རང་གི་ཕ་མའི་སར་ལོག་རྒྱུའི་སྐོ...,https://d38pmlk0v88drf.cloudfront.net/wav/STT_...,2
591840,STT_MV0740_0096_683251_to_689676.wav,རང་གིས་བོད་ལ་འགྲོ་གི་མིན་ལབ་ཙང་། མ་དགའ་པ་བྱེད་...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,2
475597,STT_AB00083_0044_226319_to_230774,པདྨ་རྡོ་རྗེ་ཀློག་སློབ་པ་ལ་ང་ལས་ཀྱང་ལྷག་པ་ཞིག་ཏ...,https://d38pmlk0v88drf.cloudfront.net/wav/STT_...,2
790086,STT_MV0179_0105_2539705_to_2546607.wav,བལ་གི་ཐག་པ་བསྒྲིལ་ན་སེང་གི་བཟུང་ཐུབ་ཀི་རེད། མང...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,2
579142,STT_MV0683_0300_2157610_to_2163403.wav,ཨ་ཅག་ཞོ་ཤོ ཁ་སང་མངགས་པ་དེ། ནོར་རྒྱས་ཉ་ལྔ་ད། མཚ...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,2
...,...,...,...,...
38678,STT_TT00047_01270.600-01272.200.wav,ཨེ་ནས་བརྗོད་པའམ་བཀླགས་པའི་,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,2
424085,STT_TT00170_00202.500-00207.050,ཧ་ལམ་རྟག་ཅིང་བརྟན་པ་བྱས་ནས། འགྱུར་བ་རྩ་བ་ཉིད་ན...,https://d38pmlk0v88drf.cloudfront.net/wav/STT_...,3
79284,STT_NS0140_0185_1321579_to_1331018.mp3,དེ་ཚོ་ཚང་མ་སོ་སོ་དང་བླངས་འདོན་ནས་ཡིན་ས་རེད། སྔ...,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,2
566772,STT_MV0195_0182_1275294_to_1281029.wav,སློབ་ཆེན་ལ་འགྲོ་དུས་ངས།,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,2


In [46]:
df[df['grade'] == 3].groupby('dept').count()

Unnamed: 0_level_0,file_name,uni,wylie,url,grade,time_span
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
STT_AB,147,147,147,147,147,147
STT_CS,37398,37398,37398,37398,37398,37398
STT_NS,42588,42588,42588,42588,42588,42588
STT_TT,278066,278066,278066,278066,278066,278066
