In [None]:
import os

import pandas as pd
import numpy as np

from collections import Counter
import json

#from turkish.deasciifier import Deasciifier !! not working
import text_format

from sklearn.model_selection import train_test_split

import IPython.display as ipd
import librosa

import matplotlib.pyplot as plt
#%matplotlib inline

# A) Dataset Loader

**Choose which dataset to work with**

In [None]:
# --- Mozilla Common Voice, Turkish: validated annotations ---
corpus_release = "cv-corpus-5.1-2020-06-22"

dataset_dir = os.path.join("..", "data", "Datasets", corpus_release, "tr")

# dataset_name labels the raw data, new_dataset_name prefixes the files exported later
dataset_name = corpus_release + "_validated"
new_dataset_name = dataset_name + "_simple"

clip_dir = os.path.join(dataset_dir, "clips")
tsv_dir = os.path.join(dataset_dir, "validated.tsv")  # path of the original annotation file

# The annotation table is tab separated
df = pd.read_csv(tsv_dir, sep="\t")

sentences_original = df["sentence"].tolist()
paths = df["path"].tolist()


In [None]:
# --- METUbet ---
dataset_name = "METUbet"
new_dataset_name = dataset_name  # exported files keep the plain dataset name

dataset_dir = os.path.join("..", "data", "Datasets", dataset_name, "data")

# clip_dir is not assigned in this cell
csv_dir = os.path.join(dataset_dir, "METUbet.csv")  # path of the original annotation file

df = pd.read_csv(csv_dir, delimiter=",")

sentences_original = df["sentence"].tolist()
IDs = df["path"].tolist()

In [None]:
df  # display the annotation table that is currently loaded

#ti20 English Dataset
# Loads the annotation CSV of one ti20 split; `tt` names both the split
# sub-directory and the CSV file ("ti20_<tt>.csv").
tt = "train"

dataset_name = "ti20_"+tt

dataset_dir = os.path.join("..","..","Datasets","ti20",tt)

csv_dir = os.path.join(dataset_dir,"ti20_"+tt+".csv")

df = pd.read_csv(csv_dir,delimiter=",")

# NOTE: unlike the Turkish loaders, this assigns `sentences` directly
# (no `sentences_original`).
sentences = df['sentence'].tolist()
paths = df['path'].tolist()

# B) Format The Transcriptions

First remove the transcriptions that contain non-Turkish characters, then format the remaining transcriptions using posterior knowledge of the dataset.

true_alphabet = ['a','b','c','ç','d','e','f','g','ğ','h','ı','i','j','k','l','m','n','o','ö','p','r','s','ş','t','u','ü','v','y','z']

In [None]:
# Remove every utterance whose transcription contains a symbol that does not
# belong to the Turkish alphabet. The symbol list was collected by inspecting
# the data after loading it.
non_Turkish = ['x','X','w','W','q','Q','ë']

# Row positions of the transcriptions that contain at least one such symbol
bad_indexes = [
    position
    for position, text in enumerate(sentences_original)
    if any(char in non_Turkish for char in text)
]

print("{} utterences deleted from the data set.".format(len(bad_indexes)))

# Drop the offending rows and renumber the remaining ones from zero
df = df.drop(df.index[bad_indexes]).reset_index(drop=True)
print("Dataset updated")

sentences_turkish = df['sentence'].tolist()
paths = df['path'].tolist()
print("Remaining sentences: {}.".format(len(sentences_turkish)))

**Mozilla**

In [None]:
# format the sentences
sentences = [text_format.clean_text(sentence) for sentence in sentences_turkish if sentence] 

# Update dataframe
df['sentence'] = sentences

**METUbet**

In [None]:
# Read the transcriptions from the dataframe itself, not from the list captured
# at load time: if the non-Turkish filter removed rows, the load-time list is
# longer than the dataframe and the column assignment below would fail.
raw_sentences = df['sentence'].tolist()

# metubet has de-ascifier indicators
# NOTE(review): assumes METUbet_formatter returns one string per input sentence — confirm
formatted_sentences = text_format.METUbet_formatter(raw_sentences)

# Empty results are skipped; drop their rows too so list and dataframe stay aligned
has_text = [bool(sentence) for sentence in formatted_sentences]
if not all(has_text):
    df = df.loc[has_text].reset_index(drop=True)
    formatted_sentences = [sentence for sentence, keep in zip(formatted_sentences, has_text) if keep]

sentences = [text_format.clean_text(sentence) for sentence in formatted_sentences]

df['turkish_sentence'] = sentences

# C) Dataset Analysis

## C.1) Sentence Length Analysis

In [None]:
# Tally how many times each distinct transcription occurs
sentence_counter = Counter(sentences)

print("There are {} unique sentences.".format(len(sentence_counter)))

# Distinct transcriptions, in order of first appearance
sentence_list = list(sentence_counter)

In [None]:
# Write the distinct transcriptions to a text file, one per line
txt_name = new_dataset_name+"_sentences.txt"
out_dir = os.path.join(dataset_dir,txt_name)

with open(out_dir, 'w', encoding="utf8") as f:
    f.writelines(sentence+'\n' for sentence in sentence_list)

## C.2) Word Count Distribution Analysis

In [None]:
# Count how often every word occurs, print the most frequent ones and plot
# the top of the ranking in alphabetical order.
n_printed = 10  # rows of the printed ranking
n_plotted = 50  # bars in the chart; the chart title is derived from it

word_counter = Counter()
for sentence in sentences: #de_ascii_sentences:
    # split() without a separator collapses runs of whitespace, so empty
    # strings are never counted as words (split(' ') would count them)
    word_counter.update(sentence.strip('.').split())

# len() of the counter is the number of DISTINCT words
print("There are {} unique words in the dataset.".format(len(word_counter)))

print("\nThe {} most common words with the number of appearances:\n".format(n_printed))
n_most_common = word_counter.most_common(n_printed)
for word, count in n_most_common:
    print("{}\t |\t {}".format(word, count))

# Most frequent words, re-ordered alphabetically (case-insensitive) for the chart
top_common = dict(word_counter.most_common(n_plotted))
sorted_top_common = dict(sorted(top_common.items(), key=lambda pair: pair[0].lower()))

fig, ax = plt.subplots(figsize=(20,8))
ax.bar(list(sorted_top_common.keys()), list(sorted_top_common.values()))
ax.set_ylabel('Number of Appearances',fontsize=14)
ax.set_xlabel('Words',fontsize=14)
# The title reports the number of words that are actually plotted
ax.set_title('Top {} Appearing Words'.format(n_plotted),fontsize=14)
plt.xticks(fontsize=20, rotation=90)
plt.yticks(fontsize=9)
#plt.savefig("Sentece Length Distribution.png")
plt.show()

**Export Word Dict**

In [None]:
# Store the word counts as a human-readable JSON file inside the dataset directory
json_name = dataset_name+"-word_dict.json"
json_path = os.path.join(dataset_dir,json_name)

# ensure_ascii=False keeps non-ASCII letters as they are instead of \u escapes
serialized_counts = json.dumps(word_counter, ensure_ascii=False, indent=4)
with open(json_path, 'w', encoding='utf-8') as outfile:
    outfile.write(serialized_counts)

**Compare Word Dicts**

In [None]:
# Load a previously exported word dictionary to compare against.
# The export cell of this notebook writes these files with encoding='utf-8',
# so they have to be read back as UTF-8 (reading them as UTF-16 fails or
# yields garbage).
# NOTE(review): the export writes into `dataset_dir`, while this reads from the
# working directory — confirm the file is expected to be copied here.
json_name = "METUbet-word_dict.json"
with open(json_name, 'r', encoding='utf-8') as infile:
    word_counter2 = json.load(infile)

In [None]:
# Compare the vocabularies of the two word dictionaries
word_set1, word_set2 = set(word_counter), set(word_counter2)

intersection = word_set1 & word_set2
union = word_set1 | word_set2

print("There are:\n")
report_lines = (
    ("\t{} words in set 1.", word_set1),
    ("\t{} words in set 2.", word_set2),
    ("\t{} words the intersection set.", intersection),
    ("\t{} words in the union set.", union),
)
for template, word_group in report_lines:
    print(template.format(len(word_group)))

**BERTurk**

In [None]:
# Read the BERTurk vocabulary file, one token per line.
# Iterating over the file yields exactly the lines it contains; the previous
# readline() loop appended one extra empty string at end-of-file, which ended
# up as a bogus "word" in the vocabulary set.
with open("vocab_32k.txt", encoding='utf-8') as fp:
    BERTurk_list = [line.strip("\n") for line in fp]

# Skip the leading entries of the vocabulary file
BERTURK = BERTurk_list[1971:] # 1971 found by inspecting the text file    32k
#BERTURK = BERTurk_list[1925:] # 1925 found by inspecting the text file 128k

# Entries containing '#' go to the suffix list, everything else counts as a word
suffix_list = [element for element in BERTURK if '#' in element]
word_list = [element for element in BERTURK if '#' not in element]

word_set_berturk = set(word_list)

In [None]:
# Share of the investigated vocabulary that also appears in the BERTurk word set
investigate_set = union

common_words = word_set_berturk & investigate_set
coverage_percent = 100*len(common_words)/len(investigate_set)
print("There are {} common words ({:.2f}%).".format(len(common_words), coverage_percent))

## C.3) Utterance Length Analysis

In [None]:
# Distribution of the utterance lengths (number of characters per transcription).
utterance_lengths = [len(sentence) for sentence in sentences]

# length -> number of utterances with that length
length_dict = Counter(utterance_lengths)

# Distinct lengths in ascending order; only used for the x-axis range
sorted_items = sorted(length_dict.keys())

# The statistics are taken over ALL utterances. Taking them over the distinct
# length values would weight a length seen once the same as one seen
# thousands of times.
mean = np.mean(utterance_lengths)
var = np.var(utterance_lengths,dtype=np.float64)
dev = np.std(utterance_lengths,dtype=np.float64)
print("There are {} utterances.\n".format(len(sentences)))
print("Mean of the utterance lengths {}".format(mean))
print("Variance: {}".format(var))
print("Standard Deviation: {:.1f}\n".format(dev))

fig, ax = plt.subplots(figsize=(20,8))
ax.bar(list(length_dict.keys()), list(length_dict.values()))
ax.set_ylabel('Number of Senteces',fontsize=14)
ax.set_xlabel('Sentence Length',fontsize=14)
ax.set_title('Sentence Length Distribution',fontsize=14)
ax.set_xlim([0,max(sorted_items)+1])
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
#plt.savefig("Sentece Length Distribution.png")
plt.show()

## C.4) Character Distribution Analysis

In [None]:
# Count every character that occurs in the transcriptions
symbol_counter = Counter()
for sentence in sentences:
    symbol_counter.update(sentence)

print("Including the blank symbol(0), there are {} symbols.".format(len(symbol_counter)+1))

# Alphabet = observed symbols in case-insensitive order ...
ordered_symbols = sorted(symbol_counter, key=str.lower)
alphabet = np.array(ordered_symbols)
# ... with the blank symbol '0' placed in front, i.e. at index 0
alphabet = np.insert(alphabet, 0, '0')
print("\nOur Alphabet:")
print(alphabet)

### Export Alphabet

Use a Unicode encoding so that the Turkish characters are preserved (the export below writes UTF-8).

In [None]:
# Write the alphabet to CSV, one symbol per row, without index or header
export_dir = os.path.join(dataset_dir, new_dataset_name+'_alphabet.csv')
alphabet_frame = pd.DataFrame(alphabet)
alphabet_frame.to_csv(export_dir, index=False, header=False, encoding='utf8')

# D) Encode the sentences

**Check the alphabet, if the blank is included, correct the class ids**

In [None]:
# Encode every transcription as the list of class ids of its symbols.
# A class id is the position of the symbol in `alphabet` (blank is included
# in the alphabet).
#
# The symbol -> id table is built once, so each symbol is a dictionary lookup
# instead of a scan over the whole alphabet array. setdefault keeps the FIRST
# position of a symbol, matching np.where(alphabet==symbol)[0][0].
# A symbol that is missing from the alphabet raises KeyError.
symbol_to_id = {}
for class_id, symbol in enumerate(alphabet):
    symbol_to_id.setdefault(str(symbol), class_id)

sentences_encoded = [
    [symbol_to_id[symbol] for symbol in sentence]
    for sentence in sentences
]
sentence_length = [len(encoded_sentence) for encoded_sentence in sentences_encoded]

df['encoded'] = sentences_encoded
df['sentence_length'] = sentence_length

# E) Training and Test Set Creation

In [None]:
# Parameters of the train/test split
N_batch = 16       # the training set size is trimmed to a multiple of this
split_ratio = 0.8  # train/total

In [None]:
total_samples = df.shape[0]
print("Number of Total Samples: {}\n".format(total_samples))

# Training set size: split_ratio of the data, rounded down to a multiple of N_batch
N_train = int(total_samples*split_ratio)
N_train -= N_train % N_batch

# Random training subset (fixed seed, so the split is repeatable)
df_train = df.sample(n=N_train, random_state=1)
# Every row that was not sampled for training goes to the test set
in_training_set = df.index.isin(df_train.index)
df_test = df.loc[~in_training_set].copy()

print("Number of Training Samples: {}".format(len(df_train)))
N_test = total_samples-N_train
print("Number of Test Samples: {}".format(len(df_test)))

## E.1) Investigate the Test or Training Set

In [None]:
# Choose a Subset and investigate
sentences_invest = df_train['sentence'].tolist()
sentences_invest = df_test['sentence'].tolist()

length_dict = Counter()
for sentence in sentences_invest :

    length = len(sentence)
    
    length_dict[length] += 1
        
sorted_items = sorted(length_dict.keys())
mean = np.mean(sorted_items)
var = np.var(sorted_items,dtype=np.float64)
dev = np.std(sorted_items,dtype=np.float64)
print("There are {} utterances.\n".format(len(sentences_invest)))
print("Mean of the utterance lengths {}".format(mean))
print("Variance: {}".format(var))
print("Standard Deviation: {:.2f}\n".format(dev))

fig, ax = plt.subplots(figsize=(20,8))
ax.bar(length_dict.keys(), length_dict.values())
ax.set_ylabel('Number of Senteces',fontsize=14)
ax.set_xlabel('Sentence Length',fontsize=14)
ax.set_title('Sentence Length Distribution of the Training Set',fontsize=14)
ax.set_xlim([0,max(sorted_items)+1])
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
#plt.savefig("Sentece Length Distribution.png")
plt.show()

## E.2) Create Specific Subsets

### E.2.1) Choose around mean

In [None]:
# Split the length histogram into the part within one standard deviation of
# the mean (kept) and the part outside of it.
lower_bound = mean-dev
upper_bound = mean+dev

shorter_length_dict = {}
outside_dict = {}
for length, count in length_dict.items():
    inside = lower_bound <= length <= upper_bound
    target_dict = shorter_length_dict if inside else outside_dict
    target_dict[length] = count

# Number of utterances that fall inside the band
total = sum(shorter_length_dict.values())

print("There are {} utterances around 1 standard deviation.\n".format(total))


# Kept lengths use the default bar colour, the rest is drawn in red
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(shorter_length_dict.keys(), shorter_length_dict.values())
ax.bar(outside_dict.keys(), outside_dict.values(), color="Red")

#plt.text(60, 750, 'Utterences in the Dataset(Blue+Red): {}\nMean of Utterance Lengths: {}\nStandard Deviation: {:.1f}\nUtterences in the Subset(Blue): {}'\
#         .format(len(sentences),mean,dev,total), fontsize=15,bbox=dict(alpha=0.5))

ax.set_title('Utterance Length Distribution',fontsize=15)
ax.set_xlabel('Utterance Length',fontsize=15)
ax.set_ylabel('Number of Utterances',fontsize=15)
ax.set_xlim([0, max(sorted_items)+1])
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.savefig("Utterance Length Distribution Subset.png")
plt.show()

### E.2.2)  Choose Specific Length

In [None]:
# Split the length histogram into the single selected length (4) and all
# other lengths.
shorter_length_dict = {}
outside_dict = {}
for length, count in length_dict.items():
    target_dict = shorter_length_dict if length == 4 else outside_dict
    target_dict[length] = count

# Number of utterances that have the selected length
total = sum(shorter_length_dict.values())

print("There are {} utterances with given length.\n".format(total))

# Selected length uses the default bar colour, the rest is drawn in red
fig, ax = plt.subplots(figsize=(20,8))
ax.bar(shorter_length_dict.keys(), shorter_length_dict.values())
ax.bar(outside_dict.keys(), outside_dict.values(), color="Red")

ax.set_title('Utterance Length Distribution',fontsize=15)
ax.set_xlabel('Utterance Length',fontsize=15)
ax.set_ylabel('Number of Utterances',fontsize=15)
ax.set_xlim([0, max(sorted_items)+1])
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.savefig("Utterance Length Distribution Subset.png")
plt.show()

# Subset 

In [None]:
# Keep only the utterances that are exactly 4 characters long.
# (alternative criterion: length >= mean+dev or length <= mean-dev)
remove_indices = [
    position
    for position, sentence in enumerate(sentences)
    if len(sentence) != 4
]

# Drop the unwanted rows and renumber the remaining ones from zero
df = df.drop(df.index[remove_indices]).reset_index(drop=True)
print("Subset Selected")
print(len(df))

## Sort the Dataset Based on Utterance Length

In [None]:
# Order all three frames from the shortest to the longest utterance.

# Full dataset: the old index is discarded
df = df.sort_values(by=['sentence_length'], ascending=True).reset_index(drop=True)

# Train/test: the old index is kept as a regular column, so the original row
# of every sample can still be identified
df_train = df_train.sort_values(by=['sentence_length'], ascending=True).reset_index(drop=False)
df_test = df_test.sort_values(by=['sentence_length'], ascending=True).reset_index(drop=False)

# Export new Dataframe

**Use pickle to preserve arrays, csv turns them into strings**

In [None]:
#Türkçe
new_name = new_dataset_name+"_ordered"

df.to_pickle(os.path.join(dataset_dir,new_name+".pkl")) 
df.to_csv(os.path.join(dataset_dir,new_name+".csv"),index=False)

In [None]:
# Export the training and the test set, each as pickle and as CSV
train_set_name = new_name+'_train'
test_set_name = new_name+'_test'

for set_name, frame in ((train_set_name, df_train), (test_set_name, df_test)):
    frame.to_pickle(os.path.join(dataset_dir, set_name+".pkl"))
    frame.to_csv(os.path.join(dataset_dir, set_name+".csv"), index=False)

#ti20
# Export the dataframe for the ti20 split selected by `tt`, as pickle and as CSV.
# Note that this rebinds `new_dataset_name`.
new_dataset_name = dataset_name+'_coded'

export_dir = os.path.join("..","..","Datasets","ti20",tt)

df.to_pickle(os.path.join(export_dir,new_dataset_name+'.pkl')) 
df.to_csv(os.path.join(export_dir,new_dataset_name+'.csv'),index=False)

# NOTE(review): writes the same dataframe once more, into `dataset_dir` under a
# fixed file name — confirm this extra copy is still needed.
df.to_csv(os.path.join(dataset_dir,"validated_cleaned2.csv"),index=False)