# Corpus Former
This notebook builds a text corpus from a Wikipedia dump and from the datasets that are used for training the network.

In [1]:
import os
import sys
import re
import string
import json
from collections import Counter

import pandas as pd
import numpy as np

sys.path.append('../utilities')
import text_format

In [2]:
# Load the alphabet: the symbols are taken from the first column of the dataset's CSV
dataset_name = "METUbet"
dataset_dir = os.path.join("..", "data", "Datasets", dataset_name, "data")  # root directory of the dataset

# NB: despite its name, alphabet_dir is the path of the CSV file itself
alphabet_dir = os.path.join(dataset_dir, 'METUbet_alphabet.csv')
alphabet_original = pd.read_csv(
    alphabet_dir,
    delimiter=",",
    header=None,
    encoding='utf8',
    skip_blank_lines=False,
)[0].tolist()

# Discard the first entry of the column (presumably a header/placeholder row — TODO confirm)
del alphabet_original[0]

print("-" * 80)
print(alphabet_original)

--------------------------------------------------------------------------------
[' ', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'y', 'z', 'ç', 'ö', 'ü', 'ğ', 'ı', 'ş']


In [5]:
# Folder used for the wiki dump input and the exported sentence/word files
lm_work_directory = os.path.join("..", "data", "language model work")

In [4]:
# Load the whole wiki dump into memory as a single string
wiki_dump_path = os.path.join(lm_work_directory, "wiki_00.txt")
with open(wiki_dump_path, 'r', encoding='utf-8') as infile:
    wiki_dump = infile.read()

# Record the (start, end) offsets of every <...> tag; the text of an
# article is whatever sits between two consecutive tags
boundaries = [(tag.start(), tag.end()) for tag in re.finditer(r'<.+?>', wiki_dump)]

# Slice the text between the end of one tag and the start of the next one
articles = []
for (_, previous_end), (next_start, _) in zip(boundaries, boundaries[1:]):
    candidate = wiki_dump[previous_end:next_start]
    if len(candidate) > 2:  # ignore fragments of at most 2 characters
        articles.append(candidate)

In [None]:
# Run every extracted article through text_format.clean_text
clean_articles = list(map(text_format.clean_text, articles))

In [None]:
# Separate the cleaned articles into sentences.
# Each article is split on ". " and its pieces are handed to
# text_format.filter_sentence_list together with the sentences collected so
# far; the value it returns becomes the new running list.
clean_wiki_sentences = []
for cleaned_article in clean_articles:
    clean_wiki_sentences = text_format.filter_sentence_list(
        cleaned_article.split(". "),
        clean_wiki_sentences,
    )

print("There are {} acceptable sentences.".format(len(clean_wiki_sentences)))

In [None]:
# Word analysis of the accepted wiki sentences
word_counter = text_format.analyze_words(clean_wiki_sentences)

# Symbol analysis of the same sentences against the dataset alphabet
# (presumably reports the characters that are not in the alphabet — confirm in text_format)
bad_characters, bad_chars_str = text_format.analyze_symbols(
    clean_wiki_sentences,
    alphabet_original,
)

# Keep a copy of the reported character string on disk for inspection
with open("bad_chars.txt", 'w', encoding="utf8") as bad_chars_file:
    bad_chars_file.write(bad_chars_str)

In [None]:
# Clean from bad characters twice
#
# bad_chars_str is placed inside a regex character class, so it has to be
# escaped first: an unescaped ']', '\', '^' or '-' would otherwise change or
# break the class. An empty string would produce the invalid class "[]", so in
# that case there is nothing to strip and the sentences are passed through.
# NOTE(review): assumes bad_chars_str is a plain, not already escaped, string of
# characters — confirm against text_format.analyze_symbols.
if bad_chars_str:
    bad_char_class = "[{}]".format(re.escape(bad_chars_str))

    # pattern1: whitespace, some text, one or more bad characters, some text, whitespace
    pattern1 = r"\s.+?{}+?.+?\s".format(bad_char_class)
    # pattern2: a bad character followed by text up to the next whitespace
    pattern2 = r"{}.+?\s".format(bad_char_class)

    # Compile once instead of handing the pattern strings to re.sub per sentence
    first_pass = re.compile(pattern1)
    second_pass = re.compile(pattern2)

    super_diminished_sentences = [
        second_pass.sub(" ", first_pass.sub(" ", sentence))
        for sentence in clean_wiki_sentences
    ]
else:
    pattern1 = pattern2 = None
    super_diminished_sentences = list(clean_wiki_sentences)

In [None]:
# Repeat the symbol analysis on the sentences produced by the regex clean-up
bad_characters2, bad_chars_str2 = text_format.analyze_symbols(
    super_diminished_sentences,
    alphabet_original,
)

In [None]:
# Only take the sentences made up entirely of alphabet characters.
# A set gives constant-time membership checks; the list version scanned the
# whole alphabet for every single character of every sentence.
allowed_characters = set(alphabet_original)

# Keep a sentence when it is longer than 5 characters and none of its
# characters falls outside the alphabet
remaining_sentences = [
    sentence
    for sentence in super_diminished_sentences
    if len(sentence) > 5 and allowed_characters.issuperset(sentence)
]

print("There are {} remaining sentences.".format(len(remaining_sentences)))

In [None]:
# Word analysis of the wiki sentences that survived the alphabet filter;
# the result is dumped to JSON and merged with the dataset word dict later on
wiki_word_dict = text_format.analyze_words(
    remaining_sentences
)

In [None]:
# Export the remaining wiki sentences (one per line) and their word dict
project_name = "wiki_good_clean"
sent_list = remaining_sentences

# Sentence list -> <lm_work_directory>/<project_name>.txt
txt_path = os.path.join(lm_work_directory, "{}.txt".format(project_name))
with open(txt_path, 'w', encoding="utf8") as f:
    f.writelines(sentence + '\n' for sentence in sent_list)

# Word dict -> <lm_work_directory>/<project_name>-word_dict.json
json_path = os.path.join(lm_work_directory, "{}-word_dict.json".format(project_name))
with open(json_path, 'w', encoding='utf-8') as outfile:
    json.dump(wiki_word_dict, outfile, ensure_ascii=False, indent=4)

## Now read the sentences from the datasets

In [None]:
def read_and_prepare_datasets():
    """Load, clean and filter the Mozilla Common Voice and METUbet sentence files.

    For each dataset the sentence count is printed before and after cleaning.

    Returns:
        tuple: ``(mozilla_sentences, metu_sentences)``, each being the value
        returned by ``text_format.filter_sentence_list`` for that dataset.
    """

    def _load_clean_sentences(txt_dir, initial_prefix=""):
        """Read one sentence file, clean every sentence and filter the result."""
        sentences = text_format.read_txt(txt_dir, "utf-8")
        print("{}Initially: {} sentences".format(initial_prefix, len(sentences)))

        # Apply clean_text as a transformation. Passing it to filter() would
        # only use it as a keep/drop predicate and leave the text uncleaned.
        # Sentences whose cleaned form is empty are still dropped.
        # NOTE(review): assumes clean_text returns the cleaned string, as its
        # use on the wiki articles suggests — confirm in text_format.
        cleaned = (text_format.clean_text(sentence) for sentence in sentences)
        sentences = [sentence for sentence in cleaned if sentence]

        sentences = text_format.filter_sentence_list(sentences, clean_sentences=[], bound=0)
        print("After Cleaning: {} sentences".format(len(sentences)))
        return sentences

    mozilla_txt = os.path.join(
        "..", 'data', "Datasets", "cv-corpus-5.1-2020-06-22", "tr",
        "cv-corpus-5.1-2020-06-22_validated_simple_sentences.txt",
    )
    mozilla_sentences = _load_clean_sentences(mozilla_txt)

    metu_txt = os.path.join('..', 'data', "Datasets", "METUbet", "data", "METUbet_sentences.txt")
    # The blank line separates the second dataset's report from the first
    metu_sentences = _load_clean_sentences(metu_txt, initial_prefix="\n")

    return mozilla_sentences, metu_sentences


In [None]:
# Read and clean the two existing training datasets
(mozilla_sentences,
 metu_sentences) = read_and_prepare_datasets()

In [None]:
# Write each cleaned sentence list to its own text file (one sentence per
# line) inside the language-model work directory
for txt_name, dataset_sentences in (
    ("Mozilla_sentences.txt", mozilla_sentences),
    ("METUbet_sentences.txt", metu_sentences),
):
    txt_path = os.path.join(lm_work_directory, txt_name)
    with open(txt_path, 'w', encoding="utf8") as f:
        f.writelines(sentence + '\n' for sentence in dataset_sentences)

In [None]:
# Merge the sentence lists of the two existing datasets
NN_datasets_sentences = text_format.merge_sentence_lists(
    [mozilla_sentences, metu_sentences]
)

# Word analysis of the merged dataset sentences
NN_word_dict = text_format.analyze_words(NN_datasets_sentences)

In [None]:
# Export the word dict, the word list and the merged sentences of the datasets.
# NOTE(review): these three files are written relative to the current working
# directory, not into lm_work_directory like the wiki/Mozilla/METUbet exports —
# confirm that this is intended.
dataset_tag = "NN_datasets"

# Word dict as JSON
json_name = "{}-word_dict.json".format(dataset_tag)
with open(json_name, 'w', encoding='utf-8') as outfile:
    json.dump(NN_word_dict, outfile, ensure_ascii=False, indent=4)

# Keys of the word dict, one per line
text_name = "{}-word_list.txt".format(dataset_tag)
with open(text_name, 'w', encoding="utf8") as f:
    f.writelines(word + '\n' for word in NN_word_dict)

# Merged sentences, one per line
txt_name = "{}_sentences.txt".format(dataset_tag)
with open(txt_name, 'w', encoding="utf8") as f:
    f.writelines(sentence + '\n' for sentence in NN_datasets_sentences)

**Merge the Neural Network Training Sets and the cleaned Wikipedia Dump**

In [None]:
# Union of the two word dicts; for a word present in both, the wiki entry replaces the dataset entry.
# NOTE(review): if the values are word counts, shared words keep only the wiki
# count instead of the sum — confirm that this is intended.
total_word_dict = dict(NN_word_dict)
total_word_dict.update(wiki_word_dict)

# Merge the dataset sentences with the remaining wiki sentences
total_sentences = text_format.merge_sentence_lists(
    [NN_datasets_sentences, remaining_sentences]
)

# Pass the merged list through the sentence filter and report the size.
# NOTE(review): `filtered` is only used in this cell; the export cell writes
# `total_sentences` — verify which of the two should be exported.
filtered = text_format.filter_sentence_list(total_sentences, clean_sentences=[])
print("There are {} sentences in the merged set.".format(len(filtered)))

# Symbol analysis of the filtered sentences against the alphabet
bad_characters3, bad_chars_str3 = text_format.analyze_symbols(filtered, alphabet_original)

**Dump the merged sentences and the merged word dict**

In [None]:
# Export the merged (datasets + wiki) sentences, one per line
txt_name = "LM_sentences.txt"
with open(txt_name, 'w', encoding="utf8") as f:
    f.writelines(sentence + '\n' for sentence in total_sentences)

# Export the merged word dict as JSON
json_name = "LM-word_dict.json"
with open(json_name, 'w', encoding='utf-8') as outfile:
    json.dump(total_word_dict, outfile, ensure_ascii=False, indent=4)

# For personal wiki scrapes

In [None]:
# The helpers live in the text_format module, which is imported as a module
# only, so they must be called through it; the bare names raise NameError on a
# fresh kernel. The arguments follow the other calls in this notebook:
# an explicit "utf-8" encoding for read_txt and a fresh clean_sentences list.
wiki_500 = text_format.read_txt("Wiki_sentences_500pages.txt", "utf-8")
print("{} sentences".format(len(wiki_500)))

# Clean every scraped sentence
wiki_500_cleaned = [text_format.clean_text(text) for text in wiki_500]

# Filter the cleaned sentences and report how many are left
wiki_500_filtered = text_format.filter_sentence_list(wiki_500_cleaned, clean_sentences=[])
print("{} sentences".format(len(wiki_500_filtered)))

In [None]:
# Merge the dataset sentences with the personal wiki scrape and the wiki dump.
# NOTE(review): the original call used three names that are not defined
# anywhere in this notebook:
#   * merge_wiki_scrapes  -> replaced by text_format.merge_sentence_lists, the
#     merge helper used in the other cells (confirm it is the intended one);
#   * wiki500_cleaned     -> typo for wiki_500_cleaned;
#   * wiki_dump_sentences -> replaced by remaining_sentences, the cleaned wiki
#     dump sentences built earlier (confirm).
total_sentences = text_format.merge_sentence_lists(
    [mozilla_sentences, metu_sentences, wiki_500_cleaned, remaining_sentences]
)
print(len(total_sentences))