In [1]:
import os

# Set GOOGLE_APPLICATION_CREDENTIALS to a JSON file path relative to the
# working directory. Presumably this is the key file that storage.Client()
# picks up further down - TODO confirm it is present before running.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-storage.json'

In [2]:
import re

def cleaning(string):
    """
    Flatten a piece of text into one line that is safe to use as a TSV field.

    Newlines, carriage returns and tabs are replaced by a space, runs of
    spaces are collapsed to a single space, and surrounding whitespace is
    stripped.

    Parameters
    ----------
    string : str
        Raw text, possibly spanning several lines.

    Returns
    -------
    str
        The single-line, whitespace-normalised text.
    """
    # Every caller writes the result into a tab-separated row terminated by
    # '\n', so an embedded tab or line break would add a column or a row.
    string = re.sub(r'[\t\r\n]', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [3]:
from google.cloud import storage
# Create a Cloud Storage client and a handle to the 'mesolitica-general'
# bucket. The later cells upload each generated TSV through `bucket`.
client = storage.Client()
bucket = client.bucket('mesolitica-general')

In [4]:
from glob import glob
from pathlib import Path
import json
import tensorflow as tf

In [5]:
# Collect the '../bert/dumping*' paths, leaving out any path that mentions
# 'insta', 'twitter' or 'combined'.
files = [
    path
    for path in glob('../bert/dumping*')
    if not any(tag in path for tag in ('insta', 'twitter', 'combined'))
]
files

['../bert/dumping-parliament.txt',
 '../bert/dumping-iium.txt',
 '../bert/dumping-wiki.txt',
 '../bert/dumping-news.txt',
 '../bert/dumping-watpadd.txt',
 '../bert/dumping-pdf.txt']

In [6]:
# Turn every selected dump into a TSV whose first column is empty and whose
# second column is the cleaned line, then upload it under 't5-data/'.
for file in files:
    with open(file) as fopen:
        # Empty lines are dropped; every remaining line becomes one row.
        data = [line for line in fopen.read().split('\n') if line]
    print(file, len(data))
    filename = Path(file).stem + '.tsv'
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for line in data:
            outfile.write('\t' + cleaning(line) + '\n')

    bucket.blob('t5-data/' + filename).upload_from_filename(filename)

../bert/dumping-parliament.txt 890823
../bert/dumping-iium.txt 1121978
../bert/dumping-wiki.txt 1715551
../bert/dumping-news.txt 1791784
../bert/dumping-watpadd.txt 1445379
../bert/dumping-pdf.txt 596417


In [7]:
# Build one row per paragraph for every selected dump: consecutive non-empty
# lines are joined with '. ', and blank lines separate paragraphs. Each row
# is written as "<cleaned paragraph>\t" (empty second column) and the file
# is uploaded under 't5-data/'.
for file in files:
    with open(file) as fopen:
        data = fopen.read().split('\n')
    results, result = [], []
    for line in data:
        if line:
            result.append(line)
        elif result:
            # A blank line closes the paragraph collected so far.
            results.append('. '.join(result))
            result = []
        # A blank line with nothing collected (several blanks in a row) is
        # ignored, so no empty fragment leaks into the next paragraph.
    # Flush the last paragraph when the file does not end with a blank line.
    if result:
        results.append('. '.join(result))
    print(file, len(results))
    s = Path(file).stem
    filename = f'{s}-pair.tsv'

    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for paragraph in results:
            outfile.write('%s\t\n' % (cleaning(paragraph)))

    blob = bucket.blob(f't5-data/{filename}')
    blob.upload_from_filename(filename)

../bert/dumping-parliament.txt 70046
../bert/dumping-iium.txt 17837
../bert/dumping-wiki.txt 326735
../bert/dumping-news.txt 105469
../bert/dumping-watpadd.txt 55091
../bert/dumping-pdf.txt 55538


In [9]:
# Export 'stemming.json' as TSV: element [1] of every entry goes into the
# first column and element [0] into the second. The file is then uploaded.
with open('stemming.json') as fopen:
    data = json.load(fopen)

with tf.io.gfile.GFile('stemming.tsv', "w") as outfile:
    for entry in data:
        first_column = cleaning(entry[1])
        second_column = cleaning(entry[0])
        outfile.write(f'{first_column}\t{second_column}\n')

bucket.blob('t5-data/stemming.tsv').upload_from_filename('stemming.tsv')

In [10]:
# Export 'synonyms.json' as TSV with the two elements of every entry
# swapped (index 1 first, index 0 second), then upload the file.
with open('synonyms.json') as fopen:
    data = json.load(fopen)

with tf.io.gfile.GFile('synonyms.tsv', "w") as outfile:
    for entry in data:
        columns = (cleaning(entry[1]), cleaning(entry[0]))
        outfile.write('\t'.join(columns) + '\n')

bucket.blob('t5-data/synonyms.tsv').upload_from_filename('synonyms.tsv')

In [11]:
# Convert every quora JSON file into 'quora-<stem>.tsv' and upload it.
# Each usable entry is [text, label] where text holds two questions
# separated by ' <> ' and label is a key of `labels`.
files = glob('../malaya-dataset/Malaya-Dataset/text-similarity/quora/*.json')
labels = {0: 'tak sama', 1: 'sama'}

for file in files:
    with open(file) as fopen:
        data = json.load(fopen)
    print(file, len(data))
    filename = 'quora-%s.tsv' % Path(file).stem
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for entry in data:
            # Only two-element entries are usable.
            if len(entry) == 2:
                label = labels[entry[1]]
                pair = entry[0].split(' <> ')
                # Entries that do not split into exactly two questions are skipped.
                if len(pair) == 2:
                    first, second = cleaning(pair[0]), cleaning(pair[1])
                    outfile.write(f'soalan1: {first} soalan2: {second}\t{label}\n')

    bucket.blob('t5-data/' + filename).upload_from_filename(filename)

../malaya-dataset/Malaya-Dataset/text-similarity/quora/0-100k.json 99993
../malaya-dataset/Malaya-Dataset/text-similarity/quora/400k-500k.json 4290
../malaya-dataset/Malaya-Dataset/text-similarity/quora/300k-400k.json 99995
../malaya-dataset/Malaya-Dataset/text-similarity/quora/200k-300k.json 99992
../malaya-dataset/Malaya-Dataset/text-similarity/quora/100k-200k.json 100000


In [12]:
# Convert every snli JSON file into 'snli-<stem>.tsv' and upload it.
# Each usable entry is [label, text] where text holds two sentences
# separated by ' <> '. Labels missing from `labels` are written unchanged.
labels = {'contradiction': 'percanggahan', 'entailment': 'berkait'}

files = glob('../malaya-dataset/Malaya-Dataset/text-similarity/snli/*.json')

for file in files:
    with open(file) as fopen:
        data = json.load(fopen)
    print(file, len(data))

    s = Path(file).stem
    filename = f'snli-{s}.tsv'
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for row in data:
            # Only two-element entries are usable.
            if len(row) != 2:
                continue

            label = labels.get(row[0], row[0])
            splitted = row[1].split(' <> ')
            # Entries that do not split into exactly two sentences are skipped.
            if len(splitted) != 2:
                continue
            # Both sentences go through cleaning(): a raw line break in the
            # second sentence would otherwise split the TSV row.
            q = f'ayat1: {cleaning(splitted[0])} ayat2: {cleaning(splitted[1])}'
            outfile.write('%s\t%s\n' % (q, label))

    blob = bucket.blob(f't5-data/{filename}')
    blob.upload_from_filename(filename)

../malaya-dataset/Malaya-Dataset/text-similarity/snli/pary7.json 30934
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part2.json 49998
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part1.json 50000
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part4.json 50000
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part6.json 100000
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part3.json 50000
../malaya-dataset/Malaya-Dataset/text-similarity/snli/part5.json 49998


In [13]:
# Split every validation row into a question and an answer, write them as
# 'qa-validation.tsv' and upload the file.
with open('translated-validation.json') as fopen:
    data = json.load(fopen)

questions, answers = [], []
for row in data:
    if '<>' in row:
        # Explicit separator between question and answer.
        q, a = row.split('<>')
    else:
        # No separator: split after the question mark and put the mark back.
        q, a = row.split('? ')
        q = q + '?'
    questions.append(q.strip())
    answers.append(a.strip())

with tf.io.gfile.GFile('qa-validation.tsv', "w") as outfile:
    for question, answer in zip(questions, answers):
        outfile.write(f'{cleaning(question)}\t{cleaning(answer)}\n')

bucket.blob('t5-data/qa-validation.tsv').upload_from_filename('qa-validation.tsv')

In [14]:
# Split every training row into a question and an answer, write them as
# 'qa-train.tsv' and upload the file. Rows that cannot be split into exactly
# one question and one answer are skipped and counted.
with open('translated-train.json') as fopen:
    data = json.load(fopen)

questions, answers = [], []
skipped = 0
for row in data:
    try:
        if '<>' not in row:
            # No separator: split after the question mark and put the mark back.
            q, a = row.split('? ')
            q = f'{q}?'
        else:
            q, a = row.split('<>')
    except (ValueError, TypeError, AttributeError):
        # ValueError: the split did not yield exactly two parts.
        # TypeError / AttributeError: the row is not a string.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        skipped += 1
    else:
        questions.append(q.strip())
        answers.append(a.strip())

print('rows kept:', len(questions), 'rows skipped:', skipped)

with tf.io.gfile.GFile('qa-train.tsv', "w") as outfile:
    for question, answer in zip(questions, answers):
        outfile.write("%s\t%s\n" % (cleaning(question), cleaning(answer)))

blob = bucket.blob('t5-data/qa-train.tsv')
blob.upload_from_filename('qa-train.tsv')

In [15]:
# !rm -rf __MACOSX/news
# !rm -rf news
# !wget https://huseinhouse-data.s3-ap-southeast-1.amazonaws.com/news.zip
# !unzip news.zip

In [16]:
def filter_news(string):
    """
    Report whether a text contains one of the unwanted marker phrases.

    The text is lower-cased first, so the match is case-insensitive.

    Parameters
    ----------
    string : str
        Text to inspect.

    Returns
    -------
    bool
        True when any marker phrase occurs in the text; the news cells in
        this notebook keep only texts for which this is False.
    """
    lowered = string.lower()
    markers = (
        'javascript is disabled',
        'requires javascript',
        'javascript',
        'président',
    )
    return any(marker in lowered for marker in markers)

In [None]:
# Collect (cleaned text, cleaned title) pairs from every 'news/*.json' file,
# write them as 'news-title.tsv' and upload the file.
news = glob('news/*.json')

results = []
for n in news:
    with open(n) as fopen:
        data = json.load(fopen)
    for article in data:
        # Skip texts flagged by filter_news.
        if filter_news(article['text']):
            continue
        # Skip articles whose language is 'ENGLISH'.
        if article['language'] == 'ENGLISH':
            continue
        # Skip articles with an empty text or an empty title.
        if not len(article['text']) or not len(article['title']):
            continue
        results.append((cleaning(article['text']), cleaning(article['title'])))

with tf.io.gfile.GFile('news-title.tsv', "w") as outfile:
    for text, title in results:
        outfile.write(f'{text}\t{title}\n')

bucket.blob('t5-data/news-title.tsv').upload_from_filename('news-title.tsv')

In [None]:
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/news-30k/news-30k.json.zip
# !unzip news-30k.json.zip

In [None]:
# Collect (cleaned text, cleaned title) pairs from 'news-30k.json', write
# them as 'news-title2.tsv' and upload the file.
with open('news-30k.json') as fopen:
    data = json.load(fopen)

results = []
for article in data:
    # Skip texts flagged by filter_news.
    if filter_news(article['text']):
        continue
    # Keep only articles whose language is 'malay'.
    if article['language'] != 'malay':
        continue
    # Skip articles with an empty text or an empty title.
    if not len(article['text']) or not len(article['title']):
        continue
    results.append((cleaning(article['text']), cleaning(article['title'])))

with tf.io.gfile.GFile('news-title2.tsv', "w") as outfile:
    for text, title in results:
        outfile.write(f'{text}\t{title}\n')

bucket.blob('t5-data/news-title2.tsv').upload_from_filename('news-title2.tsv')