In [15]:
import json
import re
from itertools import groupby 
from string import punctuation
import random
import os

from tqdm import tqdm
import pandas as pd


In [16]:
input_file_path = 'output/0-youtube-sentences.csv'
output_dir = 'output'

In [34]:
pd.read_csv(input_file_path).shape

(16347, 2)

In [17]:
VALID_BANGLA_REGEX = re.compile(r'[^\u0980-\u09FF ।,?!\(\)-.”“/;"‘:\']+')
SPLIT_REGEX = re.compile(r'\n|।|\?')
SPLIT_BY_NEWLINE_REGEX = re.compile(r'\n')
EMOJI_REGEX = re.compile(
    "(["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "])"
    )


def is_all_bangla(text, allowed_threshold=96.9):
    """
    Allows some non-bangla characters (excluding punctuations) and 
    finds out if contains greater than <percent> characters are from Bangla language
    """
    valid_bangla_text = VALID_BANGLA_REGEX.sub('', text)
    valid_bangla_text_len = len(valid_bangla_text)
    all_text_len = len(text)
    
    percent_bangla = (valid_bangla_text_len*100.0) / all_text_len

    if percent_bangla < allowed_threshold:
        return False
    return True
    
    
    
def is_valid(text):
    l= len(text)
    if l<MIN_CHAR_LEN or l>MAX_CHAR_LEN:
        return False
    if len(text.split())<MIN_WORD_LEN:
        return False
    if not is_all_bangla(text):
        return False
    
    return True

def split_sentences(text):
    return SPLIT_REGEX.split(text)


def split_sentences_from_paragraphs(paragraphs):
    sentences = []
    for p in paragraphs:
        sentences.append(SPLIT_REGEX.split(p))
    return sentences


def split_paragraphs(text):
    return SPLIT_BY_NEWLINE_REGEX.split(text)


def remove_repeating_puncts(texts, punctuations=set(punctuation+ '।\n')):
    # ref: https://stackoverflow.com/a/32485876
    newtext = []
    for k, g in groupby(texts):
        if k in punctuations:
            newtext.append(k)
        else:
            newtext.extend(g)
    return ''.join(newtext)


def remove_emoticons(text):
    # Ref: https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1#gistcomment-3208085
    # Ref: https://en.wikipedia.org/wiki/Unicode_block
    text = re.sub(EMOJI_REGEX, r' ', text)
    return text


def remove_non_bangla_characters(text):
    if isinstance(text, str):
        cleaned = VALID_BANGLA_REGEX.sub(' ', text)
        return cleaned
    elif isinstance(text, list):
        results = []
        for item in text:
            cleaned = VALID_BANGLA_REGEX.sub(' ', item)
            results.append(cleaned)
        return results
    else:
        raise Exception('invalid param')
        
    
def is_poem(paragraphs):
    """Find out if blog article is poem or not, by analyzing avg. paragraph length"""
    
    total_len = 0
    for p in paragraphs:
        if not isinstance(p, str):
            print('error ', p)
        total_len += len(p)
    avg_para_len = total_len / len(paragraphs)

    if avg_para_len < 100:
        return True
    else:
        return False

In [18]:
input_df = pd.read_csv(input_file_path)
input_df.columns= ["text","ID"]
input_df.head()

Unnamed: 0,text,ID
0,এই পুলিশকে ও এবাবে মেরে ফেলে উচিৎ,UgxyQWOAsyvXxVvxp-Z4AaABAg
1,অমেরিকা উন্নত দেশ হলে..অমরিকান লোক সবচেয়ে বেশি...,UgxXKy8QwTW7ER91jgR4AaABAg
2,কুত্তার জাতী আমেরিকা,UgzaZ-h6vU-Exk4ApIt4AaABAg
3,বাংলাদেশের পুলিশের চেয়েও খারাপ,UgzHzIQGAmvmrzedamx4AaABAg
4,মানবাধিকার সংগঠনগুলো এখন কোথায় 🤔🤔🤔🤔🤔🤔,Ugz1L1YA3i3-3igEWhh4AaABAg


In [21]:
all_chars = []
char_map = {}
count = 0
count_not_bangla = 0

filtered_texts = []
for idx, row in tqdm(input_df.iterrows()):
    text = row['text']
    if not isinstance(text, str):
        continue

    text = remove_repeating_puncts(text)
    text = remove_emoticons(text)
    if not is_all_bangla(text):
        count_not_bangla += 1
        continue

    paragraphs = split_paragraphs(text)

#     if is_poem(paragraphs):
#         print("Found poem: ", row['ID'])
#         count += 1
#         continue
    
    filtered_texts.append((text, row['ID']))
    
print(count)
print(count_not_bangla)

16347it [00:01, 9055.78it/s]

0
7463





In [36]:
len(filtered_texts)

8884

In [None]:
# i = 0
# for f in filtered_texts[20000:]:
#     i+=1
#     if i>20:
#         break
#     print(f)

# Extract documents

In [23]:
paragraphs_for_next_step = []
documents = []
need_document = 18000

rejected_count = 0
for text, url in tqdm(filtered_texts):
    # if already found n documents needed, break
    if len(documents) >= need_document:
        break
        
    if not isinstance(text, str):
        continue
        
    paragraphs = split_paragraphs(text)
    cleaned_paragraphs = remove_non_bangla_characters(paragraphs)
    string = " ".join(cleaned_paragraphs)
    
    if len(paragraphs) < 3:
        rejected_count += 1
        continue
        
    if len(string) < 800: # character count
        rejected_count += 1
        continue
    
    # trim documents with more than 10 paragraphs 
    if len(paragraphs) > 10:
        no_of_paragraphs = random.randint(3, 10)
        cleaned_paragraphs = cleaned_paragraphs[:no_of_paragraphs]
        cleaned_text = '\n'.join(cleaned_paragraphs)
        
        words = cleaned_text.split()
        documents.append({'text': cleaned_text, 'url':url, 'para_count': no_of_paragraphs, 'word_count': len(words)})
        
        for p in paragraphs[:no_of_paragraphs]:
            paragraphs_for_next_step.append((p, url))
    else:
        cleaned_text = '\n'.join(cleaned_paragraphs)
        words = cleaned_text.split()
        documents.append({'text': cleaned_text, 'url':url, 'para_count': len(paragraphs), 'word_count': len(words)})
        

print('Rejected documents: ', rejected_count)
print("Got clean documents: ", len(documents))
print("Left paragraph for next: ", len(paragraphs_for_next_step))

100%|██████████| 8884/8884 [00:00<00:00, 267542.12it/s]

Rejected documents:  8866
Got clean documents:  18
Left paragraph for next:  45





## Check stats and export Document CSV

In [None]:
document_df = pd.DataFrame(documents)
document_df.describe()

In [None]:
# filter lower word count documents, if some slipped through
document_df = document_df[document_df['word_count'] > 40]

# drop some columns to match with our standard CSV format
document_df = document_df.drop('para_count', axis=1)
document_df = document_df.drop('word_count', axis=1)
document_df['source'] = 'blog'

In [None]:
document_df.to_csv(os.path.join('output', '1-blog-documents.csv'), index=None)

# Extract Paragraphs

In [None]:
sentences_for_next_step = []
paragraphs = []

rejected = 0
for text, url in tqdm(paragraphs_for_next_step):
    if not isinstance(text, str):
        continue
        
    sentences = split_sentences(text)
    sentences = remove_non_bangla_characters(sentences)
    
    if len(text) < 200:
        for s in sentences:
            sentences_for_next_step.append((s, url))
        rejected += 1
        continue
        
    if len(sentences) < 3 or len(sentences) > 15:
        for s in sentences:
            sentences_for_next_step.append((s, url))
        rejected += 1
        continue    
    
    cleaned_text = " ".join(sentences)
    word_count = len(cleaned_text.split())
    paragraphs.append({'text': cleaned_text, 'url': url, 'word_count': word_count, 'sent_count': len(sentences) })
    
print('Rejected paragraphs: ', rejected)

## Check stats and export Paragraph CSV

In [None]:
# check stats
para_df = pd.DataFrame(paragraphs)
para_df.describe()

In [None]:
# filter lower word count documents, if some slipped through
# para_df = para_df[para_df['word_count'] > 20]

# drop some columns to match with our standard CSV format
para_df = para_df.drop('sent_count', axis=1)
para_df = para_df.drop('word_count', axis=1)
para_df['source'] = 'blog'

para_df.to_csv(os.path.join(output_dir, '1-blog-paragraphs.csv'), index=None)

# Extract sentences

In [37]:
rejected = 0
sentences = []
for s, url in tqdm(filtered_texts):
    if not isinstance(s, str):
        continue
    s = remove_non_bangla_characters(s)
    if len(s) < 20:
        rejected += 1
        continue
    
    words = s.split()
    if len(words) < 3 and len(words) > 15:
        rejected += 1
        continue
    
    sentences.append({'text': s, 'word_count': len(words), 'char_count': len(s)})
    
print("Rejected sentences: ", rejected)

100%|██████████| 8884/8884 [00:00<00:00, 211677.34it/s]

Rejected sentences:  1357





In [40]:
len(sentences)

7527

## Check stats and export sentenc CSVs

In [41]:
sentence_df = pd.DataFrame(sentences)
sentence_df.describe()

Unnamed: 0,char_count,word_count
count,7527.0,7527.0
mean,86.533812,14.987512
std,117.736619,19.418516
min,20.0,0.0
25%,40.0,7.0
50%,63.0,11.0
75%,100.0,17.0
max,3315.0,528.0


In [42]:
# filter lower word count sentences, if some slipped through
sentence_df = sentence_df[sentence_df['word_count'] > 3]

# drop some columns to match with our standard CSV format
sentence_df = sentence_df.drop('char_count', axis=1)
sentence_df = sentence_df.drop('word_count', axis=1)
sentence_df['source'] = 'youtube'

sentence_df.to_csv(os.path.join(output_dir, '1-youtube-sentences.csv'), index=None)

In [44]:
pd.read_csv("output/1-youtube-sentences.csv").head()

Unnamed: 0,text,source
0,এই পুলিশকে ও এবাবে মেরে ফেলে উচিৎ,youtube
1,অমেরিকা উন্নত দেশ হলে.অমরিকান লোক সবচেয়ে বেশি ...,youtube
2,বাংলাদেশের পুলিশের চেয়েও খারাপ,youtube
3,মানবাধিকার সংগঠনগুলো এখন কোথায়,youtube
4,যাক তা হলে পৃথিবীতে বাংলাদেশের পুলিশের চেয়ে খা...,youtube
