## Load the Data

In [None]:
# List the contents of the input directory to see which files are available.
!ls ../input/

In [None]:
import pandas as pd

# Read the scraped articles into a DataFrame; the file is decoded as UTF-8
# and no CSV column is used as the index.
data = pd.read_csv(
    '../input/bdnews24.csv',
    encoding='utf-8',
    index_col=None,
)

In [None]:
# Show the loaded frame as the cell output for a quick visual sanity check.
data

## Data Cleaning
Upon inspection, it looks like the text contains non-Bengali characters (English, Arabic, etc.). Let's get rid of every character that is not Bengali.

In [None]:
import re

# Whitelist applied first; every character outside these ranges is deleted.
#   U+0964-U+0965  danda / double danda (the "dari" lives in the Devanagari block)
#   U+0980-U+09FF  Bengali block
#   U+0020-U+007E  printable Basic Latin -- kept at this stage so that HTML tags
#                  can still be matched as a whole before Latin letters are dropped
#   U+2000-U+206F  General Punctuation (curly quotes, dashes, ellipsis, ZWJ/ZWNJ, ...)
# You can explore the Unicode blocks here: https://jrgraphix.net/r/Unicode/
_NOT_ALLOWED = re.compile(r'[^\u0964\u0965\u0980-\u09FF\u0020-\u007E\u2000-\u206F]')
_WHITESPACE = re.compile(r'\s')
_HTML_TAG = re.compile(r'<.*?>')
_LATIN_ALNUM = re.compile(r'[a-zA-Z0-9]')
_EMPTY_PARENS = re.compile(r'\(\s*\)')
_DARI_RUN = re.compile(r'।+')
_SPACE_RUN = re.compile(r' +')

# Leftover symbols that carry no meaning once the Latin text is gone; each one
# is replaced by a space so that the words on either side are not fused.
# ('∗' is U+2217, which the whitelist above already deletes; it is listed only
# for completeness.)
_JUNK_TO_SPACE = str.maketrans(dict.fromkeys(
    ('/', ';', '—', '=', '%', '>', '<', '_', '…', '–', '*', '~',
     '}', '{', '\\', '[', ']', '#', '+', '∗', '&', '|', '`', '@',
     '^', '$', '•'),
    ' ',
))

# Curly quotes -> ASCII quotes. The doubled single quotes must be handled
# first: some articles write ‘‘...’’ instead of “...”.
_QUOTE_REPLACEMENTS = (
    ('‘‘', '"'),
    ('’’', '"'),
    ('‘', "'"),
    ('’', "'"),
    ('“', '"'),
    ('”', '"'),
)


def clean(txt):
    """Strip everything that is not Bengali text from an article.

    The goal is a corpus for a Bengali language model, so emojis, Arabic,
    other Indic scripts, HTML tags, Latin letters, ASCII digits and assorted
    leftover symbols are all removed. Some articles will make less sense
    without their English part; that trade-off is accepted.

    Args:
        txt (str): Raw article text.

    Returns:
        str: The cleaned text, with runs of spaces collapsed, runs of dari
        reduced to a single dari followed by one space, and no leading or
        trailing whitespace.
    """
    # Newlines, tabs, NBSP etc. are outside the whitelist; turn them into plain
    # spaces first so that deleting them does not glue neighbouring words together.
    txt = _WHITESPACE.sub(' ', txt)

    # Drop emojis, Arabic, other scripts and any other non-whitelisted character.
    txt = _NOT_ALLOWED.sub('', txt)

    # HTML tags have to go before the Latin letters do, otherwise fragments
    # such as '<> </>' would be left behind.
    txt = _HTML_TAG.sub('', txt)
    txt = _LATIN_ALNUM.sub('', txt)

    txt = txt.translate(_JUNK_TO_SPACE)

    # Zero-width non-joiner / joiner show up as raw code points in the samples.
    txt = txt.replace('\u200c', '').replace('\u200d', '')

    for old, new in _QUOTE_REPLACEMENTS:
        txt = txt.replace(old, new)

    # Remove dots before looking for empty brackets so that '(.)' is caught too.
    txt = txt.replace('.', '')

    # Removing the Latin text leaves empty brackets behind, e.g.
    #   কায়রোর আরবী নাম কিন্তু আসলে এল কাহিরাহ ()। কাহিরাহ শব্দের অর্থ ...
    # Only brackets that contain nothing but whitespace are removed; repeat
    # until stable so nested empties like '(())' disappear as well.
    previous = None
    while previous != txt:
        previous = txt
        txt = _EMPTY_PARENS.sub('', txt)

    # Collapse repeated dari into one, followed by a space ...
    txt = _DARI_RUN.sub('। ', txt)
    # ... and only then squeeze runs of spaces, so the space added after a
    # dari never produces a double space.
    txt = _SPACE_RUN.sub(' ', txt)

    return txt.strip()

In [None]:
import random

# Pick one article at random to eyeball the raw text.
# random.randrange(n) yields 0..n-1; random.randint(0, n) is inclusive of n
# and would occasionally index one past the last row.
sample = data['contents'].iloc[random.randrange(len(data))]

print(sample)

In [None]:
# Run the cleaner on the sampled article; the cleaned string is the cell output.
clean(sample)