In [None]:
from glob import glob
from tqdm import tqdm
import json
from langdetect import detect as lang_detect
import re
import pandas as pd

from banglakit import lemmatizer as lem
from banglakit.lemmatizer import BengaliLemmatizer

# Single shared lemmatizer instance; lemmatize_sentence() calls it once per word.
lemmatizer = BengaliLemmatizer()

In [None]:
# --- Sentence filters (enforced by is_valid) ---
MAX_CHAR_LEN = 150  # sentences with more characters than this are rejected
MIN_CHAR_LEN = 10   # sentences with fewer characters than this are rejected
MIN_WORD_LEN = 3    # minimum number of whitespace-separated words in a sentence

# Sentence pairs whose Jaccard similarity is strictly above this value are
# treated as near-duplicates by the de-duplication step.
SIMILARITY_THRESHOLD = 0.7

# --- I/O locations ---
# Input is read line by line; each line is later decoded with json.loads().
# NOTE(review): this is an absolute, machine-specific path.
input_text_path = '/hdd/sifat/NLP/sentiment_analysis/data/news-analyzer1.json'
output_csv_path = 'news_1k_articles_15k.csv'

## Load the raw text source

In [None]:
# Read the raw dump, keeping one string per input line; every line is decoded
# with json.loads() by the extraction step further down.
# The encoding is passed explicitly because the platform default is not
# guaranteed to be UTF-8 and the downstream filters expect Bangla text.
with open(input_text_path, encoding='utf-8') as input_f:
    # Whitespace-only lines carry no JSON document and would make json.loads() fail.
    text_list = [line for line in input_f if line.strip()]

# The format string has two placeholders, so both the count and the path must be
# supplied (passing only the count raised IndexError).
print('Loaded {} lines from {} file'.format(len(text_list), input_text_path))

## Cleaning and Filtering

In [None]:
def clean(text):
    """Return `text` with quote marks, slashes, em dashes and pipes removed.

    All other characters are kept unchanged and in their original order.
    """
    unwanted = "'‘’“”/\"—|"  # characters that are dropped from the text
    return "".join(ch for ch in text if ch not in unwanted)

def is_all_bangla(text):
    """Tell whether `text` consists solely of the allowed characters.

    Allowed are the code points U+0980 to U+09FF, the space, and the
    punctuation marks danda, comma, question mark, exclamation mark and
    full stop. An empty string yields False because at least one character
    is required.
    """
    # Note: "$" also matches just before a single trailing newline.
    return re.match("^[\u0980-\u09FF ।,?!.]+$", text) is not None
    
def is_valid(text):
    """Decide whether a candidate sentence should be kept.

    A sentence is accepted only if all of the following hold:
      * its character count lies within [MIN_CHAR_LEN, MAX_CHAR_LEN],
      * it has at least MIN_WORD_LEN whitespace-separated words,
      * is_all_bangla() accepts it.

    Returns True when the sentence is accepted, False otherwise.
    """
    char_count = len(text)
    if not (MIN_CHAR_LEN <= char_count <= MAX_CHAR_LEN):
        return False

    word_count = len(text.split())
    if word_count < MIN_WORD_LEN:
        return False

    # The character-set check comes last, as in the length/word checks above
    # the cheaper tests reject first.
    return is_all_bangla(text)


def lemmatize_sentence(text):
    """Lemmatize `text` word by word and return the words joined by spaces.

    Each whitespace-separated word is passed to the module-level `lemmatizer`
    with pos=lem.POS_NOUN. The lemma replaces the word only when
    is_all_bangla() accepts it; otherwise the original word is kept.

    Every word, including the last one, is followed by a single space, so a
    non-empty result ends with a trailing space. Empty input yields "".
    """
    pieces = []
    for token in text.split():
        lemma = lemmatizer.lemmatize(token, pos=lem.POS_NOUN)
        pieces.append(lemma if is_all_bangla(lemma) else token)
    return "".join(piece + " " for piece in pieces)
  

In [None]:
clean_sentences = []  # (sentence, article_url) pairs that passed is_valid()
lem_sentences = []    # lemmatized form of clean_sentences[i]; kept index-aligned with it

for text in text_list:
    # Decode each line once; the body and the URL both live under "_source".
    source = json.loads(text)["_source"]
    article, article_url = source["body"], source["url"]

    # The danda "।" is used as the sentence delimiter.
    for m in article.split("।"):
        # Clean and strip once, then reuse the result for validation and storage.
        candidate = clean(m).strip()
        if is_valid(candidate):
            clean_sentences.append((candidate, article_url))
            lem_sentences.append(lemmatize_sentence(candidate))


In [None]:
# `total` is not assigned in any cell shown, so a fresh-kernel run raised NameError.
# The article count is the number of loaded input lines (one article per line).
print(f"Total articles {len(text_list)} Valid and clean sentences {len(clean_sentences)}")

In [None]:
# Preview the first ten (sentence, article_url) pairs as a quick sanity check.
clean_sentences[:10]

## Similarity checker based on Jaccard similarity

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections.

    Both inputs are reduced to sets first, so repeated items do not distort
    the score (sizing the union from the raw list lengths under-reports the
    similarity whenever a word occurs more than once).

    Args:
        list1: iterable of hashable items, e.g. the words of a sentence.
        list2: iterable of hashable items.

    Returns:
        A float in [0.0, 1.0]. Inputs with the same distinct items give 1.0;
        two empty inputs are treated as identical and also give 1.0 instead
        of raising ZeroDivisionError.
    """
    set1, set2 = set(list1), set(list2)
    union_size = len(set1 | set2)
    if union_size == 0:
        # Both collections are empty: nothing distinguishes them.
        return 1.0
    return len(set1 & set2) / union_size

In [None]:
# Sanity check on two sentences that differ in only two words. By inspection
# they share 9 of 13 distinct words (about 0.69) — confirm against the cell output.
jaccard_similarity("নতুন সিমে কয় টাকা লোড দিলে এক জিবি নেট পাওয়া যায়".split(),"নতুন সিমে কতো টাকা লোড দিলে এক জিবি নেট পাওয়া যাবে".split())

In [None]:
# Tokenize every lemmatized sentence once up front; re-splitting inside the
# pairwise loop repeated the same work for every comparison.
lem_tokens = [lem_sentence.split() for lem_sentence in lem_sentences]

final_sentences = []
# enumerate() hides the length from tqdm, so pass it explicitly to get a
# progress bar with a percentage and an ETA.
for clean_idx, (sentence, s_id) in tqdm(enumerate(clean_sentences), total=len(clean_sentences)):
    # lem_sentences is built index-aligned with clean_sentences, so the
    # lemmatized form of this sentence is already available (this assumes
    # lemmatize_sentence() is deterministic) and need not be recomputed.
    own_tokens = lem_tokens[clean_idx]

    # any() stops at the first hit, like the explicit break it replaces.
    has_near_duplicate = any(
        jaccard_similarity(own_tokens, other_tokens) > SIMILARITY_THRESHOLD
        for lem_idx, other_tokens in enumerate(lem_tokens)
        if lem_idx != clean_idx  # never compare a sentence with itself
    )

    # NOTE(review): every sentence is compared against ALL others, so each
    # member of a near-duplicate group is dropped and no representative is
    # kept. Confirm this is intended rather than "keep the first occurrence".
    if not has_near_duplicate:
        final_sentences.append((sentence, s_id))

In [None]:
# `total` is not assigned in any cell shown, so a fresh-kernel run raised NameError.
# The figure reported here is the number of clean sentences that entered the
# de-duplication step, to be compared with the number that survived it.
print(f"Total {len(clean_sentences)} unique based on jaccard similarity {len(final_sentences)}")

In [None]:
# Preview the first five surviving (sentence, article_url) pairs.
final_sentences[:5]

## Create the output CSV file

In [None]:
# Write the (sentence, article_url) rows with no header row and no index column.
# The destination comes from the configured output_csv_path instead of a
# repeated string literal, so changing the config cell actually takes effect.
pd.DataFrame(final_sentences).to_csv(output_csv_path, header=False, index=False)