In [1]:
import time
# Wall-clock timestamp taken when the notebook starts; intended for reporting
# the total execution time at the end of the run.
start_time = time.time()

In [2]:
import pandas as pd
import numpy as np
import random
import glob
import logging
import os
import re
from multiprocessing import Pool

from langdetect import detect_langs
import pycld2 as cld2

from bs4 import BeautifulSoup

In [3]:
# Log line layout: timestamp - severity - message.
logFormatter = '%(asctime)s - %(levelname)s - %(message)s'
# Configures the root logger at DEBUG, so records from imported libraries
# are emitted as well as this notebook's own messages.
logging.basicConfig(format=logFormatter, level=logging.DEBUG)
logger = logging.getLogger(__name__)

### Setup paths

In [4]:
# Root folder that is searched recursively for the *.html article files.
TG_DATA = "./Data/TG_Data"

### Useful functions

In [5]:
def find_between_html_tag(file_ptr,tag):
    """Return the text of the first ``<tag>...</tag>`` element found in a file.

    The file is scanned line by line. Only a bare opening tag (``<tag>``
    without attributes) is recognised. When the closing tag is not on the
    same line, the following lines are appended until a line containing
    ``</tag>`` is read; newlines are then replaced by spaces and the
    surrounding whitespace is stripped. Any markup nested inside the
    element is removed from the returned text.

    The match is greedy: if a line holds several ``<tag>`` elements, the
    text from the first opening tag to the last closing tag is used.

    Parameters
    ----------
    file_ptr : iterator of str
        Open text file (or any one-shot iterator over lines). It is
        consumed as far as the scan needs; on failure it is exhausted.
    tag : str
        Tag name, e.g. ``'h1'`` or ``'p'``.

    Returns
    -------
    tuple
        ``(tag, text)`` on success, ``(None, None)`` when no complete
        element is found.
    """
    opening = re.compile(f"<{tag}>")
    closing = re.compile(f"</{tag}>")
    inner = re.compile(f"(?<=<{tag}>)(.*)(?=</{tag}>)",re.MULTILINE)
    any_markup = re.compile(r'<.*?>')

    for line in file_ptr:
        if not opening.search(line):
            continue

        # Element opens and closes on the same line.
        found = inner.search(line)
        if found:
            return tag,any_markup.sub('',found.group())

        # Element spans several lines: keep reading from the same iterator
        # until the closing tag shows up.
        collected = line
        for extra in file_ptr:
            collected = collected + extra
            if closing.search(extra):
                found = inner.search(collected.replace('\n',' '))
                return tag,any_markup.sub('',found.group().strip())

    return None,None

In [6]:
def find_headline(file_path):
    """Extract a headline-like text snippet from one HTML file.

    The first ``<h1>`` element is preferred. If the file has none, the first
    ``<p>`` element is used instead.

    Parameters
    ----------
    file_path : str
        Path of the HTML file to read.

    Returns
    -------
    tuple
        ``(fname, html_sec, text)`` where ``fname`` is the file's base name,
        ``html_sec`` is ``'h1'`` or ``'p'`` depending on which tag supplied
        the text, and ``text`` is the extracted string. ``html_sec`` and
        ``text`` are both ``None`` when neither tag is found.
    """
    fname = os.path.basename(file_path)

    with open(file_path,'r') as file_ptr:
        html_sec,text = find_between_html_tag(file_ptr,'h1')

        # Fall back to <p> ONLY IF no <h1> was found.
        if html_sec is None:
            # The unsuccessful <h1> scan read the file to the end, so rewind;
            # otherwise the <p> scan would start at EOF and never match.
            file_ptr.seek(0)
            html_sec,text = find_between_html_tag(file_ptr,'p')
    return fname,html_sec,text
    

In [7]:
def read_filelist(folder_path = "."):
    """List every ``.html`` file below ``folder_path``.

    The search is recursive, so files directly inside ``folder_path`` and
    files in any nested sub-folder are both returned.

    Parameters
    ----------
    folder_path : str, optional
        Folder to search. Defaults to the current directory.

    Returns
    -------
    list of str
        Matching paths, each prefixed with ``folder_path``.
    """
    html_pattern = os.path.join(folder_path, "**/*.html")
    return glob.glob(html_pattern, recursive=True)

### Process the files

#### Read data

In [8]:
%%time
# Collect the path of every .html file under TG_DATA (recursive glob).
file_list = read_filelist(TG_DATA)

CPU times: user 2.03 s, sys: 2.54 s, total: 4.57 s
Wall time: 11.7 s


In [9]:
# Report how many HTML files were discovered.
logger.info(f'Number of files : {len(file_list)}')

2019-11-23 20:33:35,641 - INFO - Number of files : 766886


In [10]:
%%time
pool = Pool()
results= pool.map(find_headline, file_list)
pool.close()

CPU times: user 850 ms, sys: 396 ms, total: 1.25 s
Wall time: 34.4 s


In [11]:
%%time
# One row per file, built from the (fname, html_sec, text) tuples returned by find_headline.
df_headline = pd.DataFrame(results, columns=['fname', 'html_sec', 'text'])

CPU times: user 343 ms, sys: 19.9 ms, total: 363 ms
Wall time: 370 ms


#### Language tagging using `cld2` package

##### `cld2` seems to give better results than the `langdetect` package
##### https://github.com/CLD2Owners/cld2

In [12]:
def compute_lang_prob(t):
    """Summarise a language-detection result into a flat dict.

    Parameters
    ----------
    t : sequence
        Detection result whose item at index 2 is an iterable of
        per-language records. Each record carries the language code at
        index 1 and its score at index 2. This presumably mirrors the
        tuple returned by ``cld2.detect`` - verify against the pycld2 docs.

    Returns
    -------
    dict
        ``top_l``      - code of the highest-scoring language, or ``None``
                         when no record has a score above 0.
        ``top_l_prob`` - score of that language (``0.0`` if none).
        ``en_prob``    - score reported for ``'en'`` (``0.0`` if absent).
        ``ru_prob``    - score reported for ``'ru'`` (``0.0`` if absent).

    Notes
    -----
    Best effort by design: an empty or malformed result (for example the
    empty tuple used as a failure marker) yields the defaults, and a
    malformed record stops the scan while keeping what was read so far.
    """
    top_l = None
    top_l_prob = 0.0

    en_prob = 0.0
    ru_prob = 0.0
    try:
        for record in t[2]:
            lang_code, prob = record[1], record[2]
            if prob > top_l_prob:
                top_l_prob = prob
                top_l = lang_code
            if lang_code == 'en':
                en_prob = prob
            elif lang_code == 'ru':
                ru_prob = prob
    except (LookupError, TypeError):
        # Only the errors a missing or badly shaped result can raise are
        # swallowed; KeyboardInterrupt and friends now propagate.
        pass

    return {'top_l' : top_l, 'top_l_prob' :top_l_prob ,'en_prob' : en_prob, 'ru_prob' : ru_prob}

In [13]:
def detect_langage(text,method = 'cld2'):
    """Detect the language of ``text`` and return the summary probabilities.

    Parameters
    ----------
    text : str
        Text to classify. ``bytes`` are decoded as UTF-8; any other
        non-string value (e.g. ``None`` for a file without a headline) is
        treated as empty text instead of raising.
    method : str, optional
        Detector to use. Valid values are ``'cld2'`` (default),
        ``'langdetect'`` and ``'polyglot'``. Only ``'cld2'`` is fully
        supported: ``'polyglot'`` is not implemented, and the
        ``'langdetect'`` result is not yet converted into the shape
        ``compute_lang_prob`` reads.

    Returns
    -------
    dict
        The dict produced by ``compute_lang_prob``. When detection fails
        or the method is unknown, an empty result is summarised instead.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    elif not isinstance(text, str):
        text = ""

    # Drop characters that cannot be represented as UTF-8 (e.g. lone
    # surrogates). Without 'ignore' on encode() this step would raise
    # instead of cleaning the text.
    text = text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

    try:
        if method == 'cld2':
            # Pass to cld2
            result = cld2.detect(text, bestEffort=False)
        elif method == 'langdetect':
            ### TODO : return values properly
            result = detect_langs(text)
        elif method == 'polyglot':
            ### TODO : implement polyglot
            result = tuple()
        else:
            result = tuple()
    except Exception:
        # Detector errors are treated as "no detection"; unlike a bare
        # except, this still lets KeyboardInterrupt/SystemExit through.
        result = tuple()

    # Now, compute the probabilities
    return compute_lang_prob(result)

In [14]:
%%time
pool = Pool()
results= pool.map(detect_langage, list(df_headline['text']))
pool.close()

CPU times: user 1.06 s, sys: 459 ms, total: 1.52 s
Wall time: 2.56 s


In [16]:
# One row per headline with the columns top_l, top_l_prob, en_prob and ru_prob.
df_probs = pd.DataFrame(results)

In [18]:
# %%time
# # Compute result and update it to a df
# df_probs = pd.DataFrame(list(df_headline['text'].apply(lambda x : detect_langage(x))))

In [19]:
# Put the headline columns and the language columns side by side; rows are
# matched on the index, which both frames inherit from the same list order.
df_result = pd.concat([df_headline,df_probs],axis=1)

In [91]:
# Preview the first rows of the combined headline / language table.
df_result.head()

Unnamed: 0,fname,html_sec,text,top_l,top_l_prob,en_prob,ru_prob
0,4120487137627119965.html,h1,中国冀望加强与印尼的渔业合作,zh,97.0,0.0,0.0
1,2053985518435310483.html,h1,تكريم مصطفى قمر عن مجمل أعماله بقصر السينما.. صور,ar,98.0,0.0,0.0
2,7679282538646280504.html,h1,"बैंकों के विलय से 7000 शाखाओं पर खतरा, आप पर ह...",hi,99.0,0.0,0.0
3,1105816786000706407.html,h1,Punjab as a driver for Pakistan’s growth,en,97.0,97.0,0.0
4,6181833003973924891.html,h1,Thánh lễ ở London cầu nguyện cho 39 nạn nhân c...,vi,98.0,0.0,0.0


### Inspect results obtained

There seem to be quite a few false negatives (the model tagged the article's language as `None`). Possible remedies:
1. We can run other taggers on these articles and use their output.
2. We can fetch the body of these files and tag the language using that instead.

In [41]:
# np.array(df_result[(df_result['top_l']=='en') & (df_result['top_l_prob'] < 99)]['text']).shape

In [42]:
# np.array(df_result[(df_result['top_l']=='en') & (df_result['top_l_prob'] > 90) &
#                    (df_result['top_l_prob'] < 96)]['text']).tolist()

In [43]:
# list(df_result[(df_result['top_l'].isna())]['text'])

In [44]:
# df_result[df_result['fname'] == '1360813307990682336.html'].iloc[0,2]

### Prepare the output

Sample : 

    [
      {
        "lang_code": "en",
        "articles": [
          "981787246124324.html",
          "239748235923753.html",
          ...
        ]
      },
      {
        "lang_code": "ru",
        "articles": [
          "273612748127432.html",
          ...
        ]
      },
      ...
    ]

In [45]:
# For now, keep only the cases where the model reported a score of at least 95
en_articles = df_result.loc[df_result['en_prob'] >= 95, 'fname'].tolist()
ru_articles = df_result.loc[df_result['ru_prob'] >= 95, 'fname'].tolist()

In [46]:
def prepare_output(lang_code,article_list):
    """Build one entry of the final output list.

    Parameters
    ----------
    lang_code : str
        Language code stored under ``"lang_code"``.
    article_list : list
        File names stored (as the same object, not a copy) under ``"articles"``.

    Returns
    -------
    dict
        ``{"lang_code": lang_code, "articles": article_list}``
    """
    #TODO : Make sure lang_code is a valid
    #       ISO 639-1 two-letter language code
    return dict(lang_code=lang_code, articles=article_list)

In [47]:
# One entry per target language: English first, then Russian.
output = [
    prepare_output(lang_code, articles)
    for lang_code, articles in (("en", en_articles), ("ru", ru_articles))
]

In [None]:
# Emit the final structure: a list holding one dict per language.
print(output)

In [48]:
# print("Total execution time : " + "{0:.2}".format(str(time.time()-start_time)) + " seconds")

Total execution time : 12 seconds


### Validating
If we pass in the whole document, the language detectors can perform much better. We will test that by comparing the results against the accuracy obtained by the headline-based language detector.

In [49]:
def get_soup(file):
    """Open the HTML file at ``file`` and return it parsed by BeautifulSoup with the 'lxml' parser."""
    with open(file,'r') as html_handle:
        return BeautifulSoup(html_handle,'lxml')

In [50]:
def extract_meta(soup):
    """Collect selected ``<meta property=...>`` values from a parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document; only ``soup.find("meta", property=...)`` is used.

    Returns
    -------
    dict
        Always contains the keys ``title``, ``url``, ``site_name``,
        ``published_time`` and ``description``, in that order. A field whose
        meta tag is missing, or whose tag has no ``content`` attribute, is
        logged as an error and set to ``""``.
    """
    # Output key -> value of the meta tag's `property` attribute.
    meta_properties = {
        'title': "og:title",
        'url': "og:url",
        'site_name': "og:site_name",
        'published_time': "article:published_time",
        'description': "og:description",
    }

    d = {}
    for field, prop in meta_properties.items():
        try:
            d[field] = soup.find("meta", property=prop)['content']
        except (TypeError, KeyError):
            # TypeError: find() returned None (tag absent).
            # KeyError: the tag exists but carries no `content` attribute.
            logger.error(f'{field} not found')
            d[field] = ""

    return d

In [51]:
def extract_text(soup,tag = 'all'):
    """Return text from a parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document.
    tag : {'all', 'p', 'h1'}, optional
        ``'all'`` returns the whole document text with surrounding
        whitespace stripped. ``'p'`` or ``'h1'`` returns the text of every
        such element, concatenated in document order with no separator.

    Returns
    -------
    str
        The extracted text; empty when no element of the requested tag exists.
    """
    assert tag in ['all','p','h1']
    if tag != 'all':
        return "".join(node.getText() for node in soup.find_all(tag))
    return soup.text.strip()

In [52]:
def parse_html_file(file):
    """Parse one HTML file into a flat record.

    Returns the dict produced by ``extract_meta`` with one extra key,
    ``'p_text'``, holding the concatenated text of all ``<p>`` elements.
    """
    soup = get_soup(file)
    record = extract_meta(soup)
    record.update(p_text=extract_text(soup,'p'))
    return record

In [53]:
# TODO : Make a good sanitization function @Jun
def sanitize_text(text):
    """Remove lines that start with an http(s) URL.

    A line is dropped, together with its trailing line break(s), when it
    begins with ``http://`` or ``https://``. URLs appearing later in a line
    are left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without the URL lines.
    """
    # One pass is enough: each match runs to the end of its line, so
    # consecutive URL lines are all caught by the same substitution.
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

In [54]:
%%time
pool = Pool()
results= pool.map(parse_html_file, file_list)
pool.close()

2019-11-23 20:59:07,606 - ERROR - Title not found


CPU times: user 9.33 s, sys: 7.7 s, total: 17 s
Wall time: 5min 36s


In [55]:
# One row per parsed file; all_text is the title and the paragraph text
# joined by a newline, used as input for full-document language detection.
df_parsed = pd.DataFrame(results).assign(
    all_text=lambda frame: frame['title'] + "\n" + frame['p_text']
)

2019-11-23 20:59:38,434 - INFO - NumExpr defaulting to 8 threads.


In [57]:
%%time
pool = Pool()
results= pool.map(detect_langage, list(df_parsed['all_text']))
pool.close()

CPU times: user 10 s, sys: 18.4 s, total: 28.4 s
Wall time: 47.7 s


In [58]:
# One row per document with the columns top_l, top_l_prob, en_prob and ru_prob.
df_probs_a = pd.DataFrame(results)

In [59]:
# %%time
# # Compute result and update it to a df
# df_probs_a = pd.DataFrame(list(df_parsed['all_text'].apply(lambda x : detect_langage(sanitize_text(x)))))

In [60]:
# Put the parsed-document columns and the language columns side by side;
# rows are matched on the index, which both frames inherit from the same list order.
df_result_a = pd.concat([df_parsed,df_probs_a],axis=1)

In [65]:
# Number of documents for which no top language was assigned (top_l is missing).
df_result_a[df_result_a.top_l.isna()]['all_text'].shape

(2451,)

In [90]:
# (df_result_a[(df_result_a['top_l']=='en') & (df_result_a['top_l_prob'] < 90)][['title','top_l_prob']])

In [85]:
# Distribution of the English score across all documents (counts, then bin edges).
np.histogram(df_result_a.en_prob)

(array([585318,   2824,    546,    226,    152,     65,     74,    120,
           389, 177172]),
 array([ 0. ,  9.9, 19.8, 29.7, 39.6, 49.5, 59.4, 69.3, 79.2, 89.1, 99. ]))