In [89]:
import time
# Wall-clock reference taken at the very start of the run; the last cell
# subtracts it from time.time() to report the total execution time.
start_time = time.time()

In [35]:
import pandas as pd
import numpy as np
import random
import glob
import logging
import os
import re
from multiprocessing import Pool

from langdetect import detect_langs
import pycld2 as cld2

In [2]:
# Layout of every log line: timestamp - severity - message.
logFormatter = '%(asctime)s - %(levelname)s - %(message)s'
# Configure the root logger with that layout and let DEBUG (and above) through.
logging.basicConfig(format=logFormatter, level=logging.DEBUG)
# Logger named after the current module; used by the cells further down.
logger = logging.getLogger(__name__)

### Setup paths

In [3]:
# Root folder that is searched (recursively) for the input .html files.
DATA = "./Data"

### Useful functions

In [4]:
def find_between_html_tag(file_ptr,tag):
    """Return the text of the first ``<tag>...</tag>`` element read from ``file_ptr``.

    Lines are consumed until one containing the opening ``<tag>`` is seen. If the
    closing tag is on that same line, the content is taken from it directly.
    Otherwise the following lines are appended (newlines become spaces) until a
    line containing the closing tag shows up. Markup nested inside the element
    is removed from the returned text.

    Args:
        file_ptr: iterator over text lines, e.g. an open text file. It is
            consumed while searching and is NOT rewound afterwards.
        tag: bare tag name such as ``'h1'`` or ``'p'``. Only attribute-less
            opening tags (``<h1>``) are recognised.

    Returns:
        ``(tag, text)`` for the first complete element, or ``(None, None)`` when
        the input holds no complete element.
    """
    name = re.escape(tag)

    # Non-greedy on purpose: stop at the FIRST closing tag, so several elements
    # sharing one line (e.g. minified HTML) are not merged into a single text.
    content_re = re.compile(f"<{name}>(.*?)</{name}>")
    open_re = re.compile(f"<{name}>")
    close_re = re.compile(f"</{name}>")
    markup_re = re.compile(r'<.*?>')

    for line in file_ptr:
        if not open_re.search(line):
            continue

        found = content_re.search(line)
        if found:
            # Opening and closing tag sit on the same line.
            return tag, markup_re.sub('', found.group(1))

        # The element spans several lines: keep pulling lines from the same
        # iterator until one of them carries the closing tag.
        for newline in file_ptr:
            line = line + newline
            if close_re.search(newline):
                found = content_re.search(line.replace('\n', ' '))
                if found:
                    return tag, markup_re.sub('', found.group(1).strip())

    return None, None

In [11]:
def find_headline(file_path):
    """Extract a headline for one HTML file.

    The first ``<h1>`` element is used; when the file has none, the first
    ``<p>`` element is used instead.

    Args:
        file_path: path of the HTML file to read.

    Returns:
        ``(fname, html_sec, text)`` where ``fname`` is the base name of
        ``file_path``, ``html_sec`` is the tag the text was taken from
        (``'h1'`` or ``'p'``) and ``text`` is the extracted text. ``html_sec``
        and ``text`` are both ``None`` when neither tag is found.
    """
    fname = os.path.basename(file_path)

    # Decode explicitly as UTF-8 instead of relying on the platform default, and
    # drop undecodable bytes so one malformed file cannot abort a whole batch.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file_ptr:
        html_sec, text = find_between_html_tag(file_ptr, 'h1')

        # Fall back to <p> ONLY IF no <h1> was found
        if html_sec is None:
            # The <h1> search has read the stream to its end; rewind, otherwise
            # the <p> search would see no lines at all and always come back empty.
            file_ptr.seek(0)
            html_sec, text = find_between_html_tag(file_ptr, 'p')
    return fname, html_sec, text
    

In [6]:
def read_filelist(folder_path = "."):
    """List every ``.html`` file under ``folder_path``, sub-folders included.

    Args:
        folder_path: directory where the recursive search starts.

    Returns:
        list of matching paths, in the order ``glob`` yields them.
    """
    pattern = os.path.join(folder_path, "**/*.html")
    return glob.glob(pattern, recursive=True)

### Process the files

#### Read data

In [8]:
%%time
# Collect the path of every .html file found under DATA.
file_list = read_filelist(DATA)

CPU times: user 691 ms, sys: 585 ms, total: 1.28 s
Wall time: 1.59 s


In [9]:
# Sanity check: report how many input files were discovered.
logger.info(f'Number of files : {len(file_list)}')

2019-11-20 04:23:56,972 - INFO - Number of files : 300932


In [12]:
%%time
# Extract the headline of every file in parallel (Pool() defaults to one worker
# per CPU). The context manager tears the worker processes down even when
# find_headline raises, so a failed run does not leave workers behind.
with Pool() as pool:
    results = pool.map(find_headline, file_list)

CPU times: user 331 ms, sys: 156 ms, total: 487 ms
Wall time: 11.4 s


In [13]:
%%time
# One row per file: file name, the tag the text came from, and the extracted text.
df_headline = pd.DataFrame(results, columns=['fname', 'html_sec', 'text'])

CPU times: user 145 ms, sys: 8.72 ms, total: 154 ms
Wall time: 153 ms


#### Language tagging using `cld2` package

##### `cld2` seems to give better results than the `langdetect` package
##### https://github.com/CLD2Owners/cld2

In [14]:
def compute_lang_prob(t):
    """Summarise a language-detector result into a flat dict.

    Args:
        t: detector result whose third item (``t[2]``) is an iterable of
            per-language entries; in each entry, index 1 is read as the language
            code and index 2 as its score. Presumably this is the tuple returned
            by ``cld2.detect`` - confirm against the pycld2 documentation.
            Anything that does not have this shape (empty tuple, ``None``, ...)
            yields the default values.

    Returns:
        dict with the keys ``top_l`` (code of the highest-scoring entry, or
        ``None``), ``top_l_prob`` (its score), ``en_prob`` and ``ru_prob``
        (scores of the ``'en'`` and ``'ru'`` entries, ``0.0`` when absent).
    """
    top_l = None
    top_l_prob = 0.0
    en_prob = 0.0
    ru_prob = 0.0

    try:
        for entry in t[2]:
            score = entry[2]
            code = entry[1]
            # Strict ">" keeps the first entry when several share the top score.
            if score > top_l_prob:
                top_l_prob = score
                top_l = code
            if code == 'en':
                en_prob = score
            elif code == 'ru':
                ru_prob = score
    except Exception:
        # Deliberate best effort: a missing or malformed result keeps whatever
        # was gathered so far (the defaults for an empty result). Unlike a bare
        # "except:", KeyboardInterrupt and SystemExit still propagate.
        pass

    return {'top_l' : top_l, 'top_l_prob' : top_l_prob, 'en_prob' : en_prob, 'ru_prob' : ru_prob}

In [55]:
def detect_langage(text,method = 'cld2'):
    """Detect the language of ``text`` and summarise the detector output.

    Args:
        text: string to classify. ``None`` or any other non-string value (for
            instance a file for which no headline could be extracted) is
            treated as undetectable instead of raising.
        method: detector to run - ``'cld2'`` (default), ``'langdetect'`` or
            ``'polyglot'``. ``'polyglot'`` is not implemented, and unknown
            values behave the same way (empty result).

    Returns:
        The dict built by ``compute_lang_prob`` from the detector result; when
        nothing could be detected it is built from an empty tuple.
    """
    result = tuple()

    if isinstance(text, str):
        try:
            # Drop code points that cannot be encoded as UTF-8 (e.g. lone
            # surrogates) rather than failing on them.
            text = text.encode('utf-8', 'ignore').decode('utf-8')

            if method == 'cld2':
                # Pass to cld2
                result = cld2.detect(text, bestEffort=False)
            elif method == 'langdetect':
                ### TODO : return values properly (the raw detect_langs output is
                ###        handed to compute_lang_prob unchanged)
                result = detect_langs(text)
            elif method == 'polyglot':
                ### TODO : implement polyglot
                result = tuple()
        except Exception:
            # Best effort: a detector failure counts as "no language found".
            result = tuple()

    # Now, compute the probabilities
    return compute_lang_prob(result)

In [16]:
# %%time
# pool = Pool()
# results= pool.map(detect_langage, list(df_headline['text']))
# pool.close()

In [17]:
%%time
# Tag every headline, then expand the per-row result dicts into columns
df_probs = pd.DataFrame(df_headline['text'].map(detect_langage).tolist())

CPU times: user 3.35 s, sys: 172 ms, total: 3.52 s
Wall time: 3.48 s


In [18]:
# Put the language columns next to the headline columns (column-wise concat, aligned on the index).
df_result = pd.concat([df_headline,df_probs],axis=1)

In [58]:
# Preview the first rows of the combined frame.
df_result.head()

Unnamed: 0,fname,html_sec,text,top_l,top_l_prob,en_prob,ru_prob
0,302165934779691477.html,h1,Widow of ex-KGB agent plans legal action on UK...,en,98.0,98.0,0.0
1,8375520247404441920.html,h1,Уряд спростив низку бюрократичних процедур,uk,98.0,0.0,0.0
2,3780707622460920289.html,h1,"На Украине считают, что иностранцев не следует...",ru,99.0,0.0,99.0
3,8724142282929742420.html,h1,Дмитрий Дибров и Максим Галкин сыграют в «Кто ...,ru,99.0,0.0,99.0
4,2827746228549799757.html,h1,«Джокер» собрал миллиард долларов в мировом пр...,ru,98.0,0.0,98.0


### Inspect results obtained

There seem to be quite a few false negatives (the model tagged the article as None):
1. We can run other taggers on this and take output
2. We can fetch the body of these files and then tag using that

In [44]:
# np.array(df_result[(df_result['top_l']=='en') & (df_result['top_l_prob'] < 96)]['text'])

In [56]:
# list(df_result[(df_result['top_l'].isna())]['text'])

In [57]:
# df_result[df_result['fname'] == '1360813307990682336.html'].iloc[0,2]

### Prepare the output

Sample : 

    [
      {
        "lang_code": "en",
        "articles": [
          "981787246124324.html",
          "239748235923753.html",
          ...
        ]
      },
      {
        "lang_code": "ru",
        "articles": [
          "273612748127432.html",
          ...
        ]
      },
      ...
    ]

In [69]:
# For now, keep only the files whose English / Russian score is at least 95
en_articles = df_result.loc[df_result['en_prob'] >= 95, 'fname'].tolist()
ru_articles = df_result.loc[df_result['ru_prob'] >= 95, 'fname'].tolist()

In [62]:
def prepare_output(lang_code,article_list):
    """Build one entry of the output list.

    Args:
        lang_code: language code stored under ``"lang_code"``.
        article_list: file names stored under ``"articles"``.

    Returns:
        dict with exactly those two keys.
    """
    #TODO : Make sure lang_code is a valid
    #       ISO 639-1 two-letter language code
    return dict(lang_code=lang_code, articles=article_list)

In [70]:
# Final payload: one entry per language, English first, then Russian
output = [
    prepare_output(code, articles)
    for code, articles in (("en", en_articles), ("ru", ru_articles))
]

In [95]:
# Format the elapsed seconds as a float. A ".2" precision applied to str(...)
# only keeps the first two characters of the text (125.7 s would print as "12").
print(f"Total execution time : {time.time() - start_time:.2f} seconds")

Total execution time : 25 seconds
