In [2]:
import glob
import itertools
import json
import logging
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pycld2 as cld2

import text_preprocessing as tp

In [1]:
### LOGGER
# Configure logging ("timestamp - level - message") at DEBUG level and create
# the module-level `logger` that the helper functions in this notebook use.
# NOTE: this cell needs `logging` to be imported already; the recorded
# NameError comes from running it before the imports cell.
logFormatter = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=logFormatter, level=logging.DEBUG)
logger = logging.getLogger(__name__)

NameError: name 'logging' is not defined

In [3]:
def read_filelist(folder_path,**kwargs):
    '''Return the paths of the files with a given extension found under
    `folder_path`.

    Keyword arguments:
        recursive (bool): when True (the default) the search descends into
            every sub-folder of `folder_path`. Pass `recursive=False` to
            list only the files directly inside `folder_path`. A non-bool
            value is ignored and the default is used.
        file_type (str): extension to match, without the leading dot. Glob
            wildcards are allowed. Defaults to 'htm*' (matches both `.htm`
            and `.html`). A non-str value is ignored and the default is used.

    Returns:
        list of str: the matching paths as produced by `glob.glob`, i.e.
        each one is prefixed with `folder_path`.
    '''
    recursive = kwargs.get('recursive', True)
    if not isinstance(recursive, bool):
        logger.info('Recursive option not passed correctly. Defaulting to True')
        recursive = True

    file_type = kwargs.get('file_type')
    if not isinstance(file_type, str):
        # Covers both "not passed" and "passed with the wrong type".
        logger.info("Defaulting to 'htm*'")
        file_type = 'htm*'

    if recursive:
        # With recursive=True, '**' matches zero or more directories, so
        # files directly in folder_path are included as well.
        r_path = os.path.join(folder_path, "**/*." + file_type)
    else:
        # Without recursive=True, '**' behaves like '*' and would only match
        # files exactly one level down; search the folder itself instead.
        r_path = os.path.join(folder_path, "*." + file_type)

    return glob.glob(r_path, recursive=recursive)


def get_soup(file):
    '''Return the BeautifulSoup object of the html file provided.

    `file` is the path of the HTML file. The document is parsed with the
    'lxml' parser.
    '''
    from bs4 import BeautifulSoup

    # Read the raw bytes and let BeautifulSoup work out the encoding.
    # Text mode would decode with the platform's default encoding and raise
    # (or mis-decode) on files written in a different one.
    with open(file,'rb') as file_ptr:
        soup = BeautifulSoup(file_ptr,'lxml')
    return soup


def extract_meta(soup):
    '''Extract the Open Graph / article metadata from a parsed HTML page.

    This is a specific function for the type of file TG has provided in
    the samples; the look-up values are based on that.

    Returns a dict with the keys 'title', 'url', 'site_name',
    'published_time' and 'description'. Each value is the `content`
    attribute of the matching `<meta property="...">` tag. A field whose tag
    is missing (as in a generic html file) or has no `content` attribute is
    returned as an empty string.
    '''
    # (output key, value of the meta tag's `property` attribute)
    meta_properties = (
        ('title', "og:title"),
        ('url', "og:url"),
        ('site_name', "og:site_name"),
        ('published_time', "article:published_time"),
        ('description', "og:description"),
    )

    d = {}
    for key, prop in meta_properties:
        tag = soup.find("meta", property=prop)
        try:
            d[key] = tag['content']
        except (TypeError, KeyError):
            # TypeError: tag is None (not found).
            # KeyError: the tag exists but carries no `content` attribute.
            d[key] = ""

    return d

def extract_text(soup,tag = 'all'):
    '''Return the text found in `soup`.

    With the default tag ('all') the whole document text is returned,
    stripped of surrounding whitespace. For any other tag name, the text
    of every matching element is concatenated, each piece preceded by a
    single space (so a non-empty result starts with a space).
    '''
    if tag == 'all':
        return soup.text.strip()

    pieces = [" " + element.getText() for element in soup.find_all(tag)]
    return "".join(pieces)


def sanitize_text(text):
    '''Return `text` without URL-only lines and without characters that
    cannot be represented in UTF-8.

    Every line that starts with `http://` or `https://` is removed together
    with the line break(s) that follow it. URLs in the middle of a line are
    left untouched.
    '''
    import re
    # In a raw string '/' needs no escaping and '\r', '\n' are single escapes;
    # doubling the backslashes would require literal backslashes in the text.
    sane_text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)
    # 'ignore' must be on the encode side: that is where unencodable code
    # points (e.g. lone surrogates) would otherwise raise.
    sane_text = sane_text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

    return sane_text


def extract_links(soup,domain=False):
    '''Collect the `href` value of every `<a>` element in `soup` that has
    one, in document order.

    The `domain` argument is accepted but not used.
    '''
    links = []
    for anchor in soup.find_all('a', href=True):
        links.append(anchor.get('href'))
    return links


def parse_html_file(file):
    '''Parse one HTML file with bs4 and return its extracted fields.

    The returned dict holds the metadata from `extract_meta` plus:
        'p_text'   - concatenated text of all `<p>` elements
        'links'    - list of `href` values of all `<a>` elements
        'all_text' - title and paragraph text joined by a newline

    TODO : Better html parsers are available
    '''
    soup = get_soup(file)
    d = extract_meta(soup)
    d['p_text'] = extract_text(soup,'p')
    # 'links' is set once, before 'all_text', which keeps the key order.
    # The anchor text formerly stored here was always overwritten by the
    # href list, so it is no longer computed.
    d['links'] = extract_links(soup)
    d['all_text'] = d['title'] + "\n" + d['p_text']
    return d




### FUNCTIONS FOR LANGUAGE DETECTION
def compute_lang_prob(t):
    '''Summarise a language-detection result.

    `t` is expected in the shape returned by `cld2.detect`: its third
    element is a sequence of per-language entries whose item [1] is the
    language code and item [2] the percentage (0-100) of the text
    attributed to that language.

    Returns a dict with:
        'top_l'      - code of the language with the highest share
                       (None when nothing was detected)
        'top_l_prob' - share of that language
        'en_prob'    - share of English
        'ru_prob'    - share of Russian
    (English and Russian are specific to this problem statement.)

    Shares are returned as fractions in [0, 1], so they can be compared
    directly with fractional thresholds such as 0.95. An empty or malformed
    result yields the all-zero default instead of raising.
    '''
    top_l = None
    top_l_prob = 0.0

    en_prob = 0.0
    ru_prob = 0.0

    try:
        entries = list(t[2])
    except (IndexError, KeyError, TypeError):
        # Empty tuple from a failed detection, or an unexpected shape.
        entries = []

    for entry in entries:
        try:
            code = entry[1]
            # cld2 reports an integer percentage; convert to a fraction.
            share = entry[2] / 100.0
        except (IndexError, KeyError, TypeError):
            continue  # skip a malformed entry, keep the rest

        if share > top_l_prob:
            top_l_prob = share
            top_l = code
        if code == 'en':
            en_prob = share
        elif code == 'ru':
            ru_prob = share

    return {'top_l' : top_l, 'top_l_prob' :top_l_prob ,'en_prob' : en_prob, 'ru_prob' : ru_prob}


def detect_langage(text,method = 'cld2'):
    '''Detect the language(s) of `text` and return the summary dict built
    by `compute_lang_prob`.

    Pass the 'method' parameter for different models.
    Valid params = [cld2,langdetect,polyglot]; only 'cld2' is fully
    implemented. Any other value, and any error raised by the detector,
    results in an empty detection (best effort: one bad document must not
    abort a whole batch).
    '''

    ## Drop anything that cannot be encoded as UTF-8 (e.g. lone surrogates).
    ## 'ignore' has to be on the encode side; a strict encode would raise.
    text = text.encode('utf-8', 'ignore').decode("utf-8", "ignore")

    try:
        if method == 'cld2':
            # Pass to cld2
            result = cld2.detect(text, bestEffort=False)
        elif method == 'langdetect':
            ### TODO : return values properly
            # NOTE(review): `detect_langs` is not imported in the visible
            # imports cell; if it is undefined, the NameError is caught below
            # and this branch yields an empty result.
            result = detect_langs(text)
        elif method == 'polyglot':
            ### TODO : implement polyglot
            result = tuple()
        else:
            result = tuple()
    except Exception as e:
        # Deliberately silent; an empty result maps to all-zero scores.
        result = tuple()

    # Now, compute the probabilities
    return compute_lang_prob(result)


def detect_distributed(file):
    '''Parse a single HTML file and attach its language scores.

    Runs `parse_html_file` on `file`, detects the language of the resulting
    'all_text' field and merges the detection dict into the parsed dict,
    which is returned. Works on one path at a time, so it can serve as the
    worker function of `Pool.map`.
    '''
    parsed = parse_html_file(file)
    lang_scores = detect_langage(parsed['all_text'])
    parsed.update(lang_scores)
    return parsed


def label_final_lang(df_prob,prob=0.95):
    '''Split the scored files into English and Russian lists.

    A file is listed as English when its 'en_prob' value is at least `prob`,
    and as Russian when its 'ru_prob' value is at least `prob`. Returns the
    pair (en_articles, ru_articles) of 'fname' values.
    TODO : Ideally this should scale to any language.
    '''
    def _files_at_or_above(score_column):
        # Keep the rows whose score reaches the cut-off, then take the names.
        selected = df_prob[df_prob[score_column] >= prob]
        return list(selected['fname'])

    en_articles = _files_at_or_above('en_prob')
    ru_articles = _files_at_or_above('ru_prob')
    return en_articles, ru_articles


def prepare_output(lang_code,article_list):
    '''Build the output record for one language: a dict with the language
    code under "lang_code" and the given list under "articles".
    TODO : Make sure lang_code is a valid ISO 639-1
    two-letter language code'''
    return {"lang_code": lang_code, "articles": article_list}


def languages(path,**kwargs):
    '''Find the EN and RU articles among the HTML files under `path`.

    Every file returned by `read_filelist(path)` is parsed and scored by
    `detect_distributed` in a multiprocessing `Pool`.

    Keyword arguments:
        full_path (bool): when truthy, articles are reported with the path
            returned by `read_filelist`; otherwise (and by default) only
            the file name is reported.
        threshold (float): minimum score for a file to be labelled.
            Defaults to 0.95.
        return_parsed_df (bool): when truthy, the DataFrame with all the
            parsed fields and scores is returned as well.

    Returns:
        `output`, or `(output, df_prob)` when `return_parsed_df` is truthy,
        where `output` is the pair (EN record, RU record) built by
        `prepare_output`.
    '''
    file_list = read_filelist(path)
    logger.info(f'Number of files : {len(file_list)}')

    with Pool() as pool:
        results = pool.map(detect_distributed, file_list)

    df_prob = pd.DataFrame(results)
    df_prob['html_dict'] = list(results)

    # With no input files the frame has no score columns; add them empty so
    # the labelling step returns empty lists instead of raising KeyError.
    for score_column in ('en_prob', 'ru_prob'):
        if score_column not in df_prob.columns:
            df_prob[score_column] = pd.Series(dtype=float)

    if 'full_path' not in kwargs:
        logger.warning('Returning only the file name')
    if kwargs.get('full_path', False):
        df_prob['fname'] = list(file_list)
    else:
        df_prob['fname'] = [os.path.basename(f) for f in file_list]

    # For now, extract the cases where model was > 95% sure
    if 'threshold' in kwargs:
        threshold = kwargs['threshold']
    else:
        logger.info("Setting default probabiltiy at 0.95")
        threshold = 0.95

    en_articles,ru_articles = label_final_lang(df_prob,prob=threshold)

    output = prepare_output("en",en_articles),prepare_output("ru",ru_articles)

    if 'return_parsed_df' not in kwargs:
        logger.info('Returning output dictionary only.')
        return output
    if kwargs['return_parsed_df']:
        return output,df_prob
    return output


In [4]:
def load_vectors(fname):
    '''Load a text file of word vectors.

    The first line of the file must hold two integers. Every following line
    is a token followed by its space-separated float components.

    Returns `(n, d, data)`: the two header integers and a dict mapping each
    token to its list of floats, in file order. Undecodable bytes are
    ignored while reading.
    '''
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return n,d,data


## Find out memory requirements for saving N number of vectors

In [13]:
%%time
# Load the pre-trained word vectors: n and d are the two integers from the
# file's header line, ft_dict maps each token to its vector.
n,d,ft_dict = load_vectors("assets/wiki-news-300d-1M.vec")

CPU times: user 1min 12s, sys: 44.3 s, total: 1min 56s
Wall time: 2min 5s


In [29]:
# Number of distinct tokens that ended up in the dictionary.
len(ft_dict)

999994

In [27]:
# Number of vectors kept in the reduced dictionary.
N_SELECTED_VECTORS = 100000

# Keep the first N_SELECTED_VECTORS entries, in the order they were loaded.
out = dict(itertools.islice(ft_dict.items(), N_SELECTED_VECTORS))

In [35]:
# Persist the reduced token -> vector dictionary as JSON.
with open('assets/wiki_news_ft_300D_selected.json', 'w') as outfile:
    json.dump(out, outfile)

### Takes around 253 MB to save 100K vectors

## Get only the English articles using the languages module

In [5]:
%%time
# Label every file under Data/TG_Data/. full_path=True keeps the paths as
# returned by the file search, so the files can be re-opened in later cells.
# The result is the (EN record, RU record) pair.
en_ru_articles = languages('Data/TG_Data/',full_path=True)

2019-12-02 23:07:57,815 - INFO - Defaulting to 'htm*'
2019-12-02 23:08:03,731 - INFO - Number of files : 1080604
2019-12-02 23:22:56,107 - INFO - Setting default probabiltiy at 0.95
2019-12-02 23:22:58,560 - INFO - Returning output dictionary only.


CPU times: user 1min 1s, sys: 2min 29s, total: 3min 31s
Wall time: 15min 9s


In [6]:
# Index 0 is the EN record; 'articles' holds its list of file paths.
en_articles = en_ru_articles[0]['articles']

In [9]:
# Re-parse only the English articles, using 8 worker processes.
# `results` is a list with one parsed dict per article, in input order.
with Pool(8) as pool:
    results = pool.map(parse_html_file,en_articles)

## Find the top N words

In [52]:
from progressbar import ProgressBar

In [56]:
pbar = ProgressBar(maxval=len(results)).start()

# Gather the combined title + paragraph text of every parsed article,
# advancing the progress bar once per article.
text = []
i = 0
for i, h in enumerate(results, start=1):
    text.append(h['all_text'])
    pbar.update(i)
pbar.finish()

100% |########################################################################|


In [74]:
# Save the list of article texts as JSON.
with open('Data/all_text.json','w') as f:
    json.dump(text,f)

In [70]:
%%time
# Run the project's text preprocessing over every article text in parallel.
# NOTE: this rebinds `results`, replacing the parsed-article dicts.
# NOTE(review): the recorded run of this cell ended in
# "IndexError: string index out of range"; presumably tp.preprocess fails
# on some inputs (possibly empty strings) - confirm and filter or fix.
with Pool(8) as pool:
    results = pool.map(tp.preprocess, text)   

IndexError: string index out of range