# Preprocess

In [1]:
%matplotlib inline

import random
import seaborn as sns

from itertools import chain
from tqdm import tqdm_notebook
from hyperdash import Experiment
from string import ascii_lowercase
from langdetect import detect as detect_lang, detect_langs

## Args

In [2]:
# Description files to load. The paths are relative, so they resolve against
# the notebook's working directory.
TOPLEVELFUNCS_DESC_PATH = 'code-docstring-corpus/V2/parallel/parallel_desc'
METHODS_DESC_PATH = 'code-docstring-corpus/V2/parallel/parallel_methods_desc'
# Every file listed here is read by read_all().
DESC_PATHS = [
    TOPLEVELFUNCS_DESC_PATH,
    METHODS_DESC_PATH
]
# Upper bound on how many descriptions are kept after reading (applied as a slice).
MAX_READ = 1000000

## Read

In [3]:
def read_desc_gen(file_path):
    """Lazily yield the descriptions stored one per line in ``file_path``.

    Each line is stripped of surrounding whitespace and then loses its first
    and last characters (presumably the quotes wrapping each description in
    the corpus -- the characters themselves are not checked).

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding; bytes that cannot be decoded are dropped
    silently (``errors='ignore'``).

    :param file_path: path of the text file to read.
    :return: generator of description strings, in file order.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            yield line.strip()[1:-1]
            
            
def read_all():
    """Read every file in ``DESC_PATHS`` and return the distinct descriptions.

    Duplicates are dropped while keeping first-seen order (files in
    ``DESC_PATHS`` order, lines in file order). A ``set`` would also
    deduplicate, but its iteration order for strings can differ between
    interpreter runs, which would make any later slice of the result
    non-reproducible.

    :return: list of unique description strings.
    """
    all_gen = (read_desc_gen(desc_path) for desc_path in DESC_PATHS)
    # dict keys are unique and keep insertion order, giving an ordered dedup.
    return list(dict.fromkeys(chain.from_iterable(all_gen)))

In [4]:
# Load the descriptions, capped at MAX_READ items (a slice of the list that
# read_all() returns); %time reports how long the read takes.
%time descs = read_all()[:MAX_READ]
len(descs)

CPU times: user 688 ms, sys: 92 ms, total: 780 ms
Wall time: 784 ms


357220

## Filtering

In [5]:
# Inclusive bounds on description length, in characters (used by is_valid).
LEN_LB, LEN_UP = 25, 500
# english_sorting reports its metrics every LOG_STEP items.
LOG_STEP = 10000
# Minimum fraction of characters that must be ASCII letters
# (used by is_plausible_chars).
CHARS_TRESHOLD = 0.8

In [6]:
def is_english_detect(text):
    """Return True when ``detect_lang`` classifies ``text`` as English.

    Detection is best-effort: if the detector raises (for instance on input
    it cannot classify), the text is treated as "not English".

    NOTE(review): langdetect documents its results as non-deterministic
    unless ``DetectorFactory.seed`` is set -- consider seeding it if this
    filter has to be reproducible.

    :param text: string to classify.
    :return: True if the detected language code is ``'en'``, else False.
    """
    try:
        return detect_lang(text) == 'en'
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so a long filtering run can still be interrupted.
        return False


def is_chinise(text):
    """Tell whether ``text`` has at least one code point in U+4E00..U+9FFF.

    :param text: string to inspect.
    :return: True as soon as one character falls in that range, else False
        (including for empty text).
    """
    return any(0x4e00 <= ord(ch) <= 0x9fff for ch in text)


def is_ascii(text):
    """Check that every character's code point lies strictly between 31 and 128.

    Control characters (tab, newline, ...) and anything from code point 128
    upwards are therefore rejected; empty text is accepted.

    :param text: string to inspect.
    :return: True if all characters are in the accepted range, else False.
    """
    for ch in text:
        code = ord(ch)
        if code <= 31 or code >= 128:
            return False
    return True

    
def is_plausible_chars(text):
    """Return True when ``text`` is mostly plain ASCII letters.

    The text must pass ``is_ascii`` and the share of its characters that are
    letters a-z (case-insensitive) must be at least ``CHARS_TRESHOLD``.

    Empty text is reported as not plausible instead of raising
    ``ZeroDivisionError``.

    :param text: string to inspect.
    :return: True if both conditions hold, else False.
    """
    if not text:
        return False
    # ASCII check first: it is the cheaper rejection, and for ASCII-only text
    # lower() cannot change the length, so the ratio below is well defined.
    if not is_ascii(text):
        return False
    n_letters = sum(c in ascii_lowercase for c in text.lower())
    return n_letters / len(text) >= CHARS_TRESHOLD


def is_sensible(text):
    """Return True when ``text`` has plausible characters and is detected as English.

    The character test runs first so that the (presumably far more expensive)
    language detector is only invoked for texts that could still pass; the
    result is the same conjunction of the two checks.

    :param text: string to inspect.
    :return: True if both checks pass, else False.
    """
    # Empty text can never be sensible; returning early also keeps the
    # character-ratio check from being called on a zero-length string.
    if not text:
        return False
    return is_plausible_chars(text) and is_english_detect(text)


def is_valid(desc):
    """Accept a description whose length is within bounds and that is sensible.

    The length must lie in the inclusive range ``LEN_LB``..``LEN_UP``; only
    then is ``is_sensible`` consulted.

    :param desc: description string.
    :return: False when the length is out of range, otherwise the result of
        ``is_sensible(desc)``.
    """
    size = len(desc)
    if size < LEN_LB or size > LEN_UP:
        return False
    return is_sensible(desc)

In [None]:
def english_sorting(descs):
    """Filter ``descs`` down to the entries accepted by ``is_valid``.

    Despite the name nothing is sorted: input order is preserved. Progress is
    shown through ``tqdm_notebook`` and, every ``LOG_STEP`` items plus on the
    last item, the current index and number of kept descriptions are sent to
    a Hyperdash ``Experiment`` as metrics.

    The experiment is closed in a ``finally`` block so it is ended even when
    the loop raises or the run is interrupted.

    :param descs: sized sequence of description strings.
    :return: new list with the descriptions that passed ``is_valid``.
    """
    exp = Experiment('1.1.1: Preprocessing english sorting', capture_io=False)
    new_descs = []
    try:
        for i, desc in enumerate(tqdm_notebook(descs)):
            if is_valid(desc):
                new_descs.append(desc)
            # Report periodically, and always for the final item so the last
            # metric reflects the finished result.
            if i % LOG_STEP == 0 or i == len(descs) - 1:
                exp.metric('step', i)
                exp.metric('current len', len(new_descs))
    finally:
        exp.end()
    return new_descs

In [None]:
# Rebinds `descs` to the filtered list, so re-running this cell filters the
# already-filtered data rather than the raw read.
%time descs = english_sorting(descs)
len(descs)

| step:   0.000000 |
| current len:   0.000000 |


## Analysis

In [None]:
# Distribution of description lengths (in characters) for the current `descs`.
# NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of
# `histplot`/`displot` -- confirm against the installed version.
sns.distplot([len(desc) for desc in descs]);

In [None]:
# Show ten random descriptions. No seed is set, so the sample differs on
# every run; random.sample raises ValueError if fewer than ten items remain.
random.sample(descs, 10)

## Save

In [None]:
# Persist `descs` with IPython's %store magic so another notebook or kernel
# can restore it with `%store -r descs`.
%store descs