# Preprocess

In [1]:
%matplotlib inline

import os
import random
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from utils import monitor
from itertools import chain
from string import ascii_lowercase
from sklearn.datasets.lfw import Bunch
from more_itertools import unique_everseen, take
from langdetect import detect as detect_lang, detect_langs

## Args

In [2]:
# Attribute-style container that collects every tunable setting of this notebook.
args = Bunch()

## Main `df`

In [3]:
# Start from an empty frame with a single 'desc' column; rows are added further down.
df = pd.DataFrame(columns=['desc'])
len(df)

0

## Docstring corpus

### Read

In [4]:
# Directory holding the parallel corpus files, relative to the working directory.
_data_base_path = './code-docstring-corpus/V2/parallel'
# Description files; judging by the file names these cover top-level functions
# and methods respectively -- TODO confirm against the corpus README.
args.toplevelfuncs_desc_path = os.path.join(_data_base_path, 'parallel_desc')
args.methods_desc_path = os.path.join(_data_base_path, 'parallel_methods_desc')
# The files are read in this order.
args.desc_paths = [args.toplevelfuncs_desc_path, args.methods_desc_path]
# Upper bound on the number of unique descriptions kept when reading.
args.max_descs = 1000000

In [5]:
def read_desc_gen(file_path):
    """Lazily yield one description per line of ``file_path``.

    Every line is stripped of surrounding whitespace and then loses its first
    and last character (presumably the quotes wrapping each description in
    the corpus files -- TODO confirm). Bytes that cannot be decoded are
    dropped silently because the file is opened with ``errors='ignore'``.
    """
    with open(file_path, 'r', errors='ignore') as corpus_file:
        yield from (raw_line.strip()[1:-1] for raw_line in corpus_file)
            
            
def read_all():
    """Collect the unique descriptions from every file in ``args.desc_paths``.

    The files are streamed one after another in list order, duplicates are
    dropped while keeping first-seen order, and at most ``args.max_descs``
    descriptions are returned as a list.
    """
    per_file_streams = map(read_desc_gen, args.desc_paths)
    merged = chain.from_iterable(per_file_streams)
    deduplicated = unique_everseen(merged)
    return list(take(args.max_descs, deduplicated))

In [6]:
# Load the de-duplicated descriptions (at most args.max_descs) and show how many were read.
descs = read_all()
len(descs)

357220

### Filtering

In [7]:
# Inclusive lower/upper bounds on the description length, in characters.
args.len_lb, args.len_up = 25, 200
# Progress metrics are emitted every `log_step` descriptions.
args.log_step = 5000
# Minimum fraction of characters that must be ASCII letters.
args.chars_threshold = 0.8

In [8]:
def is_english_detect(text):
    """Return True when ``detect_lang`` classifies ``text`` as English.

    Any ``Exception`` raised by the detector is treated as "not English" and
    yields False.
    """
    try:
        return detect_lang(text) == 'en'
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so the long filtering loop can be interrupted.
        return False


def is_chinise(text):
    """Tell whether ``text`` has at least one character whose code point lies
    in the inclusive range U+4E00..U+9FFF (the CJK Unified Ideographs block).
    """
    return any(0x4e00 <= ord(char) <= 0x9fff for char in text)


def is_ascii(text):
    """Check that every character of ``text`` has a code point in 32..127.

    Control characters (code points below 32) and anything above 127 make
    the check fail; an empty string passes.
    """
    for char in text:
        code_point = ord(char)
        if code_point <= 31 or code_point >= 128:
            return False
    return True

    
def is_plausible_chars(text):
    """Decide whether ``text`` consists mostly of plain ASCII letters.

    Returns True when the share of characters that are ASCII letters
    (counted case-insensitively) is at least ``args.chars_threshold`` and
    ``is_ascii(text)`` holds. Empty text is never plausible.
    """
    if not text:
        # Guard against ZeroDivisionError in the ratio below.
        return False
    letter_count = sum(char in ascii_lowercase for char in text.lower())
    return letter_count / len(text) >= args.chars_threshold and is_ascii(text)


def is_sensible(text):
    """Accept ``text`` only if it is detected as English and its characters
    look plausible; the character check is skipped when detection fails.
    """
    if not is_english_detect(text):
        return False
    return is_plausible_chars(text)


def is_valid(desc):
    """Accept ``desc`` when its length is within ``[args.len_lb, args.len_up]``
    (both inclusive) and it passes ``is_sensible``.
    """
    size = len(desc)
    if not (args.len_lb <= size <= args.len_up):
        return False
    return is_sensible(desc)

In [None]:
@monitor('[1.1.1] Preprocessing english sorting')
def english_sorting(descs, *, exp):
    """Return the descriptions from ``descs`` that pass ``is_valid``.

    Progress is reported through ``exp.metric``: every ``args.log_step``
    items, and once more on the last item, the current index ('step') and
    the number of descriptions kept so far ('current len') are logged.

    ``exp`` is keyword-only; presumably the ``monitor`` decorator injects it,
    since the notebook calls this function without it -- TODO confirm.
    """
    kept = []
    final_index = len(descs) - 1
    for index, candidate in enumerate(tqdm(descs)):
        if is_valid(candidate):
            kept.append(candidate)
        at_log_point = index % args.log_step == 0 or index == final_index
        if at_log_point:
            exp.metric('step', index)
            exp.metric('current len', len(kept))
    return kept

In [None]:
# NOTE: this overwrites `descs` with the filtered list, so re-running the cell
# filters already-filtered data; re-run the read cell first for a fresh start.
# `exp` is not passed here -- presumably `monitor` supplies it (TODO confirm).
%time descs = english_sorting(descs)
len(descs)

  3%|▎         | 10337/357220 [01:29<50:03, 115.48it/s]

### Analysis

In [None]:
# Distribution of description lengths (in characters) after filtering.
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour
# of `histplot`/`displot`; kept here to match the installed environment.
sns.distplot(list(map(len, descs)));

In [None]:
# Eyeball up to 10 random descriptions that survived filtering. The sample
# size is capped at len(descs) so the cell does not raise ValueError when
# fewer than 10 remain.
random.sample(descs, min(10, len(descs)))

### Add

In [None]:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the equivalent that works across pandas versions.
# NOTE: re-running this cell adds the same rows to `df` again.
df = pd.concat([df, pd.DataFrame({'desc': descs})], ignore_index=True)
df.head()

## Stepik data

* **TODO:** Ask Timophey about this data.

In [None]:
# Placeholder cell: loading the Stepik data is not implemented yet.
...

## Save

In [None]:
# Output HDF5 file for the collected descriptions, relative to the working directory.
args.save_path = 'data.hdf5'

In [None]:
%time df.to_hdf(args.save_path, 'df', mode='w', format='t', complevel=9)

In [None]:
# Report the on-disk size of the saved file.
!du -sh $args.save_path