## Setup

### Install `requirements.txt`

In [1]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable) so the
# packages listed in requirements.txt are installed for that same interpreter.
!{sys.executable} -m pip install -r requirements.txt

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (from -r requirements.txt (line 116))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 66kB/s s eta 0:00:01
Collecting https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.0.0/es_core_news_sm-2.0.0.tar.gz (from -r requirements.txt (line 117))
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.0.0/es_core_news_sm-2.0.0.tar.gz (36.7MB)
[K    100% |████████████████████████████████| 36.7MB 68kB/s s eta 0:00:01   26% |████████▌                       | 9.7MB 50.6MB/s eta 0:00:01




Building wheels for collected packages: en-core-web-sm, es-core-news-sm
  Running setup.py bdist_wheel for en-core-web-sm ... [?25ldone
[?25h  Stored in directory: /home/stefan/.cache/pip/wheels/54/7c/d8/f86364af8fbba7258e14adae115f18dd2c91552406edc3fdaa
  Running setup.py bdist_wheel for es-core-news-sm ... [?25ldone
[?25h  Stored in directory: /home/stefan/.cache/pip/wheels/9e/28/c4/df4980946eb229379ed26d349566e427fa029dbf03546ccb94
Successfully built en-core-web-sm es-core-news-sm
[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Run spaCy's `validate` command with the kernel's interpreter to list the
# installed models and links (relies on `sys` imported in the install cell).
!{sys.executable} -m spacy validate


[93m    Installed models (spaCy v2.0.11)[0m
    /home/stefan/.virtualenvs/word2vec_translate/lib/python3.6/site-packages/spacy

    TYPE        NAME                  MODEL                 VERSION                                   
    package     es-core-news-sm       es_core_news_sm       [38;5;2m2.0.0[0m    [38;5;2m✔[0m      
    package     en-core-web-sm        en_core_web_sm        [38;5;2m2.0.0[0m    [38;5;2m✔[0m      
    link        en_core_web_sm        en_core_web_sm        [38;5;2m2.0.0[0m    [38;5;2m✔[0m      
    link        en                    en_core_web_sm        [38;5;2m2.0.0[0m    [38;5;2m✔[0m      
    link        es                    es_core_news_sm       [38;5;2m2.0.0[0m    [38;5;2m✔[0m      
    link        es_core_news_sm       es_core_news_sm       [38;5;2m2.0.0[0m    [38;5;2m✔[0m      


### Imports

In [3]:
import os, tarfile, sys
from pathlib import Path
from time import time
from pprint import pprint
from collections import Counter

import numpy as np
from numpy.random import choice
import pandas as pd

from IPython.display import SVG, display

import spacy

from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, Phraser

### Settings

In [4]:
# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('float_format', '{:,.2f}'.format)
# Seed NumPy's global random number generator.
np.random.seed(42)

In [7]:
# Language codes processed by this notebook, in processing order.
LANGUAGES = ['en', 'es']
# Display name for each language code (used as column labels in the stats table).
language_dict = {code: name for code, name in zip(LANGUAGES, ('English', 'Spanish'))}

In [8]:
def format_time(t):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        str: The duration as ``HH:MM:SS``. Hours are not capped; they simply
        use more than two digits for durations of 100 hours or longer.
    """
    # Round to whole seconds *before* splitting into fields. Rounding each
    # field at format time could emit an invalid '60' (59.6 s -> '00:00:60').
    total_seconds = int(round(t))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

### Extract Data

In [None]:
# Unpack the bundled archive only when the `data` directory is missing, so
# re-running the notebook does not extract it again.
path = Path('data')
if not path.exists():
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it with an archive from a trusted source.
    with tarfile.open('data.tar.gz', 'r:gz') as tar:
        tar.extractall()

## Preprocess Data

### TED 2013 English & Spanish

In [9]:
# Sub-directory name used for this corpus under both `data/` and `vocab/`.
SOURCE = 'TED'
# Base name of the corpus files; the language code is appended as the file extension.
FILE_NAME = 'TED2013'

Data source: http://opus.nlpl.eu/TED2013.php

In [23]:
# Preview the beginning of the raw English corpus file.
filename = Path('data', 'TED', 'TED2013.en')
# Read only the first 500 characters instead of loading the whole corpus
# into memory just to slice it.
with filename.open() as f:
    print(f.read(500))

http://www.ted.com/talks/stephen_palumbi_following_the_mercury_trail.html
There's a tight and surprising link between the ocean's health and ours, says marine biologist Stephen Palumbi. He shows how toxins at the bottom of the ocean food chain find their way into our bodies, with a shocking story of toxic contamination from a Japanese fish market. His work points a way forward for saving the oceans' health -- and humanity's.
fish,health,mission blue,oceans,science
899
Stephen Palumbi: Following 


### Tokenize & Clean Sentences

The downstream models expect the data as one sentence per line. We use `spaCy` to tokenize each sentence, then drop punctuation, symbols and other non-word tokens and lower-case the remaining tokens.

In [10]:
def read_sentences(path, min_sent_length=3):
    """Read a one-sentence-per-line file, dropping short lines and URL lines.

    Args:
        path: Path of the text file to read.
        min_sent_length: Minimum number of whitespace-separated tokens a line
            needs in order to be kept.

    Returns:
        tuple: ``(sentences, stats)`` where ``sentences`` is the list of kept
        lines (stripped of surrounding whitespace) and ``stats`` is a
        ``pd.Series`` with the entries 'Sentences' (lines kept), '# Words'
        (tokens in kept lines) and 'Skipped' (lines dropped).
    """
    # Lines starting with a URL are metadata in the TED files, not sentences.
    # The previous prefix 'http:///' (three slashes) could never match.
    url_prefixes = ('http://', 'https://')
    sentences = []
    skipped, word_count = 0, 0

    with open(path) as source:
        for sentence in source:
            n_words = len(sentence.split())
            if n_words < min_sent_length or sentence.startswith(url_prefixes):
                skipped += 1
            else:
                word_count += n_words
                sentences.append(sentence.strip())

    stats = pd.Series({'Sentences': len(sentences),
                       '# Words': word_count,
                       'Skipped': skipped})
    return sentences, stats

In [11]:
def clean_sentences(sents, nlp, path, lang):
    """Tokenize sentences with `nlp` and persist the cleaned corpus and vocabulary.

    Every sentence is run through ``nlp.pipe``. Tokens whose part-of-speech
    tag is 'PUNCT', 'SYM' or 'X' are dropped; the remaining tokens are
    lower-cased and written, one sentence per line, to
    ``path / 'ngrams_1.txt'``. Token counts and the full (sentence id, token,
    POS tag) table are stored in ``vocab.h5`` in the parent directory of
    ``path`` under the keys ``<lang>/vocab`` and ``<lang>/tokens``.

    Progress (every 20,000 sentences) and the total duration are printed.

    Args:
        sents: Iterable of raw sentence strings.
        nlp: Loaded spaCy pipeline (the notebook passes ``spacy.load(...)``).
        path: ``pathlib.Path`` of the language-specific output directory;
            it must already exist.
        lang: Language code used as key prefix inside the HDF5 store.
    """
    exclude = ['PUNCT', 'SYM', 'X']
    start = time()
    vocab = Counter()
    tokens = []
    # Open with 'w', not 'a': the HDF5 entries below are overwritten on every
    # call, so appending would duplicate the corpus on a re-run and leave the
    # text file out of sync with the stored vocabulary.
    # The file name needs no formatting; the former `.format(language)` had no
    # placeholder and only worked because a global `language` happened to exist.
    with open(path / 'ngrams_1.txt', 'w') as f:
        for i, sent in enumerate(nlp.pipe(sents)):
            if i % 20000 == 0 and i > 0:
                print(i, end=' ')
            # Keep every token (including excluded ones) for the POS table.
            tokens.extend([i, w.text, w.pos_] for w in sent)
            clean_sentence = [w.text.lower() for w in sent if w.pos_ not in exclude]
            vocab.update(clean_sentence)
            f.write(' '.join(clean_sentence) + '\n')

    vocab = pd.Series(vocab).sort_values(ascending=False).to_frame('count')
    with pd.HDFStore(path.parent / 'vocab.h5') as store:
        store.put('/'.join([lang, 'vocab']), vocab)
        store.put('/'.join([lang, 'tokens']),
                  pd.DataFrame(tokens, columns=['sent_id', 'token', 'pos']))
    duration = time() - start
    print('\n\tDuration: ', format_time(duration))

In [13]:
# Raw sentences per language code, plus a summary table with one column per language.
sentences = {}
stats = pd.DataFrame()

for language in LANGUAGES:
    # Locate and read the raw corpus file for this language.
    source_path = Path('data', SOURCE) / '{}.{}'.format(FILE_NAME, language)
    sentences[language], stats[language_dict[language]] = read_sentences(source_path)

    print(language, end=': ')

    # Make sure the language-specific output directory exists.
    target_path = Path('vocab') / SOURCE / language
    if not target_path.exists():
        target_path.mkdir(parents=True, exist_ok=True)

    # Tokenize, clean and persist the sentences with the matching spaCy model.
    nlp = spacy.load(language)
    clean_sentences(sentences[language], nlp, target_path, language)

en: 20000 40000 60000 80000 100000 120000 140000 
	Duration:  00:09:27
es: 20000 40000 60000 80000 100000 120000 140000 
	Duration:  00:08:02


### Corpus Summary Stats

In [14]:
# Show the integer corpus statistics with thousands separators.
stats.applymap('{:,d}'.format)

Unnamed: 0,English,Spanish
# Words,2640928,2548942
Sentences,152729,151850
Skipped,5166,6045


In [16]:
# Persist the corpus summary table under the key 'stats' in the same vocab.h5
# file that clean_sentences writes to.
with pd.HDFStore(Path('vocab', SOURCE, 'vocab.h5')) as store:
    store.put('stats', stats)

### Inspect Result

In [17]:
# First three English sentences kept by read_sentences.
sentences['en'][:3]

["There's a tight and surprising link between the ocean's health and ours, says marine biologist Stephen Palumbi. He shows how toxins at the bottom of the ocean food chain find their way into our bodies, with a shocking story of toxic contamination from a Japanese fish market. His work points a way forward for saving the oceans' health -- and humanity's.",
 'Stephen Palumbi: Following the mercury trail',
 'It can be a very complicated thing, the ocean.']

In [18]:
# First three Spanish sentences kept by read_sentences.
sentences['es'][:3] 

['Existe una estrecha y sorprendente relación entre nuestra salud y la salud del océano, dice el biologo marino Stephen Palumbi. Nos muestra, através de una impactante historia acerca de la contaminación tóxica en el mercado pesquero japonés, como las toxinas de la cadena alimenticia del fondo oceánico llegan a nuestro cuerpo.',
 'Stephen Palumbi: Siguiendo el camino del mercurio.',
 'El océano puede ser una cosa muy complicada.']

### Create n-grams

In [31]:
def create_ngrams(language, max_length=3):
    """Detect phrases with gensim and write phrase-transformed corpora.

    For every n from 2 to `max_length`, the file ``ngrams_{n-1}.txt`` in
    ``vocab/<SOURCE>/<language>`` is read, a gensim ``Phrases`` model is
    fitted on it, the exported phrases and their scores are collected, and
    the corpus transformed by the corresponding ``Phraser`` is written to
    ``ngrams_{n}.txt``.

    The collected phrases (columns: phrase, score, length, ngram), sorted by
    descending score, are stored under ``<language>/ngrams`` in ``vocab.h5``
    in the parent directory. Duration and phrase counts are printed.

    Args:
        language: Language code naming the sub-directory and the HDF5 key prefix.
        max_length: Largest n for which an ``ngrams_{n}.txt`` file is produced.
    """
    lang_dir = Path('vocab', SOURCE, language)
    n_grams = pd.DataFrame()
    started = time()

    for n in range(2, max_length + 1):
        print(n, end=' ')

        in_file = lang_dir / 'ngrams_{}.txt'.format(n - 1)
        out_file = lang_dir / 'ngrams_{}.txt'.format(n)

        corpus = LineSentence(str(in_file))
        phrase_model = Phrases(corpus, threshold=100, min_count=10)

        # export_phrases yields (bytes phrase, score) pairs; keying a dict by
        # the decoded phrase keeps one score per distinct phrase.
        scores = {}
        for phrase, score in phrase_model.export_phrases(corpus):
            scores[phrase.decode('utf-8')] = score

        found = (pd.Series(scores)
                 .to_frame('score')
                 .reset_index()
                 .rename(columns={'index': 'phrase'})
                 .assign(length=n))
        n_grams = pd.concat([n_grams, found])

        # Apply the fitted model to the corpus and write the result as the
        # input file for the next pass.
        transformed = Phraser(phrase_model)[corpus]
        with open(out_file, 'w') as f:
            for tokens in transformed:
                f.write(' '.join(tokens) + '\n')

    n_grams = n_grams.sort_values('score', ascending=False)
    # Keep a space-separated display form and an underscore-joined token form.
    n_grams['phrase'] = n_grams['phrase'].str.replace('_', ' ')
    n_grams['ngram'] = n_grams['phrase'].str.replace(' ', '_')

    with pd.HDFStore(Path(lang_dir.parent / 'vocab.h5')) as store:
        store.put('/'.join([language, 'ngrams']), n_grams)

    elapsed = time() - started
    print('\n\tDuration: ', format_time(elapsed))
    print('\tngrams: {:,d}\n'.format(len(n_grams)))
    print(n_grams.groupby('length').size())

In [32]:
# Build the n-gram files and phrase table for every configured language,
# printing the language code ahead of each run's progress output.
for language in LANGUAGES:
    print('\n', language, end=' ')
    create_ngrams(language)


 en 2 3 
	Duration:  00:01:13
	ngrams: 1,016

length
2    906
3    110
dtype: int64

 es 2 3 
	Duration:  00:00:41
	ngrams: 508

length
2    462
3     46
dtype: int64
