### Munge

Stack Exchange, Inc. is very generous and publishes data dumps of the posts on
Stack Overflow and its sister sites (find the link below). There's a good
spectrum of dataset sizes across their distinct subdomains, and if you're
fearless, you can do the whole shebang, probably more than a TB uncompressed.

I settled on a few tens of GB, but that's a bit much to sit comfortably in my box's
memory (maybe yours, too), so I set up a streaming XML parser to do the preprocessing
and write the results to a few TSV files.

Applications to follow....

In [1]:
from xml.etree.ElementTree import iterparse
from lxml import html

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

import string
import logging
import simplejson
import spacy

# Route INFO-and-above log records to a file; each record is written as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='../log/w2v_etc.log')

# Logger used by the functions below for progress and warning messages.
log = logging.getLogger('w2v_etc.log')
# spaCy pipeline, loaded once here and reused for every paragraph that is
# tokenised further down.
nlp = spacy.load('en')

In [2]:
"""Streaming xml parser filters on date and writes to tsv 
"""

# Year bounds used to filter posts by creation date.
START_YEAR = 2009
STOP_YEAR = 2017

# Input XML dump and the folder that receives the per-year TSV files.
SOURCE_PATH = '../datasets/stackoverflow_posts/stackoverflow_posts.xml'
SECONDARY_FOLDER = '../datasets/stackoverflow_posts/'

# One bucket per year in [START_YEAR, STOP_YEAR); filepaths[i] is the TSV
# that belongs to years[i].
years = [START_YEAR + offset for offset in range(STOP_YEAR - START_YEAR)]
filepaths = [''.join((SECONDARY_FOLDER, str(year), '.tsv')) for year in years]


def _paragraph_generator(elem):
    
    def _assess_date_get_body():
        date = elem.get('CreationDate', '')
        body = elem.get('Body', '')
        if not date or not body or date < str(START_YEAR) or date > str(STOP_YEAR):
            return '', date
        else:
            return body, date
    
    body, date = _assess_date_get_body()
    if body:

        def _gather_context_tag(tag, prefix):
            x = elem.get(tag, '')
            if x: return prefix + '_' + x
            else: return ''

        def _gather_context_tags():
            user = _gather_context_tag('OwnerUserId', 'USER')
            post = _gather_context_tag('Id', 'POST')
            tags = [user, post]
            return [t for t in tags if t]

        def _format_stackoverflow_tags(tags):
            tree = html.fromstring(tags)
            return ['TAG_' + t.tag for t in tree.xpath('//*')
                    if t.tag and not t.tag in ['html', 'body']]

        def _gather_stackoverflow_tags():
            tags = elem.get('Tags', '')
            if tags: return _format_stackoverflow_tags(tags)
            else: return []

        def _gather_doc2vec_tags():
            return _gather_stackoverflow_tags() + _gather_context_tags()

        tags = _gather_doc2vec_tags()
        
        def _process_token(token):
            """spaCy Token -> str"""
            if token.lemma_: return token.lemma_.lower()
            # elif token.like_num: return 'NUMBER'
            else: return token.text.lower()
        
        def _prepare_words(paragraph):
            # keep words, numbers, punctuation; lemmatize;
            # join named-entities; lower case
            doc = nlp(paragraph)
            ents = doc.ents
            doc = [_process_token(token) for token in doc
                   if token.is_alpha or token.is_punct or token.like_num]
            doc.extend([str(entity) for entity in ents if len(entity) > 1])
            return doc
        
        def _gather_paragraphs():
            body_tree = html.fromstring(body)
            return [p.text for p in body_tree.xpath('//p') if p.text]
        
        paragraphs = _gather_paragraphs()
        
        for p in paragraphs:
            words = _prepare_words(p)
            yield date, words, tags


def _which_year_index(date):
    date = str(date)
    i = -1
    for y in years:
        i += 1
        if str(y) < date < str(y + 1): return i
    return None

            
def _write_paragraphs(paragraphs, files):
    n = 0
    for date, words, tags in paragraphs:
        n += 1
        file = files[_which_year_index(date)]
        file.write('\t'.join(words))
        file.write('\t\t')
        file.write('\t'.join(tags))
        file.write('\n')
    return n


def write_secondaries():
    """Stream the posts XML and write each paragraph to its per-year TSV.

    Reads ``SOURCE_PATH`` incrementally with ``iterparse``, turns every
    completed element into paragraphs with ``_paragraph_generator`` and
    writes them with ``_write_paragraphs`` to the files named in
    ``filepaths`` (opened in ``'w+'`` mode, so existing content is
    truncated).

    Progress is logged every 100000 elements and once more when the run
    ends, whether it finished or failed.  An element whose processing raises
    is logged as a warning and skipped, so one bad post does not stop the
    export; errors from opening the files or from the XML parser itself
    still propagate.
    """
    # Local import: the file's import block does not bring in contextlib.
    from contextlib import ExitStack

    n_elem = 0
    n_para = 0
    try:
        # ExitStack closes every file that was successfully opened, even
        # when a later open() or the parse loop raises.
        with ExitStack() as stack:
            files = [stack.enter_context(open(filepath, 'w+'))
                     for filepath in filepaths]

            # 'start' events are requested only to get hold of the root
            # element, which is the first event delivered.
            context = iterparse(SOURCE_PATH, events=('start', 'end'))
            _, root = next(context)

            for event, elem in context:
                if event != 'end':
                    continue
                n_elem += 1
                if n_elem % 100000 == 0:
                    msg = "parsed {} paragraphs, read {} elements".format(n_para, n_elem)
                    log.info(msg)
                try:
                    paragraphs = _paragraph_generator(elem)
                    n_para += _write_paragraphs(paragraphs, files)
                except Exception as e:
                    # Best effort: record the failure and move on.
                    log.warning((type(e), e))
                finally:
                    # Clearing the element alone leaves an empty node
                    # attached to the root; drop the root's children too so
                    # memory stays flat over the whole dump.
                    elem.clear()
                    root.clear()
    finally:
        msg = "parsed {} paragraphs, read {} elements".format(n_para, n_elem)
        log.info(msg)

        
# Run the export when this cell executes; the output lands in the per-year
# TSV files listed in `filepaths`.
write_secondaries()

In [3]:
# TSV files read by StackOverflowPostIterator.
DATA_PATHS = ['../datasets/stackoverflow_posts/2013.tsv']
# Destination of the model saved by train_and_save().
# NOTE(review): the file name looks like
# <year>_<size>_<negative>_<window>_<min_count> -- confirm.
SAVE_PATH = '../saved/2013_180_20_8_5.d2v'

class StackOverflowPostIterator():
    """Re-iterable stream of ``LabeledSentence`` objects read from TSV files.

    Each input line holds tab-separated words, a double tab, then
    tab-separated tags.  A fresh pass over the files starts every time the
    object is iterated.

    Args:
        data_paths: iterable of TSV paths to read.  When omitted, the
            module-level ``DATA_PATHS`` is used and is looked up at iteration
            time.
    """

    def __init__(self, data_paths=None):
        self.data_paths = data_paths

    def __iter__(self):
        paths = DATA_PATHS if self.data_paths is None else self.data_paths
        for file_path in paths:
            with open(file_path, 'r') as file:
                for line in file:
                    # Strip only the line terminator; slicing off the last
                    # character would eat real data on a final line that has
                    # no trailing newline.
                    line = line.rstrip('\n')
                    words, _, tags = line.partition('\t\t')
                    # ''.split('\t') is [''], so drop empty entries to avoid
                    # feeding an empty word or an empty label to the model.
                    words = [w for w in words.split('\t') if w]
                    tags = [t for t in tags.split('\t') if t]
                    yield LabeledSentence(words=words, tags=tags)

        
def train_and_save():
    """Train a Doc2Vec model on the post stream, save it and return it.

    The corpus is a fresh ``StackOverflowPostIterator``; the trained model
    is written to ``SAVE_PATH``.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    hyperparameters = dict(
        size=180, negative=20, window=8, min_count=5,
        iter=1, workers=5, alpha=0.1, sample=1e-5,
    )
    trained = Doc2Vec(StackOverflowPostIterator(), **hyperparameters)
    trained.save(SAVE_PATH)
    return trained


# Train from scratch and keep the model for the cells below.
model = train_and_save()
# To reuse the model previously saved at SAVE_PATH instead of retraining:
# model = Doc2Vec.load(SAVE_PATH)


In [18]:
# word similarities

# Print the nearest words for a few probe terms, separated by blank lines.
for position, query in enumerate(('agile', 'node', 'python')):
    if position:
        print('')
    print(model.most_similar(query))

[('scrum', 0.6519924402236938), ('msf', 0.5370078086853027), ('thesis', 0.5179572105407715), ('senior', 0.5103329420089722), ('evolution', 0.5098087787628174), ('kanban', 0.5042551159858704), ('mature', 0.4935457706451416), ('cto', 0.4915340542793274), ('sprints', 0.48361465334892273), ('wrox', 0.4827408790588379)]

[('nodes', 0.7072689533233643), ('leaf', 0.5357713103294373), ('nodeset', 0.5138298273086548), ('newnode', 0.49955424666404724), ('cluster', 0.49572986364364624), ('childnodes', 0.48936402797698975), ('45892', 0.478634774684906), ('subnodes', 0.4744775593280792), ('child', 0.4739358425140381), ('subgraph', 0.47250238060951233)]

[('ironpython', 0.5724384784698486), ('lua', 0.5550795793533325), ('distutils', 0.543355405330658), ('java', 0.5419704914093018), ('bash', 0.5343711376190186), ('perl', 0.5284351706504822), ('2.7.8', 0.5262273550033569), ('cpython', 0.5256874561309814), ('3.4', 0.5247158408164978), ('jython', 0.5143867135047913)]


In [17]:
# Query the document vectors with word vectors and keep the USER_* labels.
search_type = 'USER'
postive_search_vectors = [model[word] for word in ('factor', 'svd', 'decomposition')]
negative_search_vectors = [model[word] for word in ('diagonalisation',)]

found_tags = model.docvecs.most_similar(postive_search_vectors, negative_search_vectors)
# Only labels carrying the requested prefix are of interest.
found_post_tags = [t for t, _ in found_tags if t.startswith(search_type)]
for t in found_post_tags:
    # Describe each hit by the words closest to its document vector.
    post_vector = model.docvecs[t]
    post_words = [w for w, _ in model.most_similar([post_vector])]
    print(t, post_words)

USER_843348 ['multinomial', 'multiclass', 'variational', 'conjugate', 'tensor', 'dijkstras', 'dxd', 'posterior', 'lanczos', 'logit']
USER_2484687 ['svms', 'tanh', 'softmax', 'regularization', 'posterior', 'svd', 'multiclass', 'bayesian', 'rbf', 'unsupervised']
USER_1166362 ['sortwknn', 'maiores', 'calculcate', 'radiance', 'betrounds', 'qaudiooutput', 'reml', 'standarization', 'quickhull', 'piecewise']
USER_3942586 ['cepstral', 'kernlab', 'mtx', 'mirza', 'lup', '1607', 'expm', 'munkres', 'diagonalisation', 'pseudoinverse']


In [16]:
# Query the document vectors with word vectors and keep the POST_* labels.
search_type = 'POST'
postive_search_vectors = [model[word] for word in ('fortran', 'numerical')]
negative_search_vectors = [model[word] for word in ('pointer',)]

found_tags = model.docvecs.most_similar(postive_search_vectors, negative_search_vectors)
# Only labels carrying the requested prefix are of interest.
found_post_tags = [t for t, _ in found_tags if t.startswith(search_type)]
for t in found_post_tags:
    # Describe each hit by the words closest to its document vector.
    post_vector = model.docvecs[t]
    post_words = [w for w, _ in model.most_similar([post_vector])]
    print(t, post_words)

POST_21847418 ['syms', 'recreational', 'nrz', 'getlevel', 'unitless', 'alltough', 'lmertest', 'trignometric', 'storagelocation', '1728']
POST_26934178 ['fortran', 'thtat', 'compaq', 'apl', 'rutines', 'bdfl', 'gfortran', 'leakoff', '1539', 'generalising']
POST_21330945 ['natty', 'qaudiooutput', 'innerchild', 'lanczos', 'artistic', 'exploiting', 'distinctions', 'amenable', 'interoperable', 'relaxed']
POST_21886516 ['numerical', 'chemical', 'subscripts', 'reals', 'conditioned', 'superscripts', 'numerically', 'dissipation', 'dbarithmeticexpression', 'numeric']
POST_21078051 ['regionerate', 'mpir', 'daqmx', '5.80', 'convertors', '2.99', 'playerobject', 'aussi', 'ligatures', '10.2.0']
POST_25493270 ['mpich', 'hpc', 'openib', 'openmpi', '2.6.9', 'mpi', 'galsim', 'withhold', 'mlpack', 'infiniband']
POST_25383677 ['fortran', 'apl', '1539', 'befunge', 'culturally', 'dyalog', 'mkl', 'openacc', 'pgi', '1325']
POST_27199392 ['multinomial', 'kernlab', 'covariate', 'fitpack', 'borehole', 'mle', 'virg