In [1]:
import logging
import re
import string
from xml.etree.ElementTree import iterparse

import simplejson
import spacy
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from lxml import html

# Route INFO-and-above log records to a file.
logging.basicConfig(
    filename='../log/w2v_etc.log',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Logger used by the extraction code below for progress and warnings.
log = logging.getLogger('w2v_etc.log')
# spaCy pipeline, loaded once and reused for every paragraph.
nlp = spacy.load('en')

In [None]:
"""Streaming XML parser: keeps posts whose creation date passes the year filter
and writes their paragraphs to one tab-separated (.tsv) file per year.
"""

# Year window used to filter posts; `years` runs from START_YEAR up to,
# but not including, STOP_YEAR.
START_YEAR = 2009
STOP_YEAR = 2017

# Input XML and the folder that receives the per-year output files.
SOURCE_PATH = '../datasets/stackoverflow_posts/stackoverflow_posts.xml'
SECONDARY_FOLDER = '../datasets/stackoverflow_posts/'

# One output path per year, index-aligned with `years`.
years = list(range(START_YEAR, STOP_YEAR))
filepaths = ['{}{}.tsv'.format(SECONDARY_FOLDER, year) for year in years]


def _paragraph_generator(elem):
    
    def _assess_date_get_body():
        date = elem.get('CreationDate', '')
        body = elem.get('Body', '')
        if not date or not body or date < str(START_YEAR) or date > str(STOP_YEAR):
            return '', date
        else:
            return body, date
    
    body, date = _assess_date_get_body()
    if body:

        def _gather_context_tag(tag, prefix):
            x = elem.get(tag, '')
            if x: return prefix + '_' + x
            else: return ''

        def _gather_context_tags():
            user = _gather_context_tag('OwnerUserId', 'USER')
            post = _gather_context_tag('Id', 'POST')
            tags = [user, post]
            return [t for t in tags if t]

        def _format_stackoverflow_tags(tags):
            tree = html.fromstring(tags)
            return ['TAG_' + t.tag for t in tree.xpath('//*')
                    if t.tag and not t.tag in ['html', 'body']]

        def _gather_stackoverflow_tags():
            tags = elem.get('Tags', '')
            if tags: return _format_stackoverflow_tags(tags)
            else: return []

        def _gather_doc2vec_tags():
            return _gather_stackoverflow_tags() + _gather_context_tags()

        tags = _gather_doc2vec_tags()
        
        def _process_token(token):
            """spaCy Token -> str"""
            if token.lemma_: return token.lemma_.lower()
            # elif token.like_num: return 'NUMBER'
            else: return token.text.lower()
        
        def _prepare_words(paragraph):
            # keep words, numbers, punctuation; lemmatize;
            # join named-entities; lower case
            doc = nlp(paragraph)
            ents = doc.ents
            doc = [_process_token(token) for token in doc
                   if token.is_alpha or token.is_punct or token.like_num]
            doc.extend([str(entity) for entity in ents if len(entity) > 1])
            return doc
        
        def _gather_paragraphs():
            body_tree = html.fromstring(body)
            return [p.text for p in body_tree.xpath('//p') if p.text]
        
        paragraphs = _gather_paragraphs()
        
        for p in paragraphs:
            words = _prepare_words(p)
            yield date, words, tags


def _which_year_index(date):
    """Return the position in `years` of the year that `date` falls in, else None.

    The comparison is done on strings: a date matches year y when it sorts
    strictly between str(y) and str(y + 1).
    """
    stamp = str(date)
    for index, year in enumerate(years):
        lower, upper = str(year), str(year + 1)
        if lower < stamp and stamp < upper:
            return index
    return None

            
def _write_paragraphs(paragraphs, files):
    """Write each (date, words, tags) triple as one line of its year's file.

    Line layout: tab-joined words, a double tab, comma-joined tags, newline.

    Args:
        paragraphs: iterable of (date, words, tags) triples.
        files: open file objects, indexed like `years`.

    Returns:
        The number of paragraphs written.
    """
    written = 0
    for written, (date, words, tags) in enumerate(paragraphs, start=1):
        target = files[_which_year_index(date)]
        target.write('\t'.join(words))
        target.write('\t\t')
        target.write(','.join(tags))
        target.write('\n')
    return written


def write_secondaries():
    """Stream SOURCE_PATH and write every qualifying paragraph to its per-year file.

    Each completed XML element is passed through `_paragraph_generator` and
    the results are written by `_write_paragraphs`. A failure on one element
    is logged as a warning and the run continues with the next element.
    Progress is logged every 100000 elements and once more at the end.
    """
    n_elem = 0
    n_para = 0
    # Bound before the try so the finally clause can always close whatever
    # was opened, even if one of the open() calls fails part-way.
    files = []
    try:
        for filepath in filepaths:
            files.append(open(filepath, 'w+'))

        root = None
        for event, elem in iterparse(SOURCE_PATH, events=('start', 'end')):
            if event == 'start':
                # The first 'start' event belongs to the document root; keep a
                # handle on it so processed children can be detached below.
                if root is None:
                    root = elem
                continue

            n_elem += 1
            if n_elem % 100000 == 0:
                msg = "parsed {} paragraphs, read {} elements".format(n_para, n_elem)
                log.info(msg)
            try:
                paragraphs = _paragraph_generator(elem)
                n_para += _write_paragraphs(paragraphs, files)
            except Exception as e:
                # Deliberately best-effort: one bad post must not stop the run.
                log.warning((type(e), e))
            finally:
                elem.clear()
                # elem.clear() empties the element but the root still holds a
                # reference to it; drop the root's children so memory stays
                # flat over the whole dump (root attributes are left intact).
                if elem is not root:
                    del root[:]
    finally:
        for file in files:
            file.close()
        msg = "parsed {} paragraphs, read {} elements".format(n_para, n_elem)
        log.info(msg)

        
# Run the extraction: parses SOURCE_PATH and (re)writes every file in `filepaths`.
write_secondaries()

In [None]:
# Per-year TSV files that StackOverflowPostIterator reads as training input.
DATA_PATHS = ['../datasets/stackoverflow_posts/2015.tsv']
# Where the trained model is saved; the numbers in the file name match the
# size/negative/window/min_count values passed to Doc2Vec in train_and_save().
SAVE_PATH = '../saved/2015_180_20_8_5.d2v'

class StackOverflowPostIterator():
    """Re-iterable corpus over the files in DATA_PATHS.

    Every line is expected in the layout '<words>\\t\\t<tags>\\n', with the
    words tab-separated and the tags comma-separated. Each line is yielded as
    one LabeledSentence. A fresh pass over the files starts on every
    ``iter()`` call.
    """

    def __init__(self): pass

    def __iter__(self):
        for file_path in DATA_PATHS:
            with open(file_path, 'r') as file:
                for line in file:
                    # Strip only the newline; slicing off the last character
                    # would eat real data on a final line without one.
                    line = line.rstrip('\n')
                    words, tags = (line.split('\t\t') + [''])[0:2]
                    # Drop empty fields so an empty section gives [] and not [''].
                    words = [w for w in words.split('\t') if w]
                    # Tags are comma-joined in the file; splitting on tabs
                    # would fuse all of a document's tags into one string.
                    tags = [t for t in tags.split(',') if t]
                    yield LabeledSentence(words=words, tags=tags)

        
def train_and_save():
    """Train a Doc2Vec model on StackOverflowPostIterator, save it to SAVE_PATH and return it."""
    hyperparameters = dict(
        size=180, negative=20, window=8, min_count=5,
        iter=1, workers=5, alpha=0.1, sample=1e-5,
    )
    corpus = StackOverflowPostIterator()
    trained = Doc2Vec(corpus, **hyperparameters)
    trained.save(SAVE_PATH)
    return trained


# Train from scratch and save the result to SAVE_PATH.
model = train_and_save()
# To reuse a previously saved model instead, replace the line above with:
# model = Doc2Vec.load(SAVE_PATH)


In [None]:
# word similarities: nearest neighbours of a few probe words,
# with a blank line between consecutive results

for position, probe in enumerate(('agile', 'node', 'python')):
    if position > 0:
        print('')
    print(model.most_similar(probe))

In [None]:
# searching for java-free posts about linear regression

search_type = 'POST'
postive_search_vectors = [model['linear'], model['regression']]
negative_search_vectors = [model['java']]

# Rank document tags against the query vectors, then keep only the tags
# whose prefix matches search_type.
found_tags = model.docvecs.most_similar(postive_search_vectors, negative_search_vectors)
found_post_tags = [t for t, _ in found_tags if t.startswith(search_type)]
for t in found_post_tags:
    post_vector = model.docvecs[t]
    # Describe each hit by the words closest to its document vector.
    post_words = [w for w, _ in model.most_similar([post_vector])]
    # Print the tag being described (the original printed an undefined name `p`).
    print(t, post_words)

In [None]:
# searching for users who post about numerical fortran and not javascript

search_type = 'USER'
postive_search_vectors = [model['fortran'], model['numerical']]
negative_search_vectors = [model['javascript']]

# Rank document tags against the query vectors, then keep only the tags
# whose prefix matches search_type.
found_tags = model.docvecs.most_similar(postive_search_vectors, negative_search_vectors)
found_post_tags = [t for t, _ in found_tags if t.startswith(search_type)]
for t in found_post_tags:
    post_vector = model.docvecs[t]
    # Describe each hit by the words closest to its document vector.
    post_words = [w for w, _ in model.most_similar([post_vector])]
    # Print the tag being described (the original printed an undefined name `p`).
    print(t, post_words)