In [2]:
from xml.etree.ElementTree import iterparse
from lxml import html

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

import string
import logging
import simplejson
import spacy

# Configure the root logger: INFO and above go to a log file with a
# "timestamp : level : message" layout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='../log/w2v_etc.log')

# Logger used by this notebook's own code; it is named after the log file.
log = logging.getLogger('w2v_etc.log')

# spaCy pipeline, loaded once here and called as nlp(text) when tokenising paragraphs.
# NOTE(review): 'en' presumably resolves to an installed English model — confirm
# for the spaCy version in use.
nlp = spacy.load('en')

In [None]:
"""Streaming xml parser filters on date and writes to csv 
"""

# XML posts dump that write_secondary() reads as a stream.
SOURCE_PATH = '../datasets/stackoverflow_posts.xml'
# Intermediate file produced by write_secondary(): one paragraph per line.
# NOTE(review): despite the .csv extension, each line is `words;tags`, where
# both fields are themselves ','-separated lists.
SECONDARY_PATH = '../datasets/stackoverflow_posts_1501_1601.csv'

# Date window applied to each post's CreationDate attribute by plain string
# comparison: START_DATE is inclusive, STOP_DATE is exclusive.
START_DATE = '2015-01-01'
STOP_DATE = '2016-01-01'


def _paragraph_generator(elem):
    
    def _assess_date_get_body():
        date = elem.get('CreationDate', '')
        body = elem.get('Body', '')
        if not date or not body or date < START_DATE or date >= STOP_DATE:
            return ''
        else:
            return body
    
    body = _assess_date_get_body()
    if body:

        def _gather_context_tag(tag, prefix):
            x = elem.get(tag, '')
            if x: return prefix + '_' + x
            else: return ''

        def _gather_context_tags():
            user = _gather_context_tag('OwnerUserId', 'USER')
            post = _gather_context_tag('Id', 'POST')
            tags = [user, post]
            return [t for t in tags if t]

        def _format_stackoverflow_tags(tags):
            tree = html.fromstring(tags)
            return ['TAG_' + t.tag for t in tree.xpath('//*')
                    if t.tag and not t.tag in ['html', 'body']]

        def _gather_stackoverflow_tags():
            tags = elem.get('Tags', '')
            if tags: return _format_stackoverflow_tags(tags)
            else: return []

        def _gather_doc2vec_tags():
            return _gather_stackoverflow_tags() + _gather_context_tags()

        tags = _gather_doc2vec_tags()

        def _prepare_words(paragraph):
            # keep words, numbers, punctuation; lemmatize;
            # join named-entities; lower case
            doc = nlp(paragraph)
            ents = doc.ents
            doc = [token.lemma_ for token in doc
                   if token.is_alpha or token.is_punct or token.like_num]
            doc.extend([str(entity) for entity in ents if len(entity) > 1])
            return [w.lower() for w in doc]

            # p = paragraph
            # p = p.translate(p.maketrans("","", string.punctuation))
            # p = p.translate(p.maketrans("   ","   ", '\n\t'))
            # p = p.lower()
            # return [w for w in p.split(' ') if w]
        
        def _gather_paragraphs():
            body_tree = html.fromstring(body)
            return [p.text for p in body_tree.xpath('//p') if p.text]
        
        paragraphs = _gather_paragraphs()
        
        for p in paragraphs:
            words = _prepare_words(p)
            yield words, tags

            
def _write_paragraphs(paragraphs, file):
    n = 0
    for words, tags in paragraphs:
        n += 1
        file.write(','.join(words))
        file.write(';')
        file.write(','.join(tags))
        file.write('\n')
    return n


def write_secondary():
    """Stream SOURCE_PATH and write the selected paragraphs to SECONDARY_PATH.

    Every completed XML element is passed to ``_paragraph_generator`` and the
    resulting paragraphs are appended to the output via ``_write_paragraphs``.
    A failure on one element is logged as a warning and does not stop the run.
    Progress (paragraphs written, elements seen) is logged every 100000
    elements.
    """
    n_elem = 0
    n_para = 0
    # `with` closes the file on every exit path; if open() itself fails the
    # original error surfaces instead of a NameError from a `finally` block.
    with open(SECONDARY_PATH, 'w+') as file:
        root = None
        for event, elem in iterparse(SOURCE_PATH, events=('start', 'end')):
            if event == 'start':
                # The first 'start' event belongs to the document root; keep
                # it so finished children can be detached below.
                if root is None:
                    root = elem
                continue
            n_elem += 1
            if (n_elem % 100000 == 0): log.info(str(n_para) + ' ' + str(n_elem))
            try:
                paragraphs = _paragraph_generator(elem)
                n_para += _write_paragraphs(paragraphs, file)
            except Exception as e:
                log.warning((type(e), e))
            finally:
                elem.clear()
                # elem.clear() empties the element but leaves it attached to
                # the root; drop the root's children so memory stays flat
                # while streaming a large file.
                del root[:]


# Build the intermediate file at SECONDARY_PATH from the XML at SOURCE_PATH.
write_secondary()


In [3]:
class StackOverflowPostIterator():
    """Re-iterable stream of ``LabeledSentence`` objects read from a text file.

    Each line of the file is expected to be ``words;tags`` with both fields
    being ','-separated lists. The file is reopened on every ``iter()`` call,
    so the same instance can be iterated more than once.

    Args:
        path: file to read; when omitted, ``SECONDARY_PATH`` is used.
    """
    def __init__(self, path=None):
        # Resolved in __iter__ so the default follows the current SECONDARY_PATH.
        self.path = path

    def __iter__(self):
        path = SECONDARY_PATH if self.path is None else self.path
        with open(path, 'r') as file:
            for line in file:
                # Remove only the line terminator; slicing off the last
                # character would eat real data on a final line that has no
                # trailing newline.
                line = line.rstrip('\n')
                if not line:
                    continue
                fields = line.split(';')
                words = fields[0]
                tags = fields[1] if len(fields) > 1 else ''
                # ''.split(',') is [''], so filter out empty items rather
                # than emitting an empty-string word or tag.
                yield LabeledSentence(
                    words=[w for w in words.split(',') if w],
                    tags=[t for t in tags.split(',') if t])


# Where train_and_save() stores the trained model.
# NOTE(review): the numbers in the file name look like the size/negative/window/
# min_count values used in train_and_save(); '18mo' does not obviously match the
# START_DATE..STOP_DATE window — confirm the naming.
SAVE_PATH = '../saved/18mo_180_20_8_5_a.d2v'
        
def train_and_save():
    """Train a Doc2Vec model on StackOverflowPostIterator, save it, return it.

    The model is written to SAVE_PATH before being returned.
    """
    hyperparameters = dict(size=180, negative=20, window=8, min_count=5,
                           iter=1, workers=5, alpha=0.1, sample=1e-5)
    trained = Doc2Vec(StackOverflowPostIterator(), **hyperparameters)
    trained.save(SAVE_PATH)
    return trained


# Train a fresh model; the commented alternative loads the one already saved
# at SAVE_PATH instead.
model = train_and_save()
# model = Doc2Vec.load(SAVE_PATH)


In [4]:
# Print the nearest neighbours of a few probe words.
for probe in ('agile', 'node', 'python'):
    print(model.most_similar(probe))

[('scrum', 0.714698076248169), ('kanban', 0.5858825445175171), ('methodologies', 0.5715190768241882), ('collaboration', 0.5650316476821899), ('stakeholders', 0.5586098432540894), ('tdd', 0.5405069589614868), ('velasco', 0.5336797833442688), ('dissertation', 0.5329622030258179), ('coached', 0.5323850512504578), ('fanfiction', 0.5321285724639893)]
[('crawlerjs', 0.63333660364151), ('nodes', 0.6314841508865356), ('v01031', 0.6245298385620117), ('xdmpnodeuri', 0.5910543203353882), ('assd', 0.5787916779518127), ('companyupdate', 0.5708209872245789), ('chefstacktraceout', 0.5616493225097656), ('hexfield', 0.5563744306564331), ('cusersxxxxxxxnpmnodemodulessailsbinsailsjs', 0.5531876087188721), ('newnode', 0.5505025386810303)]
[('pythons', 0.5597633123397827), ('amazingjust', 0.5398553013801575), ('ironpython', 0.5287105441093445), ('unicurses', 0.5230951905250549), ('qpython', 0.5211294889450073), ('python26', 0.5179910659790039), ('windowsproblem', 0.5071616172790527), ('apl', 0.504757642745

In [None]:
# Peek at one record of the secondary file.
n = 10

for sentence in StackOverflowPostIterator():
    x = sentence
    n -= 1
    # The break fires only once n has dropped below zero, so x ends up holding
    # the 11th sentence (or the last one if the file has fewer).
    if (n < 0):
        break

# Display the captured sentence as the cell output.
x

In [None]:
from datetime import datetime
SOURCE_PATH = '../datasets/stackoverflow_posts.xml'

# Survey pass over the XML dump: counts elements, collects the attribute names
# seen, tracks the smallest/largest CreationDate string, and counts elements
# that have a body and a date in ['2015-01-01', '2016-01-01').
n = 0  # elements seen
m = 0  # elements with a body and a date inside the window
keys = set()
min_date = None
max_date = None
start_time = datetime.now()

root = None
for event, elem in iterparse(SOURCE_PATH, events=('start', 'end')):
    if event == 'start':
        # The first 'start' event is the document root; keep it so finished
        # children can be detached below.
        if root is None:
            root = elem
        continue
    n += 1
    keys.update(elem.keys())
    date = elem.get('CreationDate', '')
    body = elem.get('Body', '')
    elem.clear()
    # elem.clear() leaves the emptied element attached to the root; dropping
    # the root's children keeps memory flat on a large file.
    del root[:]
    # Only dated elements may set the bounds. Otherwise a first element with
    # no date would pin min_date to '' and no later date could replace it.
    if date:
        if min_date is None or date < min_date: min_date = date
        if max_date is None or date > max_date: max_date = date
    if date and body and date >= '2015-01-01' and date < '2016-01-01': m += 1
    if (n % 1000000 == 0): print(m, n, min_date, max_date, keys)

end_time = datetime.now()

# Total elements and elapsed wall-clock seconds.
print(n, (end_time - start_time).total_seconds())
keys