In [None]:
from xml.etree.ElementTree import iterparse
from lxml import html

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

import string
import logging
import simplejson

# Route INFO-and-above records to a log file, formatted as
# "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='../log/w2v_etc.log')

# Logger used for this notebook's own progress and warning messages.
log = logging.getLogger('w2v_etc.log')

In [None]:
"""Streaming xml parser filters on date and writes to csv 
"""


# Alternative dataset paths (currently disabled).
#SOURCE_PATH = '../datasets/buddhism/Posts.xml'
#SECONDARY_PATH = '../output/stackoverflow_posts.csv'

# SOURCE_PATH is the XML file streamed by write_secondary();
# SECONDARY_PATH is the intermediate text file it writes and SecondaryReader reads.
SOURCE_PATH = '../datasets/stackoverflow_posts.xml'
SECONDARY_PATH = '../datasets/stackoverflow_posts.csv'

# A post is kept when START_DATE <= CreationDate < STOP_DATE; the comparison
# is done on the raw strings.
START_DATE = '2015-03-01'
STOP_DATE = '2015-09-01'


def _paragraph_generator(elem):
    """Yield one ``(words, tags)`` pair per usable paragraph of a post element.

    Elements without a ``CreationDate`` or ``Body`` attribute, or whose
    ``CreationDate`` falls outside ``[START_DATE, STOP_DATE)``, yield nothing.

    Args:
        elem: an object exposing ``get(name, default)`` for the attributes
            ``CreationDate``, ``Body``, ``Tags``, ``OwnerUserId``,
            ``PostTypeId``, ``ParentId`` and ``Id``.

    Yields:
        ``(words, tags)`` where ``words`` is the list of lower-cased,
        punctuation-free tokens taken from the leading text of one ``<p>``
        element of the body, and ``tags`` is the list of document tags
        (the same list object for every paragraph of the post).

    Raises:
        Whatever ``lxml.html.fromstring`` raises for a body it cannot parse.
    """
    date = elem.get('CreationDate', '')
    body = elem.get('Body', '')
    # Dates are compared as plain strings -- assumes CreationDate is an
    # ISO-8601 style timestamp so that string order is date order; TODO confirm.
    if not date or not body or date < START_DATE or date >= STOP_DATE:
        return

    def _stackoverflow_tags():
        # Tags are split textually instead of being fed to the HTML parser:
        # the parser adds wrapper elements (html/body) that would surface as
        # bogus tags, and it cannot represent names such as "c++" or "c#".
        # Assumes the attribute looks like "<tag1><tag2>" -- TODO confirm.
        raw = elem.get('Tags', '')
        return ['TAG_' + name for name in raw.strip('<>').split('><') if name]

    def _context_tag(attribute, prefix):
        value = elem.get(attribute, '')
        return prefix + '_' + value if value else ''

    def _context_tags():
        candidates = [
            _context_tag('OwnerUserId', 'OWNER'),
            _context_tag('PostTypeId', 'TYPE'),
            _context_tag('ParentId', 'PARENT'),
            _context_tag('Id', 'POST'),
        ]
        return [tag for tag in candidates if tag]

    tags = _stackoverflow_tags() + _context_tags()

    # Built once per post rather than once per paragraph.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _prepare_words(paragraph):
        # split() with no argument treats newlines and tabs as separators, so
        # words on adjacent lines are no longer glued together.
        return paragraph.translate(strip_punctuation).lower().split()

    body_tree = html.fromstring(body)
    for paragraph in body_tree.xpath('//p'):
        # NOTE(review): .text is only the text before the first child element
        # (e.g. <code>, <a>); the rest of the paragraph is not used.
        if not paragraph.text:
            continue
        words = _prepare_words(paragraph.text)
        if words:  # a paragraph with no tokens carries no training signal
            yield words, tags

            
def _write_paragraphs(paragraphs, file):
    n = 0
    for words, tags in paragraphs:
        n += 1
        file.write(','.join(words))
        file.write(';')
        file.write(','.join(tags))
        file.write('\n')
    return n


def write_secondary():
    """Stream SOURCE_PATH and write every extracted paragraph to SECONDARY_PATH.

    Each parsed element is passed to ``_paragraph_generator`` and the resulting
    paragraphs are appended to the output file by ``_write_paragraphs``.
    Progress (paragraphs written, elements seen) is logged every 100000
    elements. A failure while processing one element is logged as a warning
    and does not stop the run.

    Raises:
        OSError: if SECONDARY_PATH cannot be opened for writing.
        Whatever ``iterparse`` raises if SOURCE_PATH is missing or malformed.
    """
    n_elem = 0
    n_para = 0
    # ``with`` replaces the former try/finally around open(): when open()
    # failed, ``file`` was never bound and the finally clause raised
    # UnboundLocalError, hiding the real error.
    with open(SECONDARY_PATH, 'w+') as file:
        for event, elem in iterparse(SOURCE_PATH):
            n_elem += 1
            if n_elem % 100000 == 0:
                log.info('%s %s', n_para, n_elem)
            try:
                paragraphs = _paragraph_generator(elem)
                n_para += _write_paragraphs(paragraphs, file)
            except Exception as e:
                # Best effort: one bad element must not abort the whole export.
                log.warning((type(e), e))
            finally:
                # Release the element's content once it has been processed.
                elem.clear()


# Regenerates SECONDARY_PATH from SOURCE_PATH every time this cell is run.
write_secondary()


class SecondaryReader():
    """Re-iterable reader over the paragraphs stored in SECONDARY_PATH.

    Each line of the file has the form ``word,word,...;tag,tag,...``. The file
    is re-opened on every ``__iter__`` call, so one instance can be iterated
    any number of times.
    """

    def __init__(self): pass

    def __iter__(self):
        """Yield one ``LabeledSentence(words, tags)`` per well-formed line.

        Lines that do not contain a ``;`` separator are logged and skipped.
        """
        with open(SECONDARY_PATH, 'r') as file:
            for line in file:
                # rstrip rather than line[:-1]: a final line without a newline
                # would otherwise lose its last character.
                fields = line.rstrip('\n').split(';')
                if len(fields) < 2:
                    log.warning('skipping malformed line: %r', line)
                    continue
                # ''.split(',') is [''], so empty entries are filtered out.
                words = [w for w in fields[0].split(',') if w]
                tags = [t for t in fields[1].split(',') if t]
                # ``yield`` (not ``return``) so that __iter__ produces an
                # iterator covering every line; returning a tuple made iter()
                # fail. LabeledSentence gives each item the .words/.tags
                # attributes Doc2Vec reads and still unpacks as (words, tags).
                yield LabeledSentence(words, tags)

In [4]:
class StackOverflowPostIterator:
    """Iterable that hands out the contents of a fresh ``SecondaryReader``."""

    def __init__(self):
        """Nothing to set up."""

    def __iter__(self):
        """Create a new ``SecondaryReader`` and return what its ``__iter__`` gives."""
        reader = SecondaryReader()
        return reader.__iter__()

# File the Doc2Vec model is saved to by train_and_save() and loaded from below.
# The "180_5_5" part presumably mirrors size/negative/window -- verify if renamed.
SAVE_PATH = '../saved/180_5_5_a.d2v'
        
def train_and_save():
    """Train a Doc2Vec model on ``StackOverflowPostIterator`` and save it.

    The model is written to SAVE_PATH.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    hyperparameters = dict(
        workers=5,
        size=180, negative=5, window=5,
        iter=1, alpha=0.1, sample=1e-5,
        min_count=1,
    )
    trained = Doc2Vec(StackOverflowPostIterator(), **hyperparameters)
    trained.save(SAVE_PATH)
    return trained


# Training is disabled here: the model is loaded from SAVE_PATH, so that file
# must already exist. Swap the two lines below to retrain from scratch.
# model = train_and_save()
model = Doc2Vec.load(SAVE_PATH)


In [3]:
# Sanity check: entries the model ranks as most similar to the word 'agile'.
model.most_similar('agile')

[('scrum', 0.7025855779647827),
 ('testdriven', 0.5772689580917358),
 ('nameusage', 0.5521855354309082),
 ('prototyping', 0.5441562533378601),
 ('career', 0.5345290303230286),
 ('manufacturing', 0.5328176021575928),
 ('tdd', 0.5307821035385132),
 ('disciplines', 0.5286736488342285),
 ('vetting', 0.5281956195831299),
 ('collaborative', 0.5237389206886292)]

In [None]:
# Peek at the data produced by StackOverflowPostIterator.
n = 10

for sentence in StackOverflowPostIterator():
    x = sentence
    n -= 1
    # n first goes negative on the 11th item, so x ends up holding the 11th
    # item (or the last one if there are fewer).
    if (n < 0):
        break

# Display the item the loop stopped on.
x

In [None]:
from datetime import datetime
SOURCE_PATH = '../datasets/stackoverflow_posts.xml'

# Survey of the XML source: counts elements, collects every attribute name,
# tracks the earliest/latest CreationDate and counts the elements that have
# both a date and a body within 2015. Also times the full pass.
n = 0            # elements seen
m = 0            # elements with a body and a 2015 CreationDate
keys = set()     # union of attribute names over all elements
min_date = None
max_date = None
start_time = datetime.now()

for event, elem in iterparse(SOURCE_PATH):
    n += 1
    keys.update(elem.keys())
    date = elem.get('CreationDate', '')
    body = elem.get('Body', '')
    elem.clear()
    if date:
        # Only non-empty dates take part in min/max. Seeding min_date with ''
        # (an element without CreationDate) would pin it there, because no
        # real date compares lower than the empty string.
        if min_date is None or date < min_date: min_date = date
        if max_date is None or date > max_date: max_date = date
        if body and date >= '2015-01-01' and date < '2016-01-01': m += 1
    if (n % 1000000 == 0): print(m, n, min_date, max_date, keys)

end_time = datetime.now()

print(n, (end_time - start_time).total_seconds())
keys