In [1]:
import logging
import os
import string
from datetime import datetime
from xml.etree.ElementTree import iterparse

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from lxml import html

# Configure the root logger: INFO-and-above records are written to a file
# using a "timestamp : level : message" layout.
logging.basicConfig(
    filename='../log/w2v_etc.log',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Named logger used by the code in this notebook for its own warnings.
log = logging.getLogger('w2v_etc.log')

In [2]:
SOURCE_PATH = '../datasets/stackoverflow-posts-2015.xml'


class StackOverflowCommentIterator(object):
    """Re-iterable stream of labelled paragraphs from a Stack Overflow posts XML file.

    Every call to ``__iter__`` re-parses the file with ``iterparse`` and yields
    one ``LabeledSentence`` per ``<p>`` element (that has leading text) found in
    an element's ``Body`` attribute. Each sentence carries the post's ``TAG_``
    labels plus ``OWNER_``, ``TYPE_``, ``PARENT_``, ``POST_`` and
    ``PARAGRAPH_`` labels.

    Parameters
    ----------
    source_path : str, optional
        Path of the XML file to read. When omitted, the module-level
        ``SOURCE_PATH`` is looked up each time iteration starts.

    Attributes
    ----------
    n_bad_elems : int
        Number of elements skipped so far during the most recent pass because
        processing them raised an exception.
    """

    # Deletion table for ASCII punctuation, built once instead of per paragraph.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, source_path=None):
        self.source_path = source_path
        # Defined up front so the attribute exists even when a pass is
        # abandoned early (e.g. the consumer breaks out of its loop).
        self.n_bad_elems = 0

    @staticmethod
    def _parse_tags(raw_tags):
        """Turn a ``'<c#><.net>'``-style string into ``['TAG_c#', 'TAG_.net']``.

        Plain string splitting is used instead of an HTML parser because tag
        names such as ``c#``, ``c++`` or ``.net`` are not valid HTML element
        names. ``None`` and the empty string both give an empty list.
        """
        if not raw_tags:
            return []
        # Make both delimiters the same character, then drop the empty pieces.
        pieces = raw_tags.replace('>', '<').split('<')
        return ['TAG_' + piece.strip() for piece in pieces if piece.strip()]

    @classmethod
    def _tokenize(cls, paragraph):
        """Strip ASCII punctuation, lowercase, and split on any whitespace run.

        Newlines and tabs act as word separators; deleting them would fuse
        words that are separated only by a line break.
        """
        return paragraph.translate(cls._PUNCTUATION_TABLE).lower().split()

    def __iter__(self):
        """Yield a ``LabeledSentence`` for every usable paragraph in the file.

        Elements whose processing raises are logged as warnings, counted in
        ``n_bad_elems`` and skipped. Each element is cleared after use.
        """
        path = SOURCE_PATH if self.source_path is None else self.source_path
        self.n_bad_elems = 0
        for _event, elem in iterparse(path):
            try:
                ### context
                context = [
                    'OWNER_' + elem.get('OwnerUserId', ''),
                    'TYPE_' + elem.get('PostTypeId', ''),
                    'PARENT_' + elem.get('ParentId', ''),
                    'POST_' + elem.get('Id', ''),
                ]
                ### stackoverflow tags
                tags = self._parse_tags(elem.get('Tags')) + context
                ### body text
                body = elem.get('Body')
                if body is not None:
                    body_tree = html.fromstring(body)
                    # NOTE: ``p.text`` is only the text that precedes the
                    # paragraph's first child element.
                    paragraphs = [p.text for p in body_tree.xpath('//p') if p.text]
                    for i, paragraph in enumerate(paragraphs):
                        yield LabeledSentence(words=self._tokenize(paragraph),
                                              tags=tags + ['PARAGRAPH_' + str(i)])
                    ### TODO -- body code (``//code`` elements are not used yet)
            except Exception as e:
                # Best effort: one unparsable element must not abort the pass.
                log.warning(e)
                self.n_bad_elems += 1
            elem.clear()

SAVE_PATH = '../saved/240_5_a.d2v'


def train_and_save():
    """Train a Doc2Vec model on the Stack Overflow corpus and write it to SAVE_PATH.

    Returns
    -------
    Doc2Vec
        The model built from ``StackOverflowCommentIterator()``, after it has
        been saved.
    """
    # All tunable settings gathered in one place.
    hyperparams = dict(
        size=240,
        window=5,
        negative=8,
        sample=1e-5,
        min_count=1,
        alpha=0.1,
        iter=2,
        workers=5,
    )
    trained = Doc2Vec(StackOverflowCommentIterator(), **hyperparams)
    trained.save(SAVE_PATH)
    return trained


# Reuse the saved model when it exists; train (and save) only when it is
# missing, so the notebook runs top to bottom on a fresh checkout without
# hand-toggling a commented-out line.
if os.path.exists(SAVE_PATH):
    model = Doc2Vec.load(SAVE_PATH)
else:
    model = train_and_save()


In [3]:
# Sanity check on the loaded model: list the entries it ranks as most similar to 'agile'.
model.most_similar('agile')

[('scrum', 0.7025855779647827),
 ('testdriven', 0.5772689580917358),
 ('nameusage', 0.5521855354309082),
 ('prototyping', 0.5441562533378601),
 ('career', 0.5345290303230286),
 ('manufacturing', 0.5328176021575928),
 ('tdd', 0.5307821035385132),
 ('disciplines', 0.5286736488342285),
 ('vetting', 0.5281956195831299),
 ('collaborative', 0.5237389206886292)]

In [3]:
# Walk the corpus up to its 301st sentence (index 300) and keep it in `x`
# for inspection; if the corpus is shorter, `x` ends up as the last sentence.
for position, sentence in enumerate(StackOverflowCommentIterator()):
    x = sentence
    if position >= 300:
        break

x

LabeledSentence(words=['you', 'probably', 'want', 'to', 'create', 'a', 'custom', 'externalidentityprovider', 'rather', 'than', 'a', 'custom', 'authenticationhandler', 'once', 'you', 'create', 'and', 'install', 'your', 'own', 'identity', 'provider', 'bundle', 'aem', 'will', 'consider', 'it', 'for', 'all', 'login', 'requests', 'essentially', 'aem', 'will', 'call', 'the', 'authenticate', 'method', 'on', 'all', 'registered', 'identity', 'providers', 'in', 'order', 'based', 'on', 'their', 'jaas', 'ranking', 'and', 'if', 'any', 'of', 'the', 'modules', 'login', 'successfully', 'it', 'considers', 'that', 'user', 'authenticated', 'you', 'also', 'dont', 'have', 'to', 'worry', 'about', 'creating', 'the', 'jcr', 'user', 'record', 'because', 'the', 'default', 'sync', 'handler', 'will', 'take', 'care', 'of', 'that', 'for', 'you', 'automatically'], tags=['OWNER_799', 'TYPE_2', 'PARENT_26485551', 'POST_27727471', 'PARAGRAPH_0'])

In [101]:
# Survey pass over the raw XML: count the parsed elements, collect every
# attribute name that occurs on them, and time the whole scan.
n = 0
keys = set()
start_time = datetime.now()

# SOURCE_PATH is the constant defined next to the corpus iterator; the
# lowercase `source_path` used previously does not exist on a fresh kernel.
for _event, elem in iterparse(SOURCE_PATH):
    n += 1
    keys.update(elem.keys())
    elem.clear()  # drop the element's contents once its keys are recorded
    if n % 250000 == 0:
        print(n)  # coarse progress indicator

end_time = datetime.now()

print(n, (end_time - start_time).total_seconds())
keys

['ParentId', 'PostTypeId', 'LastActivityDate', 'CommentCount', 'Body', 'CreationDate', 'OwnerUserId', 'Score', 'Id']
2015-01-01T00:00:27.040
<p>create an header file "macros.h"</p>

<p>import this header into Prefix.pch</p>

<p>In this macros.h put all the frameworks and other important things.</p>

<p>If you are worried about performance, don't worry, look what apple says:</p>

<p>Headers and Performance</p>

<blockquote>
  <p>If you are worried that including a master header file may cause your
  program to bloat, don’t worry. Because OS X interfaces are implemented
  using frameworks, the code for those interfaces resides in a dynamic
  shared library and not in your executable. In addition, only the code
  used by your program is ever loaded into memory at runtime, so your
  in-memory footprint similarly stays small.
        As for including a large number of header files during compilation, once again, don’t worry. Xcode provides a precompiled
  header facility to speed up compile

{'Body',
 'CommentCount',
 'CreationDate',
 'Id',
 'LastActivityDate',
 'OwnerUserId',
 'ParentId',
 'PostTypeId',
 'Score'}