In [11]:
#df = pd.read_csv('data_nfr/non_lab_doc2vec_format.txt', sep='\t', escapechar='\\', encoding = "ISO-8859-1")
#dt = pd.read_csv('data_nfr/nfr_doc2vec_format.txt', sep='\t', escapechar='\\', encoding = "ISO-8859-1")
#df.to_csv("non_lab_doc2vec_format.txt", sep='\t', encoding='utf-8', index=False)
#dt.to_csv("nfr_doc2vec_format.txt", sep='\t', encoding='utf-8', index=False)

In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# random
from random import shuffle

# classifier
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


In [2]:
class LabeledLineSentence(object):
    """Corpus of tagged documents read line-by-line from several text files.

    Each line of each source file becomes one ``LabeledSentence`` whose words
    are the whitespace-separated tokens of the line and whose single tag is
    ``'<prefix>_<line index>'`` (line index is 0-based within its file).

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for that file's lines.
        Every prefix must be unique across the mapping.

    Raises
    ------
    Exception
        If two source files share the same prefix.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); kept as None until then so that
        # sentences_perm() can tell that nothing has been loaded yet.
        self.sentences = None

        # make sure that prefixes are unique, otherwise tags from different
        # files would collide
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Stream one LabeledSentence per line, re-reading the files each time."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source into a list, cache it on ``self.sentences`` and return it."""
        # Reuse __iter__ so the streaming and in-memory views can never diverge.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentence list in place and return it.

        The list is loaded on first use if ``to_array()`` has not been called
        yet, instead of failing on a missing attribute.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [7]:
# Corpus files mapped to the tag prefix given to each of their lines.
# Prefixes must stay unique: LabeledLineSentence rejects duplicates.
sources = {
    'data_lib_pool/PO_domain_libs_CSV_2.txt': 'DATA_DOMAIN',
    'data_lib_pool/PO_tactic_libs_CSV_3.txt': 'DATA_TACTIC',
    # one file per library under the Unlabelled/ directory
    'data_lib_pool/Unlabelled/boneCP.txt': 'BONE_CP',
    'data_lib_pool/Unlabelled/c3p0.txt': 'C3_P0',
    'data_lib_pool/Unlabelled/Commons DBCP.txt': 'Commons_DBCP',
    'data_lib_pool/Unlabelled/DbConnectionBroker.txt': 'DbConnection_Broker',
    'data_lib_pool/Unlabelled/DBPool.txt': 'DBPool',
    'data_lib_pool/Unlabelled/miniConnectionPoolManager.txt': 'MiniConnection_PoolManager',
    'data_lib_pool/Unlabelled/primrose.txt': 'Primrose',
    'data_lib_pool/Unlabelled/Proxool.txt': 'Proxool',
    'data_lib_pool/Unlabelled/smartpool.txt': 'Smartpool',
}

# Line-oriented tagged corpus over all of the files above.
sentences = LabeledLineSentence(sources)

In [10]:
# Create an untrained Doc2Vec model with 100-dimensional vectors.
# alpha and min_alpha are deliberately given the same value (0.025); the
# training cell adjusts both by hand between rounds.
# NOTE(review): `size=` is the keyword used by the gensim version this notebook
# was written against — confirm against the installed version before re-running.
model = Doc2Vec(alpha=0.025, min_alpha=0.025,min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)

# Build the vocabulary from the whole corpus; to_array() reads every source
# file into an in-memory list (also cached on sentences.sentences).
model.build_vocab(sentences.to_array())

In [13]:
# Train in 10 rounds, each with a constant learning rate that is lowered by a
# fixed step from one round to the next (0.025, 0.023, ...).
#
# The rate for a round is computed from the round index instead of being
# subtracted from model.alpha in place, so re-running this cell restarts the
# schedule at the initial rate rather than continuing to decrement an
# already-reduced (and eventually negative) value.
initial_alpha = 0.025
alpha_step = 0.002
for epoch in range(10):
    model.alpha = initial_alpha - alpha_step * epoch
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    # Each call makes model.iter passes over the corpus at this round's rate.
    model.train(sentences,total_examples=model.corpus_count,epochs=model.iter)

In [14]:
# Persist the trained model to disk so later cells/sessions can reload it
# without retraining.
model.save('models_lib_pool/lib_pool.d2v')

In [15]:
# Reload the model from the same path it was saved to; this rebinds `model`
# to the deserialised copy.
model = Doc2Vec.load('models_lib_pool/lib_pool.d2v')

In [52]:
# Query the trained document vectors for the nearest neighbours of one document.
# Tags have the form '<prefix>_<0-based line index>', so 'DATA_DOMAIN_3132' is
# line index 3132 of the file registered under the DATA_DOMAIN prefix.
ms = model.docvecs.most_similar('DATA_DOMAIN_3132') 
# The result is printed as a list of (tag, similarity) pairs.
print(ms)

[('DATA_TACTIC_6694', 0.9935643076896667), ('DATA_TACTIC_43240', 0.9935141801834106), ('MiniConnection_PoolManager_0', 0.9930168390274048), ('DATA_TACTIC_24067', 0.9926599860191345), ('DATA_TACTIC_70779', 0.9925956726074219), ('DATA_TACTIC_95375', 0.992023766040802), ('DATA_TACTIC_22841', 0.9920196533203125), ('DATA_DOMAIN_46659', 0.9918712377548218), ('DATA_TACTIC_81200', 0.9916352033615112), ('DATA_TACTIC_116415', 0.9914083480834961)]


In [25]:
import codecs

# Input documents (one per line) and the file that receives their vectors.
test_docs_path = "data_lib_pool/PO_domain_libs_CSV_2.txt"
output_file = "data_lib_pool/data_domain_vectors.txt"

# Tokenise every line on whitespace. The `with` block closes the input file,
# which the previous one-liner left open.
with codecs.open(test_docs_path, "r", "utf-8") as docs_in:
    test_docs = [x.strip().split() for x in docs_in.readlines()]

#inference hyper-parameters
start_alpha=0.01
infer_epoch=1000

#infer test vectors
# One output line per input document: the inferred vector's components
# separated by single spaces. The `with` block flushes and closes the file
# even if inference fails part-way through.
with open(output_file, "w") as output:
    for d in test_docs:
        vector = model.infer_vector(d, alpha=start_alpha, steps=infer_epoch)
        output.write(" ".join([str(x) for x in vector]) + "\n")