In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import precision_score

def pkgsearch2(pkg_index, word='-dev'):
    """Select the entries of ``pkg_index`` whose name ends with ``word``.

    ``pkg_index`` is iterated directly, so passing a dict scans its keys.
    The matching names are returned as a list in iteration order.
    """
    return [name for name in pkg_index if name.endswith(word)]

def pkgsearch3(pkg_index, word = r'^python[\d.]*-'):
    """Select the entries of ``pkg_index`` whose name matches the regex ``word``.

    Matching uses ``re.match`` semantics, i.e. the pattern is anchored at the
    start of each name. ``pkg_index`` is iterated directly (dict -> its keys)
    and the matching names are returned as a list in iteration order.
    """
    matches_at_start = re.compile(word).match
    selected = []
    for name in pkg_index:
        if matches_at_start(name):
            selected.append(name)
    return selected


In [3]:
class InverseFold():
    """Cross-validation splitter wrapper that swaps the train and test indices.

    For every ``(train, test)`` pair produced by the wrapped splitter this
    object yields ``(test, train)`` instead, so a k-fold splitter ends up
    fitting on one fold and evaluating on the remaining k-1 folds.

    Parameters
    ----------
    fold : object
        Any splitter exposing ``split(...)`` that yields ``(train, test)``
        pairs. ``get_n_splits(...)`` is forwarded to it when requested.
    """

    def __init__(self, fold):
        self.fold = fold

    def split(self, *args, **kwargs):
        """Yield ``(test, train)`` for each ``(train, test)`` of the wrapped splitter."""
        for training, testing in self.fold.split(*args, **kwargs):
            yield testing, training

    def get_n_splits(self, *args, **kwargs):
        """Return the number of splits reported by the wrapped splitter.

        Swapping train/test does not change how many splits there are, so the
        call is delegated unchanged. This makes the wrapper usable by callers
        that ask the ``cv`` object for its split count.
        """
        return self.fold.get_n_splits(*args, **kwargs)

def multi_regression_score(pidx_file_path, w_file_path, output_file_path, n=2, random_state=None):
    """Score an embedding on the Debian package-category classification task.

    Packages are labelled by naming convention (``-doc``, ``-dev``, ``-dbg``,
    ``-perl`` suffixes; ``php-``, ``python-``, ``python3-``, ``ruby-``
    prefixes) and a one-vs-rest logistic regression is cross-validated on
    their embedding vectors. The folds are inverted with ``InverseFold``, so
    each run fits on one fold and evaluates on the other ``n - 1``.

    Parameters
    ----------
    pidx_file_path : str
        JSON file loaded into ``pkg_index``; its keys are package names and
        its values are used to index rows of the weight matrix.
    w_file_path : str
        Text file read with ``np.loadtxt`` holding the embedding matrix.
    output_file_path :
        Not used inside this function. It exists so that the 3-element
        entries of the file lists can be unpacked with ``*f`` (callers put
        the method label in this slot).
    n : int, default 2
        Number of stratified folds.
    random_state : int or None, default None
        Seed forwarded to ``StratifiedKFold``. ``None`` keeps the previous
        unseeded shuffling; pass an int to get reproducible scores.

    Returns
    -------
    numpy.ndarray
        ``[f1_micro, f1_macro, accuracy]`` averaged over the folds and
        rounded to 3 decimals.
    """
    with open(pidx_file_path, 'r') as file:
        pkg_index = json.load(file)
    w = np.loadtxt(w_file_path)

    docpkg = pkgsearch2(pkg_index, word='-doc')
    devpkg = pkgsearch2(pkg_index, word='-dev')
    dbgpkg = pkgsearch2(pkg_index, word='-dbg')
    perlpkg = pkgsearch2(pkg_index, word='-perl')
    phppkg = pkgsearch3(pkg_index, word=r'^php-')
    pythonpkg = pkgsearch3(pkg_index, word=r'^python-')
    python3pkg = pkgsearch3(pkg_index, word=r'^python3-')
    rubypkg = pkgsearch3(pkg_index, word=r'^ruby-')

    pkgs = [docpkg, devpkg, dbgpkg, perlpkg, phppkg, pythonpkg, python3pkg, rubypkg]

    print([len(p) for p in pkgs])

    # Membership is tested once per (package, category) pair; sets keep each
    # test O(1) instead of scanning a list every time.
    category_members = [set(p) for p in pkgs]

    X = []
    Y = []

    # A package that matches several categories (e.g. both a prefix and a
    # suffix rule) contributes one sample per matching category: the same
    # vector appears with different labels.
    for p in pkg_index:
        for i, members in enumerate(category_members):
            if p in members:
                X.append(w[pkg_index[p]])
                Y.append(i)

    # NOTE(review): the multi_class argument is reported as deprecated in
    # recent scikit-learn releases (1.5+) — confirm against the installed version.
    logre = LogisticRegression(multi_class='ovr', max_iter=100)
    scoring = {'f1_micro': 'f1_micro', 'f1_macro': 'f1_macro', 'accuracy': 'accuracy'}
    cv = InverseFold(StratifiedKFold(n, shuffle=True, random_state=random_state))
    scores = cross_validate(logre, X, Y, scoring=scoring, cv=cv)

    # One row per metric, one column per fold; average across folds.
    per_metric = [scores['test_f1_micro'], scores['test_f1_macro'], scores['test_accuracy']]
    return np.around(np.average(per_metric, axis=1), decimals=3)


def test_multi(filelist):
    """Build a table of Debian classification scores, one row per embedding.

    Each element of ``filelist`` is unpacked into ``multi_regression_score``;
    its third item is used as the row label. Columns are ``f1_micro``,
    ``f1_macro`` and ``accuracy``.
    """
    metric_columns = ('f1_micro', 'f1_macro', 'accuracy')
    df = pd.DataFrame(columns=metric_columns)

    for entry in filelist:
        metrics = multi_regression_score(*entry)
        df.loc[entry[2]] = metrics

    return df

def test_ratio(f):
    """Return the f1-micro score of one embedding for a series of fold counts.

    ``f`` is unpacked into ``multi_regression_score`` once for each fold
    count, from 20 down to 2, and the first element of each result
    (f1_micro) is collected in that order.
    """
    fold_counts = (20, 10, 5, 4, 3, 2)

    f1_micro_by_ratio = []
    for folds in fold_counts:
        f1_micro_by_ratio.append(multi_regression_score(*f, n=folds)[0])
    return f1_micro_by_ratio
        
def test_multi_ratio(filelist, output_file_path):
    """Tabulate f1-micro per embedding and training ratio, and save it as CSV.

    Each element of ``filelist`` is passed to ``test_ratio``; its third item
    becomes the row label. The resulting frame is written to
    ``output_file_path`` with ``DataFrame.to_csv`` and also returned.
    """
    ratio_columns = ['5%','10%','20%','25%','33%','50%']
    df = pd.DataFrame(columns=ratio_columns)

    for entry in filelist:
        row = test_ratio(entry)
        df.loc[entry[2]] = row

    df.to_csv(output_file_path)
    return df

    

In [44]:
def multi_regression_score_fedora(pidx_file_path, w_file_path, output_file_path, n=5, random_state=None):
    """Score an embedding on the Fedora package-category classification task.

    Packages are labelled by naming convention (``-doc``, ``-devel`` suffixes;
    ``perl-``, ``php-``, ``python2-``, ``python3-``, ``rubygem-`` prefixes)
    and a one-vs-rest logistic regression is cross-validated on their
    embedding vectors. The folds are inverted with ``InverseFold``, so each
    run fits on one fold and evaluates on the other ``n - 1``.

    Parameters
    ----------
    pidx_file_path : str
        JSON file loaded into ``pkg_index``; its keys are package names and
        its values are used to index rows of the weight matrix.
    w_file_path : str
        Text file read with ``np.loadtxt`` holding the embedding matrix.
    output_file_path :
        Not used inside this function. It exists so that the 3-element
        entries of the file lists can be unpacked with ``*f`` (callers put
        the method label in this slot).
    n : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed forwarded to ``StratifiedKFold``. ``None`` keeps the previous
        unseeded shuffling; pass an int to get reproducible scores.

    Returns
    -------
    numpy.ndarray
        ``[f1_micro, f1_macro, accuracy]`` averaged over the folds and
        rounded to 4 decimals.
    """
    with open(pidx_file_path, 'r') as file:
        pkg_index = json.load(file)
    w = np.loadtxt(w_file_path)

    docpkg = pkgsearch2(pkg_index, word='-doc')
    devpkg = pkgsearch2(pkg_index, word='-devel')
    # The '-dbg' suffix category is disabled in this variant.
    #dbgpkg = pkgsearch2(pkg_index, word='-dbg')
    perlpkg = pkgsearch3(pkg_index, word=r'^perl-')
    phppkg = pkgsearch3(pkg_index, word=r'^php-')
    pythonpkg = pkgsearch3(pkg_index, word=r'^python2-')
    python3pkg = pkgsearch3(pkg_index, word=r'^python3-')
    rubypkg = pkgsearch3(pkg_index, word=r'^rubygem-')

    pkgs = [docpkg, devpkg, perlpkg, phppkg, pythonpkg, python3pkg, rubypkg]

    print([len(p) for p in pkgs])

    # Membership is tested once per (package, category) pair; sets keep each
    # test O(1) instead of scanning a list every time.
    category_members = [set(p) for p in pkgs]

    X = []
    Y = []

    # A package that matches several categories (e.g. both a prefix and a
    # suffix rule) contributes one sample per matching category: the same
    # vector appears with different labels.
    for p in pkg_index:
        for i, members in enumerate(category_members):
            if p in members:
                X.append(w[pkg_index[p]])
                Y.append(i)

    # NOTE(review): the multi_class argument is reported as deprecated in
    # recent scikit-learn releases (1.5+) — confirm against the installed version.
    logre = LogisticRegression(multi_class='ovr', max_iter=100)
    scoring = {'f1_micro': 'f1_micro', 'f1_macro': 'f1_macro', 'accuracy': 'accuracy'}
    cv = InverseFold(StratifiedKFold(n, shuffle=True, random_state=random_state))
    scores = cross_validate(logre, X, Y, scoring=scoring, cv=cv)

    # One row per metric, one column per fold; average across folds.
    per_metric = [scores['test_f1_micro'], scores['test_f1_macro'], scores['test_accuracy']]
    return np.around(np.average(per_metric, axis=1), decimals=4)


def test_multi_fedora(filelist):
    """Build a table of Fedora classification scores, one row per embedding.

    Each element of ``filelist`` is unpacked into
    ``multi_regression_score_fedora``; its third item is used as the row
    label. Columns are ``f1_micro``, ``f1_macro`` and ``accuracy``.
    """
    metric_columns = ('f1_micro', 'f1_macro', 'accuracy')
    df = pd.DataFrame(columns=metric_columns)

    for entry in filelist:
        metrics = multi_regression_score_fedora(*entry)
        df.loc[entry[2]] = metrics

    return df

def test_ratio_fedora(f):
    """Return the f1-micro score of one Fedora embedding for a series of fold counts.

    ``f`` is unpacked into ``multi_regression_score_fedora`` once for each
    fold count, from 20 down to 2, and the first element of each result
    (f1_micro) is collected in that order.
    """
    fold_counts = (20, 10, 5, 4, 3, 2)

    f1_micro_by_ratio = []
    for folds in fold_counts:
        f1_micro_by_ratio.append(multi_regression_score_fedora(*f, n=folds)[0])
    return f1_micro_by_ratio
        
def test_multi_ratio_fedora(filelist, output_file_path):
    """Tabulate Fedora f1-micro per embedding and training ratio, and save it as CSV.

    Each element of ``filelist`` is passed to ``test_ratio_fedora``; its third
    item becomes the row label. The resulting frame is written to
    ``output_file_path`` with ``DataFrame.to_csv`` and also returned.
    """
    ratio_columns = ['5%','10%','20%','25%','33%','50%']
    df = pd.DataFrame(columns=ratio_columns)

    for entry in filelist:
        row = test_ratio_fedora(entry)
        df.loc[entry[2]] = row

    df.to_csv(output_file_path)
    return df


In [4]:
# Debian runs. Each entry is [package-index JSON, embedding weights file, method label];
# every run shares the same package index.
resultfile = [
    ['pkg.json', weights_path, method]
    for weights_path, method in (
        ('debian_nw_tn_d64w2lambda1e-05kn5r100tw1.01.01.0weights.txt', 'PCTADW-2'),
        ('debian_nw_tri0_tn_d128w2lambda1e-05kn5r100tw1.01.01.0weights.txt', 'PCTADW-1'),
        ('../result/dw128', 'DeepWalk'),
        ('../result/debian-dbow-128-d.vec', 'PV-DBOW'),
        ('../result/debian-dm-128-d.vec', 'PV-DM'),
        ('../result/debian-concate-dbow', 'DeepWalk-PV-DBOW'),
        ('../result/debian-concate-dm', 'DeepWalk-PV-DM'),
    )
]

In [40]:
# Fedora runs. Each entry is [package-index JSON, embedding weights file, method label];
# every run shares the same package index.
fedorafile = [
    ['fedorapkg.json', weights_path, method]
    for weights_path, method in (
        ('fedora_nw_tn_d64w2lambda1e-05kn5r100tw1.01.01.0weights.txt', 'PCTADW-2'),
        ('fedora_nw_tri0_tn_d128w2lambda1e-05kn5r100tw1.01.01.0weights.txt', 'PCTADW-1'),
        ('../result/fdw128', 'DeepWalk'),
        ('../result/fedora-dbow-128-d.vec', 'PV-DBOW'),
        ('../result/fedora-dm-128-d.vec', 'PV-DM'),
        ('../result/fedora-concate-dbow', 'DeepWalk-PV-DBOW'),
        ('../result/fedora-concate-dm', 'DeepWalk-PV-DM'),
    )
]

In [16]:
# Score every Fedora embedding with the function's default fold count.
# NOTE(review): the saved output below lists 6 category counts per run, while
# multi_regression_score_fedora as defined in this notebook builds 7 categories,
# so this output appears to come from an earlier version of the function.
# Re-run after Restart Kernel -> Run All to refresh it.
test_multi_fedora(fedorafile)

[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]
[1050, 4944, 3078, 897, 122, 2072]


Unnamed: 0,f1_micro,f1_macro,accuracy
PCTADW-2,0.9753,0.8943,0.9753
PCTADW-1,0.9757,0.8879,0.9757
DeepWalk,0.9175,0.8128,0.9175
PV-DBOW,0.8641,0.7026,0.8641
PV-DM,0.7378,0.5688,0.7378
DeepWalk-PV-DBOW,0.9726,0.8961,0.9726
DeepWalk-PV-DM,0.9613,0.8671,0.9613


In [41]:
# Sweep the fold count for every Fedora embedding and save the f1-micro table as CSV.
# 7 embeddings x 6 fold counts = 42 cross-validation runs, each printing its category counts.
ft2 = test_multi_ratio_fedora(fedorafile,'fedora-regression-test-f1-micro.csv')

[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072, 1238]
[1050, 4944, 3078, 897, 2372, 2072

In [42]:
# Display the Fedora f1-micro table returned by test_multi_ratio_fedora.
ft2

Unnamed: 0,5%,10%,20%,25%,33%,50%
PCTADW-2,0.9172,0.9242,0.9297,0.9316,0.9328,0.9338
PCTADW-1,0.9176,0.9233,0.9275,0.9286,0.9294,0.9313
DeepWalk,0.8558,0.8598,0.8645,0.8675,0.8682,0.8683
PV-DBOW,0.6474,0.6755,0.6954,0.7015,0.7077,0.7132
PV-DM,0.5204,0.5562,0.5825,0.5849,0.5915,0.5968
DeepWalk-PV-DBOW,0.9079,0.9161,0.9227,0.9247,0.9269,0.9268
DeepWalk-PV-DM,0.8702,0.8883,0.8993,0.9003,0.9014,0.9039


In [43]:
# Emit the Fedora table as LaTeX source, rounded to 3 decimals.
print(ft2.round(3).to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &     5\% &    10\% &    20\% &    25\% &    33\% &    50\% \\
\midrule
PCTADW-2         &  0.917 &  0.924 &  0.930 &  0.932 &  0.933 &  0.934 \\
PCTADW-1         &  0.918 &  0.923 &  0.928 &  0.929 &  0.929 &  0.931 \\
DeepWalk         &  0.856 &  0.860 &  0.864 &  0.868 &  0.868 &  0.868 \\
PV-DBOW          &  0.647 &  0.676 &  0.695 &  0.702 &  0.708 &  0.713 \\
PV-DM            &  0.520 &  0.556 &  0.582 &  0.585 &  0.592 &  0.597 \\
DeepWalk-PV-DBOW &  0.908 &  0.916 &  0.923 &  0.925 &  0.927 &  0.927 \\
DeepWalk-PV-DM   &  0.870 &  0.888 &  0.899 &  0.900 &  0.901 &  0.904 \\
\bottomrule
\end{tabular}



In [29]:
# NOTE(review): `ft` is not assigned in any cell shown in this notebook, so this
# cell relies on leftover kernel state and would raise NameError on a fresh run.
# TODO: add the cell that produces `ft`, or remove this one.
ft

Unnamed: 0,5%,10%,20%,25%,33%,50%
PCTADW-2,0.9078,0.9167,0.9239,0.9252,0.9274,0.9285
PCTADW-1,0.9084,0.9147,0.919,0.9205,0.9235,0.9248
DeepWalk,0.8365,0.8421,0.8479,0.8495,0.8519,0.8512
PV-DBOW,0.7416,0.7664,0.7866,0.7908,0.7961,0.8017
PV-DM,0.5937,0.6292,0.656,0.6634,0.6709,0.6763
DeepWalk-PV-DBOW,0.8998,0.9081,0.9168,0.9189,0.9198,0.9195
DeepWalk-PV-DM,0.8533,0.8763,0.8874,0.8899,0.8903,0.8916


In [25]:
# NOTE(review): `ft` is not assigned in any cell shown in this notebook. The
# LaTeX saved below (In [25]) also disagrees with the `ft` table displayed by
# In [29] (e.g. PCTADW-2 at 5%: 0.969 here vs 0.9078 there), so the two outputs
# come from different runs. TODO: regenerate both from a single definition of `ft`.
print(ft.round(3).to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &     5\% &    10\% &    20\% &    25\% &    33\% &    50\% \\
\midrule
PCTADW-2         &  0.969 &  0.975 &  0.979 &  0.980 &  0.981 &  0.982 \\
PCTADW-1         &  0.972 &  0.976 &  0.980 &  0.980 &  0.981 &  0.982 \\
DeepWalk         &  0.908 &  0.911 &  0.916 &  0.919 &  0.920 &  0.922 \\
PV-DBOW          &  0.813 &  0.836 &  0.853 &  0.856 &  0.862 &  0.870 \\
PV-DM            &  0.655 &  0.696 &  0.724 &  0.729 &  0.736 &  0.742 \\
DeepWalk-PV-DBOW &  0.963 &  0.970 &  0.974 &  0.975 &  0.978 &  0.979 \\
DeepWalk-PV-DM   &  0.932 &  0.950 &  0.961 &  0.963 &  0.965 &  0.967 \\
\bottomrule
\end{tabular}



In [5]:
# Sweep the fold count for every Debian embedding and save the f1-micro table as CSV.
# 7 embeddings x 6 fold counts = 42 cross-validation runs, each printing its category counts.
t = test_multi_ratio(resultfile,'debian-regression-test-f1-micro.csv')

[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816, 1094]
[2062, 5758, 1464, 3688, 405, 3277, 1816

In [8]:
# Display the Debian f1-micro table returned by test_multi_ratio.
t

Unnamed: 0,5%,10%,20%,25%,33%,50%
PCTADW-2,0.91,0.918,0.922,0.923,0.924,0.925
PCTADW-1,0.911,0.917,0.92,0.921,0.923,0.923
DeepWalk,0.869,0.876,0.88,0.882,0.884,0.887
PV-DBOW,0.614,0.659,0.693,0.699,0.708,0.716
PV-DM,0.432,0.48,0.51,0.515,0.522,0.525
DeepWalk-PV-DBOW,0.902,0.913,0.92,0.922,0.924,0.926
DeepWalk-PV-DM,0.859,0.876,0.886,0.888,0.892,0.894
