In [1]:
import pandas as pd
import numpy as np
import requests

## Prepare repo list for further processing

In [None]:
df = pd.read_excel("repo-candidates/combined.xlsx")

# remove less useful fields, serve as main record
df.to_csv(
    "repo-candidates/main.csv",
    index=False,
    columns=[
        "id", "full_name", "topics", "default_branch",
        "homepage", "description", "fork", "language",
        "license", "pushed_at", "created_at",
        "updated_at", "size", "forks_count", "stargazers_count",
        "watchers_count", "open_issues_count", "has_discussions",
        "has_downloads", "has_issues", "has_pages", "has_projects",
        "has_wiki", "is_template", "web_commit_signoff_required",
    ]
)

# slim version for collecting go.mod files on cloud
df.to_csv(
    "repo-candidates/slim.csv",
    columns=[
        'id', 'full_name', 'stargazers_count',
        'forks_count', 'created_at'
    ]
)

In [37]:
df_main = pd.read_csv(
    "repo-candidates/main.csv",
    parse_dates=[ "pushed_at", "created_at", "updated_at" ]
)

In [133]:
df_main.describe()

Unnamed: 0,id,size,forks_count,stargazers_count,watchers_count,open_issues_count
count,67923.0,67923.0,67923.0,67923.0,67923.0,67923.0
mean,204668900.0,11216.68,42.002253,258.998204,258.998204,9.832958
std,167668100.0,98398.22,317.355847,1667.278762,1667.278762,76.225945
min,20912.0,0.0,0.0,10.0,10.0,0.0
25%,67041000.0,62.0,3.0,15.0,15.0,0.0
50%,156488500.0,321.0,7.0,29.0,29.0,1.0
75%,306932100.0,3644.0,20.0,85.0,85.0,5.0
max,677517000.0,14983450.0,38999.0,113739.0,113739.0,8643.0


## Collect go.mod files(run on cloud) 

## Download go.mod files and extract to mod-info dir

In [105]:
!scp data-science-station:/home/ubuntu/projects/big-data-task/play-data-science/msr-golang/mod-info.tgz .

mod-info.tgz                                  100%   78MB   2.4MB/s   00:33    


In [None]:
!rm -fr tmp1
!mkdir -p tmp1
!tar -xzf mod-info.tgz -C tmp1

## Parse go.mod files to get module reference count

In [None]:
%time
# call the script `parse-module-deps.py`
from parser import parse_deps

parse_deps(base_dir="mod-info.oci", deps_file="dependencies.csv", trace=False)

## Generate module refs (group by dep_module)

In [15]:
df_deps = pd.read_csv('mod-info.oci/dependencies.csv')
df_mod_refs = df_deps.groupby("dep_module").agg(
    refs=pd.NamedAgg(column="full_name", aggfunc="count"),
).sort_values("refs", ascending=False).reset_index()

In [54]:
df_mod_refs

Unnamed: 0,module,freq_ver,refs
0,0xacab.org/leap/obfsvpn,v0.0.0-20220626143947-feff527c00e5,1
123770,github.com/tencentcloud/tencentcloud-sdk-go/te...,v1.0.469,1
98475,github.com/tailscale/sqlite,v0.0.0-20221025150348-0716cf4a392b,1
98482,github.com/tailscale/win,v0.0.0-20230710211752-84569fd814a9,1
98503,github.com/taion809/haikunator,v0.0.0-20150324135039-4e414e676fd1,1
...,...,...,...
48838,github.com/google/uuid,v1.3.0,36062
189320,gopkg.in/yaml.v2,v2.4.0,36744
97205,github.com/stretchr/testify,v1.7.0,46853
96164,github.com/spf13/pflag,v1.0.5,50205


In [24]:
df_vers = df_deps.groupby(["dep_module", "dep_version"]).agg(refs=pd.NamedAgg(column="full_name", aggfunc="count"))
df_vers.reset_index(level=1, inplace=True)
df_vers.reset_index(level=0, inplace=True)
# get most referenced version
df_mod_refs = df_vers.sort_values("refs").drop_duplicates(["dep_module"], keep="last")
df_mod_refs.rename(columns={'dep_module': 'module', 'dep_version': 'freq_ver'}, inplace=True)

In [6]:
df_name_conv = df_mod_refs.query("module.str.startswith('github.com') == False")
df_name_conv.to_csv("name-conv-module-refs.csv", index=False)

## Associate non-github named modules with their github name

In [None]:
%%time
# call the script `convert-names.py`
from datagrab.repo import convert_names

convert_names("name-conv-module-refs.csv", progress_file="name-conv-progress.csv", trace=False)


In [8]:
df_11 = pd.read_csv("name-conv-module-refs.csv")
df_22 = pd.read_csv("name-conv-progress.csv")

## Join the main record with module ref count

In [76]:
df_mod_refs.query("module == 'gopkg.in/yaml.v2'")

Unnamed: 0,module,freq_ver,refs
189320,gopkg.in/yaml.v2,v2.4.0,36744


In [80]:
def copy_github_column(row):
    if row['github_name'] != row['github_name'] or row['github_name'] == '-':
        return row['module']
    else:
        return row['github_name']
        
def strip_github_prefix(row):
    return row["github_name"][len("github.com/"):]

    
# merge main record with module ref count
df_name_mapping = pd.read_csv("name-conv-progress.csv", parse_dates=['last_updated'])
#df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module", right_index=False)
df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module").drop(columns=['last_updated'])
df_1['github_name'] = df_1.apply(copy_github_column, axis=1)

In [84]:
df_1.query("module == 'gopkg.in/yaml.v3'")

Unnamed: 0,module,freq_ver,refs,github_name,fail_reason
25122,gopkg.in/yaml.v3,v3.0.1,18901,github.com/go-yaml/yaml,


In [51]:
# merge references to same library using different names
df_2 = df_1.groupby(["github_name", "freq_ver"]).agg(
    refs=pd.NamedAgg(column="refs", aggfunc="sum"),
).sort_values("refs", ascending=False)
df_2.reset_index(level=1, inplace=True)
df_2.reset_index(level=0, inplace=True)

In [None]:
df_2.head()

In [65]:
df_1 = df_1.drop(columns=["fail_reason", "module"])

In [85]:
df_name_mapping

Unnamed: 0,module,github_name,fail_reason,last_updated
0,k8s.io/apimachinery,github.com/kubernetes/apimachinery,,2023-08-28 04:38:06
1,golang.org/x/crypto,go.googlesource.com/crypto,,2023-08-28 04:38:06
2,k8s.io/client-go,github.com/kubernetes/client-go,,2023-08-28 04:38:07
3,gopkg.in/yaml.v2,github.com/go-yaml/yaml,,2023-08-28 04:38:07
4,k8s.io/api,github.com/kubernetes/api,,2023-08-28 04:38:07
...,...,...,...,...
2105,go.mozilla.org/cose,github.com/mozilla-services/go-cose,,2023-08-28 05:06:36
2106,go.mau.fi/mauview,github.com/tulir/mauview.git,,2023-08-28 05:06:39
2107,go.mau.fi/mautrix-gmessages/libgm,github.com/mautrix/gmessages.git,,2023-08-28 05:06:40
2108,go.mau.fi/cbind,github.com/tulir/cbind.git,,2023-08-28 05:06:42


In [None]:
df_gh = df_1.query("github_name.str.startswith('github.com')")
df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)
#df_gh = df_gh.merge(df_name_mapping, how="left", on="github_name").drop(columns=['last_updated', 'fail_reason'])

In [87]:
df_gh

Unnamed: 0,module,freq_ver,refs,github_name,fail_reason,full_name
1,github.com/tencentcloud/tencentcloud-sdk-go/te...,v1.0.469,1,github.com/tencentcloud/tencentcloud-sdk-go/te...,,tencentcloud/tencentcloud-sdk-go/tencentcloud/dms
2,github.com/tailscale/sqlite,v0.0.0-20221025150348-0716cf4a392b,1,github.com/tailscale/sqlite,,tailscale/sqlite
3,github.com/tailscale/win,v0.0.0-20230710211752-84569fd814a9,1,github.com/tailscale/win,,tailscale/win
4,github.com/taion809/haikunator,v0.0.0-20150324135039-4e414e676fd1,1,github.com/taion809/haikunator,,taion809/haikunator
5,github.com/taiphamd/efibootselector,v0.2.4,1,github.com/taiphamd/efibootselector,,taiphamd/efibootselector
...,...,...,...,...,...,...
25126,github.com/google/uuid,v1.3.0,36062,github.com/google/uuid,,google/uuid
25127,gopkg.in/yaml.v2,v2.4.0,36744,github.com/go-yaml/yaml,,go-yaml/yaml
25128,github.com/stretchr/testify,v1.7.0,46853,github.com/stretchr/testify,,stretchr/testify
25129,github.com/spf13/pflag,v1.0.5,50205,github.com/spf13/pflag,,spf13/pflag


In [88]:
df_gh.rename(columns={'module': 'alt'}, inplace=True)
#df_gh['alt'] = df_gh['alt'].fillna(df_gh['github_name'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gh.rename(columns={'module': 'alt'}, inplace=True)


In [91]:
df_gh

Unnamed: 0,alt,freq_ver,refs,github_name,fail_reason,full_name
1,github.com/tencentcloud/tencentcloud-sdk-go/te...,v1.0.469,1,github.com/tencentcloud/tencentcloud-sdk-go/te...,,tencentcloud/tencentcloud-sdk-go/tencentcloud/dms
2,github.com/tailscale/sqlite,v0.0.0-20221025150348-0716cf4a392b,1,github.com/tailscale/sqlite,,tailscale/sqlite
3,github.com/tailscale/win,v0.0.0-20230710211752-84569fd814a9,1,github.com/tailscale/win,,tailscale/win
4,github.com/taion809/haikunator,v0.0.0-20150324135039-4e414e676fd1,1,github.com/taion809/haikunator,,taion809/haikunator
5,github.com/taiphamd/efibootselector,v0.2.4,1,github.com/taiphamd/efibootselector,,taiphamd/efibootselector
...,...,...,...,...,...,...
25126,github.com/google/uuid,v1.3.0,36062,github.com/google/uuid,,google/uuid
25127,gopkg.in/yaml.v2,v2.4.0,36744,github.com/go-yaml/yaml,,go-yaml/yaml
25128,github.com/stretchr/testify,v1.7.0,46853,github.com/stretchr/testify,,stretchr/testify
25129,github.com/spf13/pflag,v1.0.5,50205,github.com/spf13/pflag,,spf13/pflag


In [129]:
df_final = df_main.merge(df_gh, how="left", on="full_name").drop(columns=["github_name", "fail_reason"])
df_final.sort_values("refs", ascending=False, inplace=True)
df_final = df_final[["full_name", "alt", "freq_ver", "refs", "stargazers_count", "created_at" ]]
df_final['created_at'] = df_final['created_at'].dt.year
#df_final.astype({"refs": "int32"})
df_final.rename(
    columns={
        'full_name':'Repository',
        'stargazers_count':'Stars',
        'refs':'Total Refs',
        'created_at':'Since',
        'freq_ver':'Most Used Version',
        'alt':'Import Name',
    },
    inplace=True
)

In [130]:
df_latex = df_final.head(20).astype({"Total Refs": "int32"})
df_latex

Unnamed: 0,Repository,Import Name,Most Used Version,Total Refs,Stars,Since
64598,pkg/errors,github.com/pkg/errors,v0.9.1,73707,8067,2015
66943,spf13/pflag,github.com/spf13/pflag,v1.0.5,50205,2158,2013
65461,stretchr/testify,github.com/stretchr/testify,v1.7.0,46853,20391,2012
55811,go-yaml/yaml,gopkg.in/yaml.v2,v2.4.0,36744,6413,2014
42201,google/uuid,github.com/google/uuid,v1.3.0,36062,4509,2016
65390,gorilla/mux,github.com/gorilla/mux,v1.8.0,21763,19002,2012
67259,sirupsen/logrus,github.com/sirupsen/logrus,v1.8.1,21252,23118,2013
57638,mitchellh/go-homedir,github.com/mitchellh/go-homedir,v1.1.0,20406,1330,2014
55810,go-yaml/yaml,gopkg.in/yaml.v3,v3.0.1,18901,6413,2014
65703,davecgh/go-spew,github.com/davecgh/go-spew,v1.1.1,18609,5683,2013


In [None]:
df_latex.to_latex("__libs.tex", index=False)

## Join module vulnerability record

## ****************************** DEBUG CELLS ****************************** 

In [94]:
cost_dfs = []
for date_range in date_ranges:
    df = pd.read_excel("round2/%s-repo-%d-%s-%s.xlsx" % (lang, stars, date_range[0], date_range[1]))
    cost_dfs.append(df)
combined = pd.concat(cost_dfs)
combined.to_csv('round2/combined.csv')


In [125]:
df_latest_ver = pd.read_csv("latest_ver.csv")

In [None]:
df_latest_ver

In [131]:
df_latex = df_latex.merge(df_latest_ver, how="inner", on="Repository")
df_latex

Unnamed: 0,Repository,Import Name,Most Used Version,Total Refs,Stars,Since,Latest Version
0,pkg/errors,github.com/pkg/errors,v0.9.1,73707,8067,2015,v0.9.1
1,spf13/pflag,github.com/spf13/pflag,v1.0.5,50205,2158,2013,v1.0.5
2,stretchr/testify,github.com/stretchr/testify,v1.7.0,46853,20391,2012,v1.8.4
3,go-yaml/yaml,gopkg.in/yaml.v2,v2.4.0,36744,6413,2014,v3.0.1
4,go-yaml/yaml,gopkg.in/yaml.v2,v2.4.0,36744,6413,2014,v3.0.1
5,go-yaml/yaml,gopkg.in/yaml.v3,v3.0.1,18901,6413,2014,v3.0.1
6,go-yaml/yaml,gopkg.in/yaml.v3,v3.0.1,18901,6413,2014,v3.0.1
7,google/uuid,github.com/google/uuid,v1.3.0,36062,4509,2016,v1.3.1
8,gorilla/mux,github.com/gorilla/mux,v1.8.0,21763,19002,2012,v1.8.0
9,sirupsen/logrus,github.com/sirupsen/logrus,v1.8.1,21252,23118,2013,v1.9.3


In [132]:
df_latex = df_latex[["Repository", "Import Name", "Latest Version", "Most Used Version", "Total Refs", "Stars", "Since"]]
df_latex.to_latex("__libs.tex")