In [1]:
import pandas as pd
import numpy as np
import requests

## Prepare repo list for further processing

In [2]:
df = pd.read_excel("go-repos/golang-repo-10-combined.xlsx")

# remove less useful fields, serve as main record
df.to_csv(
    "main.csv",
    index=False,
    columns=[
        "id", "full_name", "topics", "default_branch",
        "homepage", "description", "fork", "language",
        "license", "pushed_at", "created_at",
        "updated_at", "size", "forks_count", "stargazers_count",
        "watchers_count", "open_issues_count", "has_discussions",
        "has_downloads", "has_issues", "has_pages", "has_projects",
        "has_wiki", "is_template", "web_commit_signoff_required",
    ]
)

# slim version for collecting go.mod files on cloud
df.to_csv(
    "slim.csv",
    columns=[
        'id', 'full_name', 'stargazers_count',
        'forks_count', 'created_at'
    ]
)

In [3]:
df_main = pd.read_csv(
    "main.csv",
    parse_dates=[ "pushed_at", "created_at", "updated_at" ]
)

In [None]:
df_top20 = df_top.head(20)
df_top20.rename(columns={"full_name": "repository", "forks_count": "forks", "stargazers_count": "stars", "watchers_count": "watchers", "open_issues_count": "issues"}, inplace=True)
df_top20
df_top20.to_latex("__tab1.tex", index=False)

In [None]:
stats = df_main.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])
stats = stats.astype('int32').drop(columns=["id"])
stats.rename(columns={"forks_count": "forks", "stargazers_count": "stars", "watchers_count": "watchers", "open_issues_count": "issues"}, inplace=True)
stats.to_latex("__tab1.tex", index=False)

In [None]:
stats

## Collect go.mod files(run on cloud) 

## Download go.mod files and extract to mod-info dir

In [None]:
!scp data-science-station:/home/ubuntu/projects/big-data-task/play-data-science/msr-golang/mod-info.tgz .

In [None]:
!rm -fr tmp1
!mkdir -p tmp1
!tar -xzf mod-info.tgz -C tmp1

## Parse go.mod files to get module reference count

In [10]:
%%time
# call the script `parse-module-deps.py`
from parser import parse_deps_from_parquet

parse_deps_from_parquet(parquet_file="gomod.parquet", deps_file="dependencies-parquet.csv", trace=True)

CPU times: user 2min 27s, sys: 3.6 s, total: 2min 31s
Wall time: 2min 32s


## Generate module refs (group by dep_module)

In [None]:
df_deps = pd.read_csv('dependencies-parquet.csv')
df_mod_refs = df_deps.groupby("dep_module").agg(
    refs=pd.NamedAgg(column="full_name", aggfunc="count"),
).sort_values("refs", ascending=False).reset_index()
df_vers = df_deps.groupby(["dep_module", "dep_version"]).agg(refs=pd.NamedAgg(column="full_name", aggfunc="count"))
df_vers.reset_index(level=1, inplace=True)
df_vers.reset_index(level=0, inplace=True)
# get most referenced version
df_mod_refs = df_vers.sort_values("refs").drop_duplicates(["dep_module"], keep="last")
df_mod_refs.rename(columns={'dep_module': 'module', 'dep_version': 'freq_ver'}, inplace=True)

In [None]:
df_name_conv = df_mod_refs.query("module.str.startswith('github.com') == False")
df_name_conv.to_csv("name-conv-module-refs.csv", index=False)

## Associate non-github named modules with their github name

In [None]:
%%time
# call the script `convert-names.py`
from datagrab.repo import convert_names

convert_names("name-conv-module-refs.csv", progress_file="name-conv-progress.csv", trace=False)


## Join the main record with module ref count

In [None]:
def copy_github_column(row):
    if row['github_name'] != row['github_name'] or row['github_name'] == '-':
        return row['module']
    else:
        return row['github_name']
        
def strip_github_prefix(row):
    return row["github_name"][len("github.com/"):]

    
# merge main record with module ref count
df_name_mapping = pd.read_csv("name-conv-progress.csv", parse_dates=['last_updated'])
df_progress = pd.read_csv("mod-info.oci/progress.csv", parse_dates=["last_updated"])

#df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module", right_index=False)
df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module").drop(columns=['last_updated'])
df_1['github_name'] = df_1.apply(copy_github_column, axis=1)
# merge references to same library using different names
df_2 = df_1.groupby(["github_name", "freq_ver"]).agg(
    refs=pd.NamedAgg(column="refs", aggfunc="sum"),
).sort_values("refs", ascending=False)
df_2.reset_index(level=1, inplace=True)
df_2.reset_index(level=0, inplace=True)
df_gh = df_1.query("github_name.str.startswith('github.com')")
df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)
df_gh = df_gh.merge(df_progress, how="left", on="full_name").drop(columns=['last_updated', 'fail_reason'])
df_gh.rename(columns={'module': 'alt'}, inplace=True)
#df_gh = df_gh.merge(df_name_mapping, how="left", on="github_name").drop(columns=['last_updated', 'fail_reason'])

df_final = df_main.merge(df_gh, how="left", on="full_name").drop(columns=["github_name"])
df_final.sort_values("refs", ascending=False, inplace=True)
df_final = df_final[["full_name", "alt", "latest_version", "freq_ver", "refs", "stargazers_count", "created_at" ]]
df_final['created_at'] = df_final['created_at'].dt.year
#df_final.astype({"refs": "int32"})
df_final.rename(
    columns={
        'full_name':'Repository',
        'stargazers_count':'Stars',
        'refs':'Refs',
        'created_at':'Since',
        'freq_ver':'Most Used Version',
        'latest_version': 'Latest Version',
        'alt':'Import Name',
    },
    inplace=True
)

## Generate dependency graph

In [4]:
df_deps_grh = pd.read_csv('dependencies-parquet2.csv')


In [6]:
df_mod_refs = df_deps_grh.groupby(["dep_module", "full_name"]).agg(
    refs=pd.NamedAgg(column="dep_module", aggfunc="count"),
).sort_values("refs", ascending=False).reset_index()


In [7]:
df_mod_refs1 = df_mod_refs.query("refs > 17")
df_mod_refs1['dep_mod'] = df_mod_refs1.apply(lambda r: r['dep_module'].replace('github.com/', ''), axis=1)
df_mod_refs1.to_csv("go-deps.csv", columns=['full_name', 'dep_mod'], index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mod_refs1['dep_mod'] = df_mod_refs1.apply(lambda r: r['dep_module'].replace('github.com/', ''), axis=1)


In [9]:
df_pivot = df_mod_refs1.pivot_table(columns="dep_mod", index="full_name", values="refs").fillna(0)
#df_pivot.fillna(0).to_csv("go-deps-matrix.csv")
df_pivot

dep_mod,0chain/errors,0chain/gosdk,0xAX/notificator,0xERR0R/blocky,0xPolygon/go-ibft,0xPolygon/polygon-edge,0xPolygonHermez/zkevm-node,0xrawsec/golang-evtx,0xrawsec/golang-utils,0xsequence/ethkit,...,zs5460/art,zsais/go-gin-prometheus,zscaler/zscaler-sdk-go,zserge/lorca,ztrue/shutdown,ztrue/tracerr,zu1k/nali,zyedidia/generic,zyedidia/gopher-luar,zyedidia/poller
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0chain/blobber,20.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0chain/gosdk,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0chain/zboxcli,26.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0chain/zwalletcli,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0x2mev/mev-bsc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zvchain/zvchain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyedidia/knit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0
zyedidia/tcell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
zzsnn/proxypool,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Extract top 20 libraries for reporting

In [None]:
df_latex = df_final.head(20).astype({"Refs": "int32"})
df_latex

In [None]:
df_latex.to_latex("__libs.tex", index=False)

## Identify vulnerabilities

### Load golang vulnerability dataset

In [None]:
df_vuln = pd.read_csv("govuls.csv", parse_dates=["published", "last_updated"])
df_vuln = df_vuln[["module", "fix_version", "vul_id", "alias", "published"]]

In [None]:
df_vuln.query("alias=='CVE-2021-3121'")

### Load top 2000 libraries

In [None]:
df_top2000_lib = df_gh.sort_values("refs", ascending=False).head(2000).drop(columns=["freq_ver"])

In [None]:
df_top2000_lib.query("latest_version == latest_version")

### Load top 2000 libraries latest version's dependencies

In [None]:
df_deps.head()

In [None]:
df_top2000_lib_deps = df_top2000_lib.merge(df_deps, how="inner", left_on=["full_name", "latest_version"], right_on=["full_name", "version"]).drop(columns=["public_name"]) 

In [None]:
df_top2000_lib_deps.head(30)

### Match vul dataset by name

In [None]:
import semver
def to_semver(v):
    try:
        return semver.version.Version.parse(v[1:] if v.startswith("v") else v)
    except AttributeError as e:
        return None
df_sec_iss = df_top2000_lib_deps.merge(df_vuln, how="inner", left_on="dep_module", right_on="module")
df_sec_iss["fix_version"] = df_sec_iss.apply(lambda r: to_semver(r['fix_version']) , axis=1)
df_sec_iss["dep_version"] = df_sec_iss.apply(lambda r: to_semver(r['dep_version']) , axis=1)
df_sec_iss = df_sec_iss.query("dep_version < fix_version").drop(columns=["github_name", "module"])

In [None]:
# df_sec_iss.drop(columns=["dep_version", "module", "fix_version"], inplace=True)
df_sec_iss = df_sec_iss.sort_values("refs", ascending=False)

In [None]:
df_po = df_sec_iss.groupby(["alt", "version"]).agg({"refs": "max", "alias": lambda col: ",".join(col)})
df_po = df_po.reset_index(level=1)
df_po = df_po.reset_index(level=0)


In [None]:
df_po = df_po.sort_values(["refs"], ascending=False).drop(columns=['refs'])
#df_po = df_po.sort_values(["refs"], ascending=False)
df_po.rename(
    columns={
        'alias':'Vul IDs',
        'version': 'Version',
        'alt':'Library Name',
    },
    inplace=True
)


In [None]:
df_po.head(20).to_latex("__vul.tex", index=False)

In [None]:
df_sec_iss.to_csv("potential-sec-issue.csv")

## ****************************** DEBUG CELLS ****************************** 

In [202]:
df_awesome = pd.read_csv("main.csv", parse_dates=['created_at', 'updated_at', 'pushed_at'])
df_awesome_cat = pd.read_csv("go-awesome-categories.csv")
df_awesome = df_awesome.merge(df_awesome_cat, how="inner", on="full_name")
df_awesome.to_csv(
    "go-awesome-repos.csv",
    index=False,
    columns=[
        'id', 'full_name', 'stargazers_count',
        'forks_count', 'created_at'
    ]
)

In [None]:
cost_dfs = []
for date_range in date_ranges:
    df = pd.read_excel("round2/%s-repo-%d-%s-%s.xlsx" % (lang, stars, date_range[0], date_range[1]))
    cost_dfs.append(df)
combined = pd.concat(cost_dfs)
combined.to_csv('round2/combined.csv')


In [None]:
df_latest_ver = pd.read_csv("latest_ver.csv")

In [None]:
df_latest_ver

In [None]:
df_latex = df_latex.merge(df_latest_ver, how="inner", on="Repository")
df_latex

In [None]:
df_latex = df_latex[["Repository", "Import Name", "Latest Version", "Most Used Version", "Total Refs", "Stars", "Since"]]
df_latex.to_latex("__libs.tex")

### check go.mod file size

In [None]:
df_mod_size = pd.read_csv("gomod-size.csv")

In [None]:
df_mod_size.describe()

In [None]:
df_parquet = pd.read_parquet("gomod.parquet")


In [None]:
df_parquet.head()

### process commit data

In [9]:
# fix full_name to reverse repo/owner
def reverse_full_name(r):
    repo, owner = r['full_name'].split('/', 1)
    return f"{owner}/{repo}"
df_commit = pd.read_csv("commit-info/commits.csv", parse_dates=["author_date"])

df_commit['full_name'] = df_commit.apply(lambda r: reverse_full_name(r), axis=1)
df_commit.to_csv('commits-go-awesome.csv', index=False)

In [8]:
df_commit.dtypes

full_name              object
branch                 object
sha                    object
author_name            object
author_date    datetime64[ns]
verified                int64
dtype: object

In [2]:
df_commit = pd.read_csv('commits-go-awesome.csv', parse_dates=["author_date"])

In [None]:

from github import Github
from datagrab.github.common import load_access_token
from datagrab.github.common import load_repo_info
from datetime import datetime

client = Github(load_access_token(), per_page=100) 
repo = load_repo_info(client, "huggingface/transformers")
cmts = []
#for cmt in repo.get_issues_comments(since=datetime(2023, 9, 1, 0, 0, 0)):
for cmt in repo.get_issues_comments(sort='updated', direction='desc'):
    obj = vars(cmt).get('_rawData', {})
    cmts.append(obj)
cmts    