In [1]:
import pandas as pd
import numpy as np
import requests

## Prepare repo list for further processing

In [83]:
df = pd.read_excel("repo-candidates/combined.xlsx")

# remove less useful fields, serve as main record
df.to_csv(
    "repo-candidates/main.csv",
    index=False,
    columns=[
        "id", "full_name", "topics", "default_branch",
        "homepage", "description", "fork", "language",
        "license", "pushed_at", "created_at",
        "updated_at", "size", "forks_count", "stargazers_count",
        "watchers_count", "open_issues_count", "has_discussions",
        "has_downloads", "has_issues", "has_pages", "has_projects",
        "has_wiki", "is_template", "web_commit_signoff_required",
    ]
)

# slim version for collecting go.mod files on cloud
df.to_csv(
    "repo-candidates/slim.csv",
    columns=[
        'id', 'full_name', 'stargazers_count',
        'forks_count', 'created_at'
    ]
)

df_main = pd.read_csv(
    "repo-candidates/main.csv",
    parse_dates=[ "pushed_at", "created_at", "updated_at" ]
)

## Collect go.mod files(run on cloud) 

## Download go.mod files and extract to mod-info dir

In [105]:
!scp data-science-station:/home/ubuntu/projects/big-data-task/play-data-science/msr-golang/mod-info.tgz .

mod-info.tgz                                  100%   78MB   2.4MB/s   00:33    


In [None]:
!rm -fr tmp1
!mkdir -p tmp1
!tar -xzf mod-info.tgz -C tmp1

## Parse go.mod files to get module reference count

In [None]:
%time
# call the script `parse-module-deps.py`
from parser import parse_deps

parse_deps(base_dir="mod-info.oci", deps_file="dependencies.csv", trace=False)

## Generate module refs (group by dep_module)

In [108]:
df_deps = pd.read_csv('mod-info.oci/dependencies.csv')
df_mod_refs = df_deps.groupby("dep_module").agg(
    refs=pd.NamedAgg(column="full_name", aggfunc="count"),
).sort_values("refs", ascending=False).reset_index()

In [112]:
df_name_conv = df_mod_refs.query("dep_module.str.startswith('github.com') == False")
df_name_conv.rename(columns={'dep_module': 'module'}, inplace=True)
df_name_conv.to_csv("name-conv-module-refs.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_name_conv.rename(columns={'dep_module': 'module'}, inplace=True)


## Associate non-github named modules with their github name

In [2]:
%%time
# call the script `convert-names.py`
from datagrab.repo import convert_names

convert_names("name-conv-module-refs.csv", base_dir="mod-info.oci", trace=False)

fail to convert vbom.ml/util to github name due to HTTPSConnectionPool(host='vbom.ml', port=443): Max retries exceeded with url: /util?go-get=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x10dcc3b90>: Failed to resolve 'vbom.ml' ([Errno 8] nodename nor servname provided, or not known)"))
fail to convert kubean.io/api to github name due to HTTPSConnectionPool(host='kubean.io', port=443): Max retries exceeded with url: /api?go-get=1 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1002)')))
fail to convert opensearch.opster.io to github name due to HTTPSConnectionPool(host='opensearch.opster.io', port=443): Max retries exceeded with url: /?go-get=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x12537d750>: Failed to resolve 'opensearch.opster.io' ([Errno 8] nodename nor servname provided, or not known)"))
fail to convert gitlab.com/yawning/obfs4.git to 

## Join the main record with module ref count

In [51]:
df_mod_refs.rename(columns={"dep_module":"module"}, inplace=True)
df_mod_refs

Unnamed: 0,module,refs
0,github.com/stretchr/testify,117121
1,github.com/spf13/cobra,83612
2,github.com/pkg/errors,63958
3,github.com/google/uuid,50368
4,k8s.io/apimachinery,49117
...,...,...
20794,github.com/holoplot/go-evdev,1
20795,github.com/chris-wood/ohttp-go,1
20796,github.com/chrisdinn/vector-db,1
20797,github.com/chrismarget/cloudkey-led,1


In [101]:
def copy_github_column(row):
    if row['github_name'] != row['github_name'] or row['github_name'] == '-':
        return row['module']
    else:
        return row['github_name']
        
def strip_github_prefix(row):
    return row["github_name"][len("github.com/"):]

    
# merge main record with module ref count
df_name_mapping = pd.read_csv("mod-info.oci/name-conv-progress.csv", parse_dates=['last_updated'])
#df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module", right_index=False)
df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module").drop(columns=['last_updated'])
df_1['github_name'] = df_1.apply(copy_github_column, axis=1)
df_2 = df_1.groupby("github_name").agg(
    refs=pd.NamedAgg(column="refs", aggfunc="sum"),
).sort_values("refs", ascending=False).reset_index()
df_gh = df_2.query("github_name.str.startswith('github.com')")
df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)
df_gh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)


Unnamed: 0,github_name,refs,full_name
0,github.com/stretchr/testify,117144,stretchr/testify
1,github.com/spf13/cobra,83612,spf13/cobra
2,github.com/go-yaml/yaml,76346,go-yaml/yaml
3,github.com/pkg/errors,63958,pkg/errors
4,github.com/google/uuid,50368,google/uuid
...,...,...,...
20287,github.com/philippfranke/multipart-related,1,philippfranke/multipart-related
20288,github.com/philhug/go-trustlists,1,philhug/go-trustlists
20289,github.com/c-ollins/crabada,1,c-ollins/crabada
20290,github.com/c-seeger/mac-gen-go,1,c-seeger/mac-gen-go


In [103]:
df_final = df_main.merge(df_gh, how="left", on="full_name").drop(columns=["github_name"])
df_final.sort_values("refs", ascending=False, inplace=True)
df_final = df_final[["full_name", "refs", "forks_count", "stargazers_count", "watchers_count", "created_at" ]]
df_final.rename(
    columns={
        'full_name':'repository',
        'forks_count':'forks',
        'stargazers_count':'stars',
        'watchers_count':'watchers',
        'create_at':'inception',
    },
    inplace=True
)
df_final

Unnamed: 0,repository,refs,forks,stars,watchers,created_at
64982,stretchr/testify,117144.0,1489,20391,20391,2012-10-16 16:43:17+00:00
66422,spf13/cobra,83612.0,2730,33010,33010,2013-09-03 20:40:26+00:00
55489,go-yaml/yaml,76346.0,1029,6413,6413,2014-03-05 18:54:57+00:00
64128,pkg/errors,63958.0,671,8067,8067,2015-12-27 12:05:38+00:00
41948,google/uuid,50368.0,364,4509,4509,2016-02-12 22:17:59+00:00
...,...,...,...,...,...,...
67918,espra/core,,14,102,102,2009-12-30 02:40:26+00:00
67919,kklis/gomemcache,,19,66,66,2009-12-30 20:25:49+00:00
67920,kevinwatt/ed2kcrawler,,6,31,31,2009-12-23 03:31:04+00:00
67921,feyeleanor/GoFORTH,,0,12,12,2009-12-25 22:16:06+00:00


## Join module vulnerability record

## ****************************** DEBUG CELLS ****************************** 

In [94]:
cost_dfs = []
for date_range in date_ranges:
    df = pd.read_excel("round2/%s-repo-%d-%s-%s.xlsx" % (lang, stars, date_range[0], date_range[1]))
    cost_dfs.append(df)
combined = pd.concat(cost_dfs)
combined.to_csv('round2/combined.csv')
