In [4]:
import pandas as pd
import numpy as np
import requests

## Prepare repo list for further processing

In [31]:
df = pd.read_excel("repo-candidates/combined.xlsx")

# remove less useful fields, serve as main record
df.to_csv(
    "repo-candidates/main.csv",
    index=False,
    columns=[
        "id", "full_name", "topics", "default_branch",
        "homepage", "description", "fork", "language",
        "license", "pushed_at", "created_at",
        "updated_at", "size", "forks_count", "stargazers_count",
        "watchers_count", "open_issues_count", "has_discussions",
        "has_downloads", "has_issues", "has_pages", "has_projects",
        "has_wiki", "is_template", "web_commit_signoff_required",
    ]
)

# slim version for collecting go.mod files on cloud
df.to_csv(
    "repo-candidates/slim.csv",
    columns=[
        'id', 'full_name', 'stargazers_count',
        'forks_count', 'created_at'
    ]
)

df_main = pd.read_csv(
    "repo-candidates/main.csv",
    parse_dates=[ "pushed_at", "created_at", "updated_at" ]
)

## Collect go.mod files(run on cloud) 

## Download go.mod files and extract to mod-info dir

In [105]:
!scp data-science-station:/home/ubuntu/projects/big-data-task/play-data-science/msr-golang/mod-info.tgz .

mod-info.tgz                                  100%   78MB   2.4MB/s   00:33    


In [None]:
!rm -fr tmp1
!mkdir -p tmp1
!tar -xzf mod-info.tgz -C tmp1

## Parse go.mod files to get module reference count

In [None]:
%time
# call the script `parse-module-deps.py`
from parser import parse_deps

parse_deps(base_dir="mod-info.oci", deps_file="dependencies.csv", trace=False)

## Generate module refs (group by dep_module)

In [5]:
df_deps = pd.read_csv('mod-info.oci/dependencies.csv')
df_mod_refs = df_deps.groupby("dep_module").agg(
    refs=pd.NamedAgg(column="full_name", aggfunc="count"),
).sort_values("refs", ascending=False).reset_index()

In [6]:
df_mod_refs.rename(columns={'dep_module': 'module'}, inplace=True)
df_name_conv = df_mod_refs.query("module.str.startswith('github.com') == False")
df_name_conv.to_csv("name-conv-module-refs.csv", index=False)

## Associate non-github named modules with their github name

In [None]:
%%time
# call the script `convert-names.py`
from datagrab.repo import convert_names

convert_names("name-conv-module-refs.csv", progress_file="name-conv-progress.csv", trace=False)


## Join the main record with module ref count

In [7]:
df_mod_refs

Unnamed: 0,module,refs
0,github.com/stretchr/testify,149893
1,github.com/spf13/cobra,102532
2,github.com/pkg/errors,87758
3,k8s.io/apimachinery,64483
4,golang.org/x/crypto,63981
...,...,...
25126,github.com/taskcluster/taskcluster-lib-urls,1
25127,github.com/grijul/otpgen,1
25128,github.com/tarndt/sema,1
25129,github.com/tarm/goserial,1


In [28]:
def copy_github_column(row):
    if row['github_name'] != row['github_name'] or row['github_name'] == '-':
        return row['module']
    else:
        return row['github_name']
        
def strip_github_prefix(row):
    return row["github_name"][len("github.com/"):]

    
# merge main record with module ref count
df_name_mapping = pd.read_csv("name-conv-progress.csv", parse_dates=['last_updated'])
#df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module", right_index=False)
df_1 = df_mod_refs.merge(df_name_mapping, how="left", on="module").drop(columns=['last_updated'])
df_1['github_name'] = df_1.apply(copy_github_column, axis=1)

# merge references to same library using different names
df_2 = df_1.groupby("github_name").agg(
    refs=pd.NamedAgg(column="refs", aggfunc="sum"),
).sort_values("refs", ascending=False).reset_index()
#df_2 = df_1
df_gh = df_2.query("github_name.str.startswith('github.com')")
df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)
df_gh = df_gh.merge(df_name_mapping, how="left", on="github_name").drop(columns=['last_updated', 'fail_reason'])
df_gh.rename(columns={'module': 'alt'}, inplace=True)
df_gh['alt'] = df_gh['alt'].fillna(df_gh['github_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gh['full_name'] = df_gh.apply(strip_github_prefix, axis=1)


In [15]:
df_name_mapping.groupby(["module", "github_name"]).agg(cnt=pd.NamedAgg(column="module", aggfunc="count")).query("cnt > 1")

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt
module,github_name,Unnamed: 2_level_1


In [29]:
df_gh

Unnamed: 0,github_name,refs,full_name,alt
0,github.com/stretchr/testify,149916,stretchr/testify,gopkg.in/stretchr/testify.v1
1,github.com/spf13/cobra,102532,spf13/cobra,github.com/spf13/cobra
2,github.com/go-yaml/yaml,94880,go-yaml/yaml,gopkg.in/yaml.v2
3,github.com/go-yaml/yaml,94880,go-yaml/yaml,gopkg.in/yaml.v3
4,github.com/go-yaml/yaml,94880,go-yaml/yaml,gopkg.in/yaml.v1
...,...,...,...,...
24340,github.com/planetscale/pargzip,1,planetscale/pargzip,github.com/planetscale/pargzip
24341,github.com/globocom/glbgelf,1,globocom/glbgelf,github.com/globocom/glbgelf
24342,github.com/cyclegen/tdx-go,1,cyclegen/tdx-go,github.com/cyclegen/tdx-go
24343,github.com/globocom/gokong,1,globocom/gokong,github.com/globocom/gokong


In [41]:
df_final = df_main.merge(df_gh, how="left", on="full_name").drop(columns=["github_name"])
df_final.sort_values("refs", ascending=False, inplace=True)
df_final = df_final[["full_name", "alt", "refs", "forks_count", "stargazers_count", "watchers_count", "created_at" ]]
df_final['created_at'] = df_final['created_at'].dt.year
#df_final.astype({"refs": "int32"})
df_final.rename(
    columns={
        'full_name':'repository',
        'forks_count':'forks',
        'stargazers_count':'stars',
        'watchers_count':'watchers',
        'created_at':'year',
    },
    inplace=True
)

In [49]:
df_final.head(50).astype({"refs": "int32"}).to_latex("__libs.tex", index=False)

## Join module vulnerability record

## ****************************** DEBUG CELLS ****************************** 

In [94]:
cost_dfs = []
for date_range in date_ranges:
    df = pd.read_excel("round2/%s-repo-%d-%s-%s.xlsx" % (lang, stars, date_range[0], date_range[1]))
    cost_dfs.append(df)
combined = pd.concat(cost_dfs)
combined.to_csv('round2/combined.csv')
