In [1]:
import os
import pandas as pd
import numpy as np
import json
import janitor

pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 100)

from utilities import read_jsons

In [2]:
# Read in 100 random sample
df_random_sample = (
    pd.read_csv("../input/random_sample.csv")
    .assign(slug=lambda df: df["github"].str.replace("https://github.com/", ""))
    .assign(slug=lambda df: df["slug"].apply(lambda x: x[:-1] if x.endswith("/") else x))
    .assign(slug=lambda df: df["slug"].str.lower().str.strip())
#     .assign(fileslug=lambda df: df["slug"].str.replace("/", "_"))
    .assign(treated=1)
    # Assign the two types
    .assign(boughtstars=lambda df: (df.index < 25).astype(int))
    .assign(treated2=lambda df: df["boughtstars"] + df["treated"])
)
df_random_sample.head(3)

  .assign(slug=lambda df: df["github"].str.replace("https://github.com/", ""))


Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github,pypi,label,slug,treated,boughtstars,treated2
0,bird-ospf-link-db-parser,200.0,https://github.com/Andrew-Dickinson/bird-ospf-...,,2023-04-29T07:23:44,1.0,https://github.com/Andrew-Dickinson/bird-ospf-...,https://pypi.org/project/bird-ospf-link-db-par...,bird-ospf-link-db-parser\r\nhttps://github.com...,andrew-dickinson/bird-ospf-link-db-parser,1,1,2
1,asciicli,200.0,https://github.com/mrq-andras/asciicli,https://github.com/mrq-andras/asciicli,2023-04-28T07:22:55,1.0,https://github.com/mrq-andras/asciicli,https://pypi.org/project/asciicli/#history,asciicli\r\nhttps://github.com/mrq-andras/asci...,mrq-andras/asciicli,1,1,2
2,bdpotentiometer,200.0,https://github.com/bond-anton/BDPotentiometer,https://github.com/bond-anton/BDPotentiometer,2023-04-27T06:35:18,1.0,https://github.com/bond-anton/BDPotentiometer,https://pypi.org/project/bdpotentiometer/#history,bdpotentiometer\r\nhttps://github.com/bond-ant...,bond-anton/bdpotentiometer,1,1,2


In [3]:
# Repos characteristics
payloads = read_jsons("../output/repo_profile_payload/")

df_repos = (
    pd.DataFrame(payloads)
    .dropna(subset=["id"])
    .assign(slug=lambda df: df["full_name"].str.lower().str.strip())
    ## Get treatment assignment ----------------------------------------------
    .merge(df_random_sample, how="left", on="slug", validate="1:1", indicator=True)
    .assign(treated=lambda df: df["treated"].fillna(0).apply(int))
    .assign(treated2=lambda df: df["treated2"].fillna(0).apply(int))
    .assign(created_at=lambda df: pd.to_datetime(df['created_at']))
    .assign(year_created=lambda df: [dt.year for dt in df["created_at"]])
    ## Cleaning up additional features ---------------------------------------
    .assign(license_str=lambda df: [license["spdx_id"] if license else None for license in df["license"]])
    .assign(n_topics=lambda df: [len(topics) for topics in df["topics"]])
    # https://stackoverflow.com/a/8679592
    .assign(size_mb=lambda df: df["size"]/1024)
    .assign(is_org=lambda df: np.where(df["organization"].isna(), 0, 1))
    .assign(user=lambda df: [slug.split("/")[0] for slug in df["slug"]])
    .assign(owner_str=lambda df: [owner["login"].lower().strip() if owner else None for owner in df["owner"]])
    .assign(description_size=lambda df: df["description"].str.len().fillna(0).apply(int))
#     ## Getting readme stats --------------------------------------------------
#     .merge((pd.read_csv("../output/pypi_readme.csv", 
#                         usecols=["slug", "n_requirements", "raw_readme_len", "processed_readme_len"])), 
#            how="left", on="slug", validate="1:1"
#           )
)
assert (df_repos["user"] == df_repos["owner_str"]).all()
display(df_repos.head(3))
df_repos.info(verbose=True)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,forks_url,keys_url,collaborators_url,teams_url,hooks_url,issue_events_url,events_url,assignees_url,branches_url,tags_url,blobs_url,git_tags_url,git_refs_url,trees_url,statuses_url,languages_url,stargazers_url,contributors_url,subscribers_url,subscription_url,commits_url,git_commits_url,comments_url,issue_comment_url,contents_url,compare_url,merges_url,archive_url,downloads_url,issues_url,pulls_url,milestones_url,notifications_url,labels_url,releases_url,deployments_url,created_at,updated_at,pushed_at,git_url,ssh_url,clone_url,svn_url,homepage_x,size,stargazers_count,watchers_count,language,has_issues,has_projects,has_downloads,has_wiki,has_pages,has_discussions,forks_count,mirror_url,archived,disabled,open_issues_count,license,allow_forking,is_template,web_commit_signoff_required,topics,visibility,forks,open_issues,watchers,default_branch,permissions,temp_clone_token,network_count,subscribers_count,organization,parent,source,message,documentation_url,template_repository,slug,pkg,return_code,github_url,homepage_y,earliest_release,gh_url_check,github,pypi,label,treated,boughtstars,treated2,_merge,year_created,license_str,n_topics,size_mb,is_org,user,owner_str,description_size
0,632697072.0,R_kgDOJbYw8A,ezfinpy,renanmoretto/ezfinpy,False,"{'login': 'renanmoretto', 'id': 103861667, 'no...",https://github.com/renanmoretto/ezfinpy,,False,https://api.github.com/repos/renanmoretto/ezfinpy,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,https://api.github.com/repos/renanmoretto/ezfi...,2023-04-26 00:19:26+00:00,2023-05-03T00:57:56Z,2023-05-03T19:11:09Z,git://github.com/renanmoretto/ezfinpy.git,git@github.com:renanmoretto/ezfinpy.git,https://github.com/renanmoretto/ezfinpy.git,https://github.com/renanmoretto/ezfinpy,,17.0,1.0,1.0,Python,True,True,True,True,False,False,0.0,,False,False,0.0,"{'key': 'mit', 'name': 'MIT License', 'spdx_id...",True,False,False,[],public,0.0,0.0,1.0,main,"{'admin': False, 'maintain': False, 'push': Fa...",,0.0,1.0,,,,,,,renanmoretto/ezfinpy,,,,,,,,,,0,,0,left_only,2023,MIT,0,0.016602,0,renanmoretto,renanmoretto,0
1,629920730.0,R_kgDOJYvT2g,statplot,dingyizhao/statplot,False,"{'login': 'dingyizhao', 'id': 46778380, 'node_...",https://github.com/dingyizhao/statplot,Common plot code used in astrophysics,False,https://api.github.com/repos/dingyizhao/statplot,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,https://api.github.com/repos/dingyizhao/statpl...,2023-04-19 09:44:40+00:00,2023-04-25T08:11:21Z,2023-04-25T08:11:17Z,git://github.com/dingyizhao/statplot.git,git@github.com:dingyizhao/statplot.git,https://github.com/dingyizhao/statplot.git,https://github.com/dingyizhao/statplot,,3.0,0.0,0.0,Python,True,True,True,True,False,False,0.0,,False,False,0.0,"{'key': 'mit', 'name': 'MIT License', 'spdx_id...",True,False,False,[],public,0.0,0.0,0.0,main,"{'admin': False, 'maintain': False, 'push': Fa...",,0.0,1.0,,,,,,,dingyizhao/statplot,,,,,,,,,,0,,0,left_only,2023,MIT,0,0.00293,0,dingyizhao,dingyizhao,37
2,611058264.0,R_kgDOJGwCWA,imgutils,deepghs/imgutils,False,"{'login': 'deepghs', 'id': 126587470, 'node_id...",https://github.com/deepghs/imgutils,A convenient and user-friendly anime-style ima...,False,https://api.github.com/repos/deepghs/imgutils,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,https://api.github.com/repos/deepghs/imgutils/...,2023-03-08 02:32:20+00:00,2023-05-16T13:13:49Z,2023-05-18T08:56:31Z,git://github.com/deepghs/imgutils.git,git@github.com:deepghs/imgutils.git,https://github.com/deepghs/imgutils.git,https://github.com/deepghs/imgutils,https://deepghs.github.io/imgutils/,172568.0,2.0,2.0,Python,True,True,True,True,True,False,0.0,,False,False,2.0,"{'key': 'mit', 'name': 'MIT License', 'spdx_id...",True,False,False,"[anime, image-processing, python]",public,0.0,2.0,2.0,main,"{'admin': False, 'maintain': False, 'push': Fa...",,0.0,1.0,"{'login': 'deepghs', 'id': 126587470, 'node_id...",,,,,,deepghs/imgutils,,,,,,,,,,0,,0,left_only,2023,MIT,3,168.523438,1,deepghs,deepghs,141


<class 'pandas.core.frame.DataFrame'>
Int64Index: 582 entries, 0 to 581
Data columns (total 111 columns):
 #    Column                       Dtype              
---   ------                       -----              
 0    id                           float64            
 1    node_id                      object             
 2    name                         object             
 3    full_name                    object             
 4    private                      object             
 5    owner                        object             
 6    html_url                     object             
 7    description                  object             
 8    fork                         object             
 9    url                          object             
 10   forks_url                    object             
 11   keys_url                     object             
 12   collaborators_url            object             
 13   teams_url                    object             
 14   hooks_ur

In [38]:
# Users characteristics
df_users = (
    pd.read_csv("../output/users_profile.csv")
    .assign(user=lambda df: df["login"].str.lower().str.strip())
    .drop_duplicates("user")
    ## Get treatment assignment
    .merge((
        df_repos
        .select_columns(["user", "treated", "treated2", "full_name"])
        .drop_duplicates(["user", "treated"])
    ), how="left", on="user", validate="1:m"
    )
    .assign(treated=lambda df: df["treated"].fillna(0).apply(int))    
    ## Cleaning up additional features
    .assign(created_at=lambda df: pd.to_datetime(df['created_at']))
    .assign(year_created=lambda df: [dt.year for dt in df["created_at"]])    
    .assign(updated_at=lambda df: pd.to_datetime(df['created_at']))
    .assign(year_updated=lambda df: [dt.year for dt in df["updated_at"]])    
    .assign(list_co=lambda df: np.where(df["company"].isna(), 0, 1))
    .assign(list_loc=lambda df: np.where(df["location"].isna(), 0, 1))
    .assign(list_email=lambda df: np.where(df["email"].isna(), 0, 1))
    .assign(list_blog=lambda df: np.where(df["blog"].isna(), 0, 1))
    .assign(list_bio=lambda df: np.where(df["bio"].isna(), 0, 1))
    .assign(bio_size=lambda df: df["bio"].str.len().fillna(0).apply(int))
)
display(df_users.head(3))
df_users.info(verbose=True)

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,starred_url,subscriptions_url,organizations_url,repos_url,events_url,received_events_url,type,site_admin,name,company,blog,location,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at,retrieval_date,user,treated,treated2,full_name,year_created,year_updated,list_co,list_loc,list_email,list_blog,list_bio,bio_size
0,Arsybai,33319709,MDQ6VXNlcjMzMzE5NzA5,https://avatars.githubusercontent.com/u/333197...,,https://api.github.com/users/Arsybai,https://github.com/Arsybai,https://api.github.com/users/Arsybai/followers,https://api.github.com/users/Arsybai/following...,https://api.github.com/users/Arsybai/gists{/gi...,https://api.github.com/users/Arsybai/starred{/...,https://api.github.com/users/Arsybai/subscript...,https://api.github.com/users/Arsybai/orgs,https://api.github.com/users/Arsybai/repos,https://api.github.com/users/Arsybai/events{/p...,https://api.github.com/users/Arsybai/received_...,User,False,Arsybai,Clee Ltd,http://arsybai.com,indonesia,me@arsybai.com,,Just want to death by cuteness\r\n,,36,0,64,7,2017-11-02 12:38:27+00:00,2017-11-02 12:38:27+00:00,2023-05-20 21:41:26.180953,arsybai,0,0.0,Arsybai/Jawa-Language,2017,2017,1,1,1,1,1,32
1,GaNiziolek,54728889,MDQ6VXNlcjU0NzI4ODg5,https://avatars.githubusercontent.com/u/547288...,,https://api.github.com/users/GaNiziolek,https://github.com/GaNiziolek,https://api.github.com/users/GaNiziolek/followers,https://api.github.com/users/GaNiziolek/follow...,https://api.github.com/users/GaNiziolek/gists{...,https://api.github.com/users/GaNiziolek/starre...,https://api.github.com/users/GaNiziolek/subscr...,https://api.github.com/users/GaNiziolek/orgs,https://api.github.com/users/GaNiziolek/repos,https://api.github.com/users/GaNiziolek/events...,https://api.github.com/users/GaNiziolek/receiv...,User,False,Gabriel Niziolek,@TempoX-Ltda,,Brazil,,,Software Developer on TempoX | Django | Pyrami...,gniziolek,25,3,6,15,2019-08-31 00:19:09+00:00,2019-08-31 00:19:09+00:00,2023-05-20 21:41:26.180953,ganiziolek,1,1.0,GaNiziolek/FoccoERPy,2019,2019,1,1,0,0,1,68
2,MihailSalnikov,2613180,MDQ6VXNlcjI2MTMxODA=,https://avatars.githubusercontent.com/u/261318...,,https://api.github.com/users/MihailSalnikov,https://github.com/MihailSalnikov,https://api.github.com/users/MihailSalnikov/fo...,https://api.github.com/users/MihailSalnikov/fo...,https://api.github.com/users/MihailSalnikov/gi...,https://api.github.com/users/MihailSalnikov/st...,https://api.github.com/users/MihailSalnikov/su...,https://api.github.com/users/MihailSalnikov/orgs,https://api.github.com/users/MihailSalnikov/repos,https://api.github.com/users/MihailSalnikov/ev...,https://api.github.com/users/MihailSalnikov/re...,User,False,Mikhail Salnikov,NLP Group,https://medium.com/@MSalnikov,,,True,Researcher in DL and NLP,,27,24,7,2,2012-10-21 15:03:02+00:00,2012-10-21 15:03:02+00:00,2023-05-20 21:41:26.180953,mihailsalnikov,0,0.0,MihailSalnikov/EvaluateQA,2012,2012,1,0,0,1,1,24


<class 'pandas.core.frame.DataFrame'>
Int64Index: 545 entries, 0 to 544
Data columns (total 45 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   login                545 non-null    object             
 1   id                   545 non-null    int64              
 2   node_id              545 non-null    object             
 3   avatar_url           545 non-null    object             
 4   gravatar_id          0 non-null      float64            
 5   url                  545 non-null    object             
 6   html_url             545 non-null    object             
 7   followers_url        545 non-null    object             
 8   following_url        545 non-null    object             
 9   gists_url            545 non-null    object             
 10  starred_url          545 non-null    object             
 11  subscriptions_url    545 non-null    object             
 12  organizations_url    5

In [19]:
df_users["user"].nunique()

537

In [6]:
df_users.duplicated(subset="login", keep="first").sum()

8

In [39]:
df_users[df_users.duplicated(subset="login", keep=False)][["user", "full_name", "treated"]]

Unnamed: 0,user,full_name,treated
12,oca,OCA/e-commerce,0
13,oca,OCA/stock-logistics-workflow,1
15,brightway-lca,brightway-lca/bw_temporalis,1
16,brightway-lca,brightway-lca/bw_graph_tools,0
31,openvoiceos,OpenVoiceOS/ovos-classifiers,0
32,openvoiceos,OpenVoiceOS/jurebes,1
129,hansalemaos,hansalemaos/multisubprocess,0
130,hansalemaos,hansalemaos/locate_pixelcolor_cythonsingle,1
182,yuanjie-ai,yuanjie-ai/stopwords-zh,0
183,yuanjie-ai,yuanjie-ai/ChatLLM,1


In [7]:
df_repos.select_columns("treated").to_csv("../output/repo_baselines.csv", index=False)
df_users.to_csv("../output/user_baselines.csv", index=False)

In [8]:
(df_repos
 .select_columns(["year_created", "fork", "treated", "treated2", "size_mb", 'stargazers_count',
 'watchers_count',
 'language', 'has_issues', 'forks', 'open_issues', 'subscribers_count', 'n_topics'])
 .assign(fork=lambda df: np.where(df["fork"], 1, 0))
 .assign(has_issues=lambda df: np.where(df["has_issues"], 1, 0))
 .to_stata("../output/repo_baselines.dta", write_index=False)
)

In [9]:
(df_users
 .select_columns(["treated", "treated2", "type", "public_repos", "public_gists", 
                  "followers", "following", "year_created", "year_updated", "list_co", 
                  'list_email', 'list_blog', 'list_bio',  'bio_size'])
#  .assign(fork=lambda df: np.where(df["fork"], 1, 0))
#  .assign(has_issues=lambda df: np.where(df["has_issues"], 1, 0))
 .to_stata("../output/user_baselines.dta", write_index=False)
)