# Create Graph from Raw Data

In [2]:
import gzip
import json
import os
from collections import defaultdict
from glob import glob
from itertools import combinations

import networkx as nx
from tqdm import tqdm

## Preprocess data

Data is downloaded from [GH Archive](https://www.gharchive.org/).

Each file is a gzip-compressed file of newline-delimited JSON (one event per line), named by date and hour: `YYYY-MM-DD-HR.json.gz`.

In [8]:
# get path to each file
# glob() returns matches in arbitrary order; sorting makes the file order
# (and any prefix slice taken from this list) identical on every run.
filepaths = sorted(glob('./data/gh_archive/2020-01*.gz'))
print(f'Found {len(filepaths)} number of zipped files')

Found 744 number of zipped files


In [9]:
# define bot users
# Login names compared against each event's actor; matching events are skipped.
bot_usernames = [
    'dependabot',
    'dependabot-preview',
]

In [36]:
# extract information from each file
# Number of archive files to read; set to None to process every file found.
MAX_FILES = 20

# The user <-> repository relation, stored in both directions.
users_to_repos = defaultdict(set)
repos_to_users = defaultdict(set)
for path in tqdm(filepaths[:MAX_FILES]):
    # Each archive is read as text, one JSON-encoded event per line. The
    # context manager closes the handle even when the cell is interrupted.
    with gzip.open(path, 'rt', encoding='utf8') as archive:
        for line in archive:
            event = json.loads(line)
            # `or {}` also covers events where the key is present but null.
            username = (event.get('actor') or {}).get('display_login')
            repo = (event.get('repo') or {}).get('name')

            # filter out unwanted data
            if username in bot_usernames:
                continue

            # filter out data where stuff is missing
            if not username or not repo:
                continue

            # store information
            users_to_repos[username].add(repo)
            repos_to_users[repo].add(username)

  5%|▌         | 1/20 [00:03<01:02,  3.29s/it]


KeyboardInterrupt: 

## Create Graph

In [30]:
# Create graph
# Two users are linked when both appear on at least one common repository.
G = nx.Graph()
for users in repos_to_users.values():
    # combinations() yields nothing for fewer than two users, so repositories
    # with a single user contribute neither nodes nor edges.
    G.add_edges_from(combinations(users, r=2))

# Extract GCC from graph
# default=set() avoids a ValueError from max() when G has no nodes at all;
# Gcc is then simply an empty graph.
Gcc = G.subgraph(max(nx.connected_components(G), key=len, default=set())).copy()

# save G and Gcc
# Make sure the output directory exists before writing into it.
os.makedirs('./saved_graphs', exist_ok=True)
nx.write_gexf(G, './saved_graphs/G.gexf')
nx.write_gexf(Gcc, './saved_graphs/Gcc.gexf')

In [27]:
# print some stats
# Sizes of the full graph and of its giant connected component, side by side.
node_counts = (G.number_of_nodes(), Gcc.number_of_nodes())
edge_counts = (G.number_of_edges(), Gcc.number_of_edges())
print('\t\tG\tGcc')
print(f"# of nodes:\t{node_counts[0]}\t{node_counts[1]}")
print(f"# of links:\t{edge_counts[0]}\t{edge_counts[1]}")

		G	Gcc
# of nodes:	53688	31313
# of links:	670264	642804
