In [1]:
from datetime import datetime

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt, pylab


def get_dataframe(filepath='../data/django-packages-authors.csv'):
    """Load the author/commit table and drop automated or anonymous authors.

    Parameters
    ----------
    filepath : str or path-like, optional
        Location of the ';'-separated CSV file. Defaults to the path this
        notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``author_email`` contains none of the excluded markers
        and whose ``commits_count`` is greater than 1. Rows with a missing
        ``author_email`` are dropped as well.
    """
    # Markers are regular expressions (the `str.contains` default), so the
    # '.' in 'no.reply' matches any single character, e.g. 'no_reply'.
    excluded_markers = '|'.join([
        'dependabot',
        'noreply',
        'no.reply',
        'no-reply',
        'no author',
        'github',
        'none',
    ])

    df = pd.read_csv(filepath, sep=';')

    # na=True flags missing e-mails as "excluded" so they are filtered out,
    # which is what the previous `contains(...) == False` comparison did.
    is_excluded = df['author_email'].str.contains(excluded_markers, na=True)
    return df[~is_excluded & (df['commits_count'] > 1)]


def get_common_authors(df):
    """Map every author e-mail to the repositories that author committed to.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``author_email`` and ``repo_id``.

    Returns
    -------
    dict
        ``{author_email: [repo_id, ...]}`` with keys in sorted order. Each
        list keeps the row order of ``df`` and any duplicates; authors with a
        single repository are included, so callers must filter on list length
        themselves.
    """
    # Collect the ids as lists directly instead of joining them into a
    # '<>'-separated string and splitting again: that round-trip required
    # string ids and broke on ids that themselves contain '<>'.
    return df.groupby('author_email')['repo_id'].agg(list).to_dict()


def save_graph(graph, file_name):
    """Draw ``graph`` with a spring layout and save the figure to ``file_name``.

    Parameters
    ----------
    graph : networkx graph
        The graph to draw; nodes, edges and node labels are rendered.
    file_name : str or path-like
        Output path handed to ``Figure.savefig``; the extension selects the
        file format.
    """
    margin = 0.05  # fraction of the layout extent left free around the nodes

    # Use an explicit figure/axes pair so the drawing cannot land on some
    # other figure that happens to be open in the kernel.
    fig, ax = plt.subplots(figsize=(20, 20), dpi=80)
    try:
        ax.set_axis_off()

        pos = nx.spring_layout(graph)
        nx.draw_networkx_nodes(graph, pos, ax=ax)
        nx.draw_networkx_edges(graph, pos, ax=ax)
        nx.draw_networkx_labels(graph, pos, ax=ax)

        if pos:  # an empty graph has no positions to derive limits from
            xs, ys = zip(*pos.values())
            # Layout coordinates are centred on the origin, so the limits
            # must span min..max; starting at 0 would clip every node with
            # a negative coordinate.
            x_span = (max(xs) - min(xs)) or 1.0
            y_span = (max(ys) - min(ys)) or 1.0
            ax.set_xlim(min(xs) - margin * x_span, max(xs) + margin * x_span)
            ax.set_ylim(min(ys) - margin * y_span, max(ys) + margin * y_span)

        fig.savefig(file_name, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

In [2]:
# Build an undirected graph whose nodes are repositories and whose edges
# connect two repositories that share at least one author.
df = get_dataframe()
authors = get_common_authors(df)
g = nx.Graph()

# One row per repository. `drop_duplicates` returns a new frame, so the result
# has to be assigned; the first row seen for each repo_id is the one kept.
repos = df[
    ['repo_id', 'platform', 'repo_stars', 'repo_last_commit_date']
].drop_duplicates(subset='repo_id')

for repo in repos.itertuples(index=False):
    g.add_node(
        repo.repo_id,
        platform=repo.platform,
        stars=int(repo.repo_stars),
        # NOTE(review): this format expects ':' (not '.') before the
        # fractional seconds -- confirm it matches the CSV.
        last_commit_date=datetime.strptime(
            repo.repo_last_commit_date, '%Y-%m-%dT%H:%M:%S:%f'
        ),
    )

# Link every pair of repositories that share an author. `nx.Graph` is
# undirected and ignores repeated edges, so no existence check is needed.
for author_repos in authors.values():
    for i, first in enumerate(author_repos):
        for second in author_repos[i + 1:]:
            # A repository listed twice for one author must not produce a
            # self-loop.
            if first != second:
                g.add_edge(first, second)

In [3]:
# Render the repository graph to a PDF file in the current working directory.
# For interactive exploration, networkx_viewer's `Viewer(g).mainloop()` was
# tried here as an alternative to the static export.
save_graph(g, 'teste.pdf')