# Network Graph for Stack Overflow

Agustika Indah Mayangsari_23220071

Menggunakan data Stack Overflow.

# Graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

# Sanity check: list the data files available in the input directory.
from subprocess import check_output
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
import warnings

# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Node table (columns used: name, group, nodesize) and link table
# (columns used: source, target, value).
df_nodes = pd.read_csv('../input/stack_network_nodes.csv')
df_edges = pd.read_csv('../input/stack_network_links.csv')

# Undirected graph carrying a graph-level attribute day="Stackoverflow".
G = nx.Graph(day="Stackoverflow")

# One node per row of the node table, keeping group and nodesize as attributes.
for _, node_row in df_nodes.iterrows():
    G.add_node(node_row['name'], group=node_row['group'], nodesize=node_row['nodesize'])

# One edge per row of the link table; the 'value' column becomes the edge weight.
for _, edge_row in df_edges.iterrows():
    G.add_edge(edge_row['source'], edge_row['target'], weight=edge_row['value'])
    
# Fill colour for each value of the node attribute 'group' (keys 1-14).
color_map = {
    1: '#f09494',
    2: '#eebcbc',
    3: '#72bbd0',
    4: '#91f0a1',
    5: '#629fff',
    6: '#bcc2f2',
    7: '#eebcbc',
    8: '#f1f0c0',
    9: '#d2ffe7',
    10: '#caf3a6',
    11: '#ffdf55',
    12: '#ef77aa',
    13: '#d6dcff',
    14: '#d2f5f0',
}

plt.figure(figsize=(25, 25))
options = dict(
    edge_color='#000000',
    width=1,
    with_labels=True,
    font_weight='regular',
)

# Per-node colour and size, collected in the graph's node iteration order
# so they line up with the nodes that nx.draw renders.
colors = []
sizes = []
for _node, attrs in G.nodes(data=True):
    colors.append(color_map[attrs['group']])
    sizes.append(attrs['nodesize'] * 10)

layout = nx.spring_layout(G, k=0.25, iterations=50)
nx.draw(G, pos=layout, node_color=colors, node_size=sizes, **options)
ax = plt.gca()
# NOTE(review): collections[0] is presumably the node collection created by
# nx.draw, so this recolours the node outlines -- confirm.
ax.collections[0].set_edgecolor("#555555")
plt.show()

# Degree Distribution

In [None]:
import collections

# Draw degree distribution
# The analysis below works on the largest connected component only: take the
# biggest node set yielded by nx.connected_components (compared by len) and
# build the subgraph of G on those nodes.
components = nx.connected_components(G)
largest_component = max(components, key=len)
G_large = G.subgraph(largest_component)
def plot_degree_distribution(G_large):
    """Draw a bar chart of how many nodes have each degree.

    ``G_large`` must provide ``degree()`` yielding ``(node, degree)`` pairs.
    A new matplotlib figure is created for the chart; nothing is returned.
    """
    degrees_desc = sorted((degree for _, degree in G_large.degree()), reverse=True)
    histogram = collections.Counter(degrees_desc)
    # Split the {degree: node count} mapping into parallel x / y sequences.
    deg, cnt = zip(*histogram.items())
    _, axes = plt.subplots()
    axes.set_title("Degree Distribution")
    axes.bar(deg, cnt)


plot_degree_distribution(G_large)

Dilihat dari plot degree distribution di atas, bentuknya tampak mengikuti distribusi power-law. Maka dapat disimpulkan bahwa network yang tergambar cenderung merupakan **scale-free network**; untuk memastikannya, distribusi tersebut perlu diperiksa pada skala log-log.



# Average Distance

In [None]:
# Average shortest-path length, computed on the largest connected component.
# It is taken on G_large rather than on G because, per the networkx docs,
# nx.average_shortest_path_length raises for a graph that is not connected
# (NOTE(review): whether G itself is disconnected is not shown here).
avg_distance = nx.average_shortest_path_length(G_large)
print(avg_distance)




In [None]:
from collections import defaultdict

def make_graph_structure():
    """Build an adjacency-list mapping for the module-level graph ``G_large``.

    Returns a ``defaultdict(list)`` in which every edge (a, b) contributes
    b to a's neighbour list and a to b's neighbour list.
    """
    adjacency = defaultdict(list)
    for left, right in G_large.edges():
        adjacency[left].append(right)
        adjacency[right].append(left)
    return adjacency

def find_path(graph, start, end, path=None):
    """Return one simple path from ``start`` to ``end``, or ``None``.

    The search is a recursive depth-first traversal that follows neighbours
    in the order they are listed in ``graph`` and returns the first path it
    completes.  That path is NOT necessarily the shortest one.

    Args:
        graph: mapping from a node to an iterable of its neighbours.
        start: node to start from.
        end: node to reach.
        path: nodes already visited on the way to ``start`` (used by the
            recursion); ``None`` means an empty prefix.  The list passed in
            is never modified.

    Returns:
        A list of nodes beginning with the given prefix followed by the
        nodes from ``start`` to ``end``, or ``None`` when no path exists.
    """
    # A fresh list is built on every call; None replaces the former mutable
    # default so no list object is shared between calls.
    path = [start] if path is None else path + [start]
    if start == end:
        return path
    # Membership test first: indexing a defaultdict with a missing key would
    # silently insert that key.
    if start not in graph:
        return None
    for node in graph[start]:
        if node not in path:
            newpath = find_path(graph, node, end, path)
            if newpath is not None:
                return newpath
    return None

def distance_distribution():
    """Print and plot the distribution of path lengths between adjacent nodes.

    For every edge (a, b) of the module-level graph ``G_large`` the length
    (number of edges) of the path returned by ``find_path`` is recorded.
    The list of lengths and the relative frequency of each distinct length
    (rounded to three decimals, in ascending order of length) are printed,
    and the frequencies are drawn as a line plot on the current axes.

    NOTE(review): ``find_path`` returns the first path a depth-first search
    completes, not the shortest one, and only edge endpoints are sampled.
    The plotted quantity is therefore not the shortest-path distance
    distribution over all node pairs -- confirm whether that was intended.
    """
    G = G_large
    graph = make_graph_structure()
    distances = [len(find_path(graph, a, b)) - 1 for a, b in G.edges()]
    print(distances)

    # Tally every distinct length in one pass, then walk the lengths in
    # ascending order so the line plot runs left to right instead of
    # following the arbitrary iteration order of a set.
    tally = collections.Counter(distances)
    total = len(distances)
    unique_distances = sorted(tally)
    prob = [round(tally[d] / total, 3) for d in unique_distances]
    print(prob)

    df = pd.DataFrame({'data': unique_distances, 'prob': prob})
    ax = plt.gca()
    df.plot(kind='line',x='data',y='prob',ax=ax, title='\n Distance Distribution', xlabel ='Distance (d)', ylabel = 'Distance Probability(Pd)')
    plt.show()


distance_distribution()