# Stack overflow tag network

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import math
from random import sample
from IPython.display import display 
import statistics as stat
from collections import Counter

# 1. read data to df

In [None]:
# Load the edge table and the node table of the tag network.
links = pd.read_csv('../input/stack-overflow-tag-network/stack_network_links.csv')
nodes = pd.read_csv('../input/stack-overflow-tag-network/stack_network_nodes.csv')

# Preview each table and report whether it holds any missing value.
for frame in (links, nodes):
    display(frame.head(5))
    print('Contains null =', frame.isnull().values.any())

<li>links['value']= the weight of the edge, the width of the edge</li>
<li>nodes['group']= the color of the node; the closeness of the node</li>
<li>nodes['nodesize']= the node size</li>

# 2. EDA

<li>number of nodes and edges</li>
<li>links['value'] distribution (bar or hist)</li>
<li>nodes['nodesize'] distribution</li>
<li>nodes['group'] statistics</li>

### 2.1 number of nodes and edges

In [None]:
# Row counts of the two tables: one row per node, one row per link.
num_nodes = nodes.shape[0]
num_links = links.shape[0]

print('Number of nodes =', num_nodes)
print('Number of links =', num_links)

### 2.2 links['value'] and nodes['nodesize'] distribution hist plot

In [None]:
# Side-by-side histograms: link values on the left, node sizes on the right.
figsize = (12, 4)
fontsize = 14
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

# Draw straight onto the axes created above instead of re-selecting them.
ax1 = links['value'].hist(ax=ax1)
ax1.set_title('Link Value Distribution', fontsize=fontsize)
ax1.set_xlabel('link value')

ax2 = nodes['nodesize'].hist(ax=ax2, color='red')
ax2.set_title('Node Size Distribution', fontsize=fontsize)
ax2.set_xlabel('Node Size')

### 2.3 nodes['group'] distribution bar plot

In [None]:
# Number of nodes in each group.
group_count = nodes['group'].value_counts()

# Split the tally into string labels and their counts for plotting.
group_name = group_count.index.astype(str)
group_cnt = group_count.values

print('group_name:', group_name)
print('group_cnt', group_cnt, '\n')
print('Length check =', len(group_cnt) == len(group_name))

In [None]:
# Bar plot helper that writes each bar's value above it.
def bar_plt(figsize,title,width,x,y,xlabel,ylabel):
    """Draw a light-blue bar chart on a new figure and label every bar.

    figsize -- figure size passed to ``plt.subplots``
    title   -- axes title (drawn with font size 14)
    width   -- bar width passed to ``ax.bar``
    x       -- bar positions/labels, one per bar
    y       -- bar heights, converted to a list
    xlabel, ylabel -- axis labels
    """
    heights = list(y)
    tick_positions = np.arange(len(x))

    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(tick_positions)
    ax.bar(x, heights, width, color='Lightblue')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Annotate each bar with its height, centred on the bar's tick position.
    for idx, position in enumerate(tick_positions):
        ax.text(position, heights[idx], str(heights[idx]), ha='center')
    

In [None]:
# Bar plot of the node count per group, using the labels and counts
# computed from nodes['group'].value_counts().
bar_plt(figsize=(15,5),
        title='Node group count bar plot',
        width=0.4,
        x=group_name,
        y=group_cnt,
        xlabel='group name',
        ylabel='count')

#### 2.3.1 Define functions that:<br>-- take a group number and return the tags it contains<br>-- take a tag and return the group it belongs to

In [None]:
# Given a group number, return the tags in that group.
def tags_in_group(group_number):
    """Return the names of all rows in ``nodes`` whose 'group' equals ``group_number``.

    Also prints how many tags were found.
    """
    in_group = nodes['group'] == group_number
    tags = list(nodes.loc[in_group, 'name'])
    print('Number of tags in group', group_number, ':', len(tags))
    return tags

In [None]:
# Label of the group with the highest node count.
largest_group=group_count.idxmax()
print('Largest group is: ',largest_group)

# NOTE(review): the group inspected here is hard-coded to 1; it is not
# derived from largest_group. The returned list is the cell's output.
tags_in_group(1)

#### Group 1 contains tags such as python, machine-learning, etc.

In [None]:
# Given a tag, return the group it belongs to.
def group_for_tag(tag):
    """Return the group of ``tag`` and print the group and its size.

    The lookup uses the module-level ``nodes`` table ('name' and 'group'
    columns). If several rows share the name, the first one is used.

    Raises
    ------
    IndexError
        If ``tag`` does not appear in ``nodes['name']``.
    """
    matches = nodes.loc[nodes['name'] == tag, 'group'].values
    if len(matches) == 0:
        # Same exception type as the bare ``[0]`` lookup would raise, but
        # with a message that names the missing tag.
        raise IndexError('tag {!r} not found in nodes'.format(tag))
    group = matches[0]
    group_size = len(nodes.loc[nodes['group'] == group])

    print ('The tag',tag,'is in group: ',group)
    print ('group',group,'size: ',group_size)
    return group

In [None]:
# Look up the group of three well-known tags. Each call prints its result;
# the value returned by the last call is the cell's output.
group_for_tag('python')
group_for_tag('java')
group_for_tag('mysql')

# 3. Subgraph: nodes,edges and attributes construction

<li>create a general workflow from dataframe to network</li>
<li>generate multiple dataframes based on need (e.g.:for each group; for specific tag connected tag)</li>
<li>draw graph based on corresponding dataframe</li>


### 3.1 General workflow from dataframe to network

<li>from df --- node:size=nodes['nodesize']<br>color=nodes['group']</li>
<li>from df --- edge:width=links['value']</li>
<li>layout algorithm:spring, spiral,pagerank</li>
<li>highlight selected nodes: node_size=input<br>node_color=input?</li>


### 3.1.1 node size = node_size

In [None]:
# Summary statistics of the raw node sizes, used to pick a display scale.
nodes['nodesize'].describe()

In [None]:
# Per-node marker sizes: the 'nodesize' column scaled by 100 for display,
# kept in the row order of `nodes`.
node_size = list(nodes['nodesize'].array * 100)

print(type(node_size))
print('check length: ', len(node_size) == len(nodes))

In [None]:
# Spot-check one scaled size (second row of `nodes`).
node_size[1]

### 3.1.2 node color = colors

In [None]:
# Node colours: the numeric 'group' column, in the row order of `nodes`.
colors = nodes['group'].values

print(type(colors))
print('check length: ', len(colors) == len(nodes))

In [None]:
# Spot-check one colour value (group of the second row of `nodes`).
colors[1]

### 3.1.3 width of edges = widths

In [None]:
# Edge widths: base ** (value / smallest value), in the row order of `links`.
# The lightest link therefore gets width `base`; heavier links grow exponentially.
base = 2
relative_value = links['value'] / min(links['value'])
widths = list((base ** relative_value).values)

print(type(widths))
print('check length: ', len(widths) == len(links))


### 3.1.4 weighted edges to construct DiGraph

In [None]:
# Build weighted edge tuples from an edges dataframe.
def edge_weight_gen(link):
    """Return a list of ``(source, target, value)`` tuples, one per row of ``link``.

    Parameters
    ----------
    link : pandas.DataFrame
        Edge table with 'source', 'target' and 'value' columns. Any index is
        accepted (e.g. a filtered subset of the full table).

    Returns
    -------
    list of tuple
        Rows in their original order. Values keep their NumPy scalar types.
    """
    # Walk the columns positionally. Label-based ``link[col][i]`` with
    # ``i in range(len(link))`` raises KeyError as soon as the index is not
    # the default 0..n-1 range, which is the case for any filtered frame.
    edges = list(zip(link['source'].values,
                     link['target'].values,
                     link['value'].values))
    print('check length: ',len(edges)==len(link))
    return edges
  

In [None]:
# (source, target, value) tuples for every row of `links`.
edges=edge_weight_gen(links)

#### ---- construct labels (dict) for graph draw labels

In [None]:
# Label dictionary for drawing: every tag name maps to itself.
nodelist = nodes['name']
labels = {name: name for name in nodelist}
print('type of labels: ', type(labels))

### 3.1.5 Construct DiGraph

In [None]:
# Directed graph built from the (source, target, weight) tuples.
G = nx.DiGraph()
G.add_weighted_edges_from(edges)

# Spring layout computed once; the whole-network plots reuse these positions.
pos = nx.layout.spring_layout(G, k=0.8)

print('check length: ', G.number_of_edges() == len(links))
print(type(G))

In [None]:
# Draw five random edges to inspect. G.edges is a set-like view, not a
# sequence, so it is materialised first: random.sample rejects non-sequence
# populations on Python 3.11+ (and only warned on 3.9/3.10).
sample_edges=sample(list(G.edges),5)
print(sample_edges)

# view the edge attributes
G.edges[sample_edges[0]]

#### 3 types of graph
<li>select a tag as the core (tag-related graph) --- highlight the core; solution: construct a graph</li>
<li>specific group network (no highlight); solution: construct a graph</li>
<li>whole network--highlight selected</li>

### Before that, view the whole network

In [None]:
def draw_graph(figsize,pos,G,node_size,title):
    """Draw the whole tag network with every node labelled.

    Parameters
    ----------
    figsize : tuple
        Size of the new matplotlib figure.
    pos : dict
        Node positions keyed by node.
    G : networkx graph
        Graph to draw. It is expected to be the graph built from ``links``,
        because the styling below comes from module-level data.
    node_size : sequence
        One size per row of ``nodes``, in row order.
    title : str
        Figure title.

    Uses the module-level ``colors`` (one per row of ``nodes``), ``widths``
    (one per row of ``links``) and ``labels``.
    """
    fig = plt.figure(figsize=figsize)

    # networkx applies array-valued sizes/colours/widths in nodelist/edgelist
    # order, which defaults to the graph's own iteration order. The style
    # arrays follow the row order of the dataframes instead, so pass that
    # order explicitly to keep each value on the right node or edge.
    node_order = list(nodes['name'])
    edge_order = list(zip(links['source'], links['target']))

    nx.draw_networkx_nodes(G,
                           pos,
                           nodelist=node_order,
                           node_size=node_size,
                           node_color=colors,
                           alpha=1)

    nx.draw_networkx_edges(G,
                           pos,
                           edgelist=edge_order,
                           edge_color='r',
                           width=widths,
                           alpha=0.5)

    nx.draw_networkx_labels(G,
                            pos,
                            labels=labels,
                            font_color='blue',
                            font_size=120,
                            alpha=1,
                            verticalalignment='top')
    plt.title(title,fontsize=80,color='blue')


In [None]:
# Draw the full network on one very large canvas so all labels stay legible.
draw_graph(figsize=(200,200),
           pos=pos,
           G=G,
           node_size=node_size,
          title='Whole tag network view')

#### Some key tags that are worth studying:
<li>--linux</li>
<li>--python</li>
<li>--mysql</li>
<li>--c++</li>
<li>--java</li>
<li>--.net</li>

# 4. Degree analysis

In [None]:
# Degree of every node (in G's node order) and the mean over all nodes.
degrees = [degree for _, degree in G.degree()]
average_degree = stat.mean(degrees)
print('the average_degree =', average_degree)

### 4.1 Degree distribution plot --- scale-free graph

In [None]:
# Count how many nodes have each degree.
degrees_counter=Counter(degrees)

# Keep the per-node `degrees` list intact and store the distinct degree
# values under their own name. Rebinding `degrees` here would make a re-run
# of this cell count the distinct values instead (every frequency == 1).
degree_values=list(degrees_counter.keys())
degrees_frequency=list(degrees_counter.values())

fig, ax = plt.subplots(figsize=(12,5))
# set title
ax.set_title('degree distribution',fontsize=14)
plt.bar(degree_values,degrees_frequency,
        color='green')

### 4.2 degree of centrality 

In [None]:
def top_centrality_nodes(degree_cnt):
    """Return up to ten keys of ``degree_cnt`` ordered by descending value.

    ``degree_cnt`` is a dict mapping node -> score. Keys with equal scores
    keep the order in which they appear in the dict.
    """
    ranked_nodes = sorted(degree_cnt, key=degree_cnt.get, reverse=True)
    return ranked_nodes[:10]

In [None]:
# Bar chart of the ten highest-scoring tags for one centrality measure.
def top_cnt_plot(degree_cnt,cent_type):
    """Plot the ten largest entries of ``degree_cnt`` as a blue bar chart.

    degree_cnt -- dict mapping tag -> centrality score
    cent_type  -- name of the centrality measure, inserted into the title
    """
    ranked = sorted(degree_cnt.items(), key=lambda item: item[1], reverse=True)
    top_entries = ranked[:10]
    top_tag = [name for name, _ in top_entries]
    top_cnt = [score for _, score in top_entries]

    fig = plt.figure(figsize=(11,4))
    plt.title('The top 10 '+cent_type+'-centrality tags',fontsize=14)
    plt.bar(top_tag, top_cnt, color='blue')

In [None]:
# Three centrality measures computed on the full graph.
degree_cnt_katz = nx.katz_centrality_numpy(G)
degree_cnt_in = nx.in_degree_centrality(G)
degree_cnt_load = nx.load_centrality(G)

# Ten highest-scoring tags for each measure, computed once and then printed.
impt_nodes_katz = top_centrality_nodes(degree_cnt_katz)
impt_nodes_in = top_centrality_nodes(degree_cnt_in)
impt_nodes_load = top_centrality_nodes(degree_cnt_load)

print('top 10 katz centrality\n', impt_nodes_katz)
print('top 10 in-degree centrality\n', impt_nodes_in)
print('top 10 load degree centrality\n', impt_nodes_load)

In [None]:
# One bar chart per centrality measure, in the order katz, in, load.
for cent_type, degree_cnt in (('katz', degree_cnt_katz),
                              ('in', degree_cnt_in),
                              ('load', degree_cnt_load)):
    top_cnt_plot(degree_cnt=degree_cnt, cent_type=cent_type)

### 4.3 Plot the top 10 important nodes in the tag network

In [None]:
# Build the label dict for the nodes to highlight.
def impt_labels(impt_nodes,G):
    """Return ``{node: node}`` for every node of ``G`` that is in ``impt_nodes``.

    Entries follow the iteration order of ``G.nodes()``.
    """
    return {node: node for node in G.nodes() if node in impt_nodes}

In [None]:
# Label dictionaries for the top-10 nodes of each centrality measure.
impt_labels_load, impt_labels_in, impt_labels_katz = (
    impt_labels(top_nodes, G)
    for top_nodes in (impt_nodes_load, impt_nodes_in, impt_nodes_katz)
)

In [None]:
def draw_graph_cnt(figsize,pos,graph,node_size,node_list,select_label,title):
    """Draw the whole network and highlight a chosen set of nodes.

    Parameters
    ----------
    figsize : tuple
        Size of the new matplotlib figure.
    pos : dict
        Node positions keyed by node.
    graph : networkx graph
        Graph to draw. It is expected to be the graph built from ``links``,
        because the styling below comes from module-level data.
    node_size : sequence
        One size per row of ``nodes``, in row order.
    node_list : list
        Nodes to redraw as large yellow markers.
    select_label : dict
        Labels to draw, keyed by node.
    title : str
        Figure title.

    Uses the module-level ``colors`` (one per row of ``nodes``) and
    ``widths`` (one per row of ``links``).
    """
    fig = plt.figure(figsize=figsize)

    # networkx applies array-valued sizes/colours/widths in nodelist/edgelist
    # order, which defaults to the graph's own iteration order. The style
    # arrays follow the row order of the dataframes instead, so pass that
    # order explicitly to keep each value on the right node or edge.
    node_order = list(nodes['name'])
    edge_order = list(zip(links['source'], links['target']))

    nx.draw_networkx_nodes(graph,
                           pos,
                           nodelist=node_order,
                           node_size=node_size,
                           node_color=colors,
                           alpha=1)

    # Highlighted nodes are drawn on top of the regular ones.
    nx.draw_networkx_nodes(graph,
                           pos,
                           node_color='yellow',
                           node_size=20000,
                           nodelist=node_list,
                           alpha=1)

    nx.draw_networkx_edges(graph,
                           pos,
                           edgelist=edge_order,
                           edge_color='r',
                           width=widths,
                           alpha=0.1)

    nx.draw_networkx_labels(graph,
                            pos,
                            labels=select_label,
                            font_color='red',
                            font_size=45,
                            alpha=1,
                            verticalalignment='top')
    plt.title(title,fontsize=65,color='blue')
    


In [None]:
# Whole network with the top-10 load-centrality tags highlighted.
draw_graph_cnt(figsize=(60,60),
               pos=pos,
               graph=G,
               node_size=node_size,
               node_list=impt_nodes_load,
               select_label=impt_labels_load,
               title='Top 10 high centrality tags in network -- load centrality')

In [None]:
# Whole network with the top-10 in-degree-centrality tags highlighted.
draw_graph_cnt(figsize=(70,70),
               pos=pos,
               graph=G,
               node_size=node_size,
               node_list=impt_nodes_in,
               select_label=impt_labels_in,
               title='Top 10 high centrality tags in network -- in centrality')

In [None]:
# Whole network with the top-10 Katz-centrality tags highlighted.
draw_graph_cnt(figsize=(70,60),
               pos=pos,
               graph=G,
               node_size=node_size,
               node_list=impt_nodes_katz,
               select_label=impt_labels_katz,
               title='Top 10 high centrality tags in network -- katz centrality')

# 5. plot selected tag related network

<li>--linux</li>
<li>--python</li>
<li>--mysql</li>
<li>--c++</li>
<li>--java</li>
<li>--jquery</li>

### 5.1 Function to find the nodes related to the selected node and build the subgraph G_int (graph of interest)

In [None]:
def related_subgraph(select_node):
    """Return the subgraph of ``G`` induced by ``select_node`` and its neighbours.

    A neighbour is any node that shares an edge with ``select_node`` in
    either direction. ``G`` is the module-level graph.
    """
    # The selected node is always part of the result; add it once up front
    # rather than once per edge inside the loop.
    subnodes = {select_node}
    for source, target in G.edges:
        if select_node in (source, target):
            subnodes.update((source, target))
    return G.subgraph(subnodes)


### Set up the list of tags of interest and create a subgraph for each
#### -- Here you can add your own tags of interest to this list

In [None]:
# Tags of interest and, in the same order, the key under which each tag's
# neighbourhood subgraph is stored.
interest_tags = ['python','linux','c++','java','jquery','mysql','javascript','json','asp.net','c']
sub_G_name = ['G_py','G_linux','G_cpp','G_java','G_jq','G_sql','G_javascript','G_json','G_asp.net','G_c']

# One neighbourhood subgraph per tag, then a lookup from key to subgraph.
sub_G_list = [related_subgraph(tag_of_interest) for tag_of_interest in interest_tags]
sub_G_dic = dict(zip(sub_G_name, sub_G_list))

In [None]:
# Show the mapping from subgraph key to subgraph object.
sub_G_dic

### 5.2 Functions used to plot the network related to a selected tag
    

In [None]:
# Takes the key of a pre-built subgraph and the tag that subgraph is centred on.
# e.g.
# int_graph='G_py'
# int_tag_name='python'

def plot_int_tag_graph(int_graph,int_tag_name):
    """Plot the neighbourhood network stored under ``int_graph``.

    Parameters
    ----------
    int_graph : str
        Key into the module-level ``sub_G_dic``; also used in the plot title.
    int_tag_name : hashable
        Node to highlight in red.

    Side effects
    ------------
    Writes the non-name columns of ``nodes`` onto the subgraph's nodes as
    attributes, and draws a new matplotlib figure.
    """
    # access the G_int
    G_int=sub_G_dic[int_graph]

    def width_gen(G_int):
        """Return edge widths: 10 * weight / smallest weight, in edge order."""
        base=10
        weights=np.asarray(list(nx.get_edge_attributes(G_int, 'weight').values()),
                           dtype=float)
        # Work on an array explicitly: dividing a plain list by a scalar only
        # succeeds when the scalar happens to be a NumPy type. An edgeless
        # subgraph has no minimum, so return the empty array as-is.
        if weights.size==0:
            return weights
        return base*(weights/weights.min())
    width=width_gen(G_int)

    # rows of the node table that belong to G_int
    int_df=nodes[nodes['name'].isin(list(G_int.nodes))]

    # dict for assigning attributes to the nodes (every column except 'name')
    int_node_dict=int_df.set_index('name').to_dict(orient='index')

    # set the network node attributes
    nx.set_node_attributes(G_int,int_node_dict)

    # node sizes (scaled for display) and node colours, both in G_int node order
    int_node_size=list(nx.get_node_attributes(G_int,'nodesize').values())
    int_node_size=pd.Series(int_node_size).array*20
    int_node_color=list(nx.get_node_attributes(G_int,'group').values())

    # every node of the subgraph is labelled with its own name
    int_labels={node:node for node in G_int.nodes}

    def draw_graph_int(figsize,k,graph,node_size,node_color,node_int,edge_width,label,title):
        """Draw ``graph`` with a fresh spring layout and highlight ``node_int``."""
        fig = plt.figure(figsize=figsize)
        highlighted=list(node_int)
        layout = nx.layout.spring_layout(graph,k=k)

        nx.draw_networkx_nodes(graph,
                               layout,
                               node_size=node_size,
                               node_color=node_color,
                               alpha=1)

        # the tag of interest is redrawn in red on top of the regular nodes
        nx.draw_networkx_nodes(graph,
                               layout,
                               node_color='red',
                               nodelist=highlighted,
                               alpha=1)

        nx.draw_networkx_edges(graph,
                               layout,
                               edge_color='blue',
                               width=edge_width,
                               alpha=0.1)

        nx.draw_networkx_labels(graph,
                                layout,
                                labels=label,
                                font_color='red',
                                font_size=75,
                                alpha=1,
                                verticalalignment='bottom')
        plt.title(title,fontsize=60,color='blue')

    # draw the graph
    draw_graph_int(figsize=(50,40),
                   k=0.1,
                   graph=G_int,
                   node_size=int_node_size,
                   node_color=int_node_color,
                   node_int=[int_tag_name],
                   edge_width=width,
                   label=int_labels,
                   title=int_graph+' Network')


### 5.3. Select tag related network plot

In [None]:
# Plot the neighbourhood network of every tag in the interest list,
# pairing each subgraph key with its tag.
for int_graph, int_tag_name in zip(sub_G_name, interest_tags):
    plot_int_tag_graph(int_graph, int_tag_name)