# The Networks That Bind Kaggle

![image](https://i.imgur.com/FTP7mNR.png)

In this challenge we are provided response data from Kaggle's 2020 Survey. Out of the endless possible ways to explore the data, I wanted to choose one that highlights the **community** of Kaggle. Kaggle is a diverse community unlike any other, where people from different cultures and backgrounds interact with each other to solve interesting and compelling problems, as well as to discuss the latest trends and technology in the data science space.

- **Methodology**

This notebook will focus on representing the Kaggle community using *network* visualizations. Networks contain `nodes` and `edges`. In this data set each respondent is a `node`, and the edges depend on how we define the relationship between respondents. We will explore a few different networks and hopefully find some interesting things about the community.

So, without further ado... let's take a look at the data!

Some things to note:
- Much of the code is hidden, but you can expand the hidden cells to see the full codebase used to create these visualizations.
- The graphs take some time to load so please be patient after first opening the notebook.

In [None]:
!pip install pyvis >> /dev/null

In [None]:
# %load kagnet.py
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from tqdm.notebook import tqdm
import multiprocessing
from pyvis.network import Network
import networkx as nx

from functools import partial
from tqdm.contrib.concurrent import process_map

# Show up to 500 columns when displaying the wide survey frames. The fully
# qualified key is used because the bare "max_columns" pattern is ambiguous
# (and raises OptionError) in pandas releases that also register
# "styler.render.max_columns".
pd.set_option("display.max_columns", 500)

def load_data(data_dir="../input/kaggle-survey-2020/", id_name="rid"):
    """
    Load the survey responses and give each responder a unique id column.

    The first row of the CSV holds the question text rather than an answer,
    so it is split off into its own single-row dataframe.

    Args:
        data_dir (str): Directory containing
            ``kaggle_survey_2020_responses.csv``. A trailing path separator
            is optional.
        id_name (str): Column name for the unique identifier of each
            responder. NOTE: the other helpers in this file look the column
            up as ``"rid"``, so keep the default when using them.

    Returns:
        tuple: ``(surv, qtext)`` where ``surv`` holds one row per responder
        (with the id column first) and ``qtext`` is the one-row dataframe of
        question text.
    """
    import os

    # os.path.join tolerates a data_dir with or without a trailing slash.
    csv_path = os.path.join(data_dir, "kaggle_survey_2020_responses.csv")
    surv = pd.read_csv(csv_path, low_memory=False)
    qtext = surv.loc[[0]]  # Text of questions
    surv = surv.drop(0)  # Remove question text
    # The former row label (1, 2, ...) becomes the responder id.
    surv = surv.reset_index().rename(columns={"index": id_name})
    return surv, qtext


def get_edge(surv, rid, qs=["Q1", "Q3", "Q4", "Q5", "Q6"]):
    """
    Find the edge connections for an individual responder.

    The weight of the edge to every other responder is the number of
    questions in `qs` that both answered identically. Missing answers (NaN)
    never count as a match.

    Args:
        surv (dataframe): survey response dataframe; must contain a ``rid``
            column and every column named in `qs`
        rid (int): responder id to find associated links for
        qs (list, optional): list of questions to include
            Defaults to ['Q1','Q3','Q4','Q5','Q6'].

    Returns:
        dataframe: one row per other responder with the columns
        ``base_rid`` (always `rid`), ``rid`` (the neighbor) and
        ``edge_weight`` (count of matching answers).

    Raises:
        ValueError: if `rid` does not identify exactly one row of `surv`.
    """
    qs = list(qs)  # never touch the (shared) default list
    is_base = surv["rid"] == rid
    base_rows = surv.loc[is_base, qs]
    if len(base_rows) != 1:
        raise ValueError(
            f"expected exactly one row with rid == {rid!r}, found {len(base_rows)}"
        )
    base_answers = base_rows.iloc[0]

    others = surv.loc[~is_base, ["rid"] + qs]
    # Column-aligned, element-wise comparison. NaN == NaN evaluates to False,
    # so unanswered questions contribute nothing to the weight.
    matches = others[qs].eq(base_answers, axis="columns")

    edge_df = others[["rid"]].copy()
    edge_df["edge_weight"] = matches.sum(axis=1)
    edge_df["base_rid"] = rid
    return edge_df[["base_rid", "rid", "edge_weight"]]


def create_links(
    surv, min_weight=0, qs=["Q1", "Q3", "Q4", "Q5", "Q6"], parallel=None, chunksize=100
):
    """Create the weighted links between every pair of responders.

    Calls `get_edge` once per unique ``rid`` in `surv`, stacks the results
    and keeps only the links whose weight reaches `min_weight`.

    Args:
        surv (dataframe): survey dataframe with a ``rid`` column
        min_weight (int): minimum weight of edge to include
        qs (list): questions to include
        parallel (int, optional): when not None, the per-responder work is
            dispatched through `process_map` with this many workers.
            Defaults to None (plain in-process loop).
        chunksize (int): chunk size handed to `process_map`; ignored when
            `parallel` is None.

    Returns:
        dataframe: columns ``base_rid``, ``rid`` and ``edge_weight`` with a
        fresh 0..n-1 index. Empty (but with those columns) when `surv` has
        no responders.
    """
    rids = surv["rid"].unique()

    if parallel is not None:
        # Bind `qs` as well; otherwise the workers silently fall back to
        # get_edge's default question list.
        func = partial(get_edge, surv, qs=qs)
        lnks = process_map(func, rids, max_workers=parallel, chunksize=chunksize)
    else:
        lnks = [get_edge(surv, r, qs=qs) for r in rids]

    if not lnks:
        # pd.concat refuses an empty list, so hand back an empty edge table.
        return pd.DataFrame(columns=["base_rid", "rid", "edge_weight"])

    links = pd.concat(lnks).reset_index(drop=True)
    links = links.query("edge_weight >= @min_weight").reset_index(drop=True)
    return links


def create_node_title(links, surv, responder_col, title_option=1):
    """Create the appropriate hover title for the responder on each link.

    Args:
        links (dataframe): contains links between rids
        surv (dataframe): contains survey responses; needs the columns
            ``rid``, ``Q1``, ``Q3`` and ``Q5``
        responder_col (str): column of `links` holding the responder id to
            describe (``'base_rid'`` or ``'rid'``).
        title_option (int): title format option. Only option 1 exists:
            "<Q1> year old <Q5 lower-cased> from <Q3>".

    Returns:
        series: one title per row of `links`, in the same order. A title is
        NaN when any of the three answers is missing.

    Raises:
        ValueError: if `title_option` is not a supported format.
    """
    if title_option != 1:
        # Previously this fell through to an UnboundLocalError on `titles`.
        raise ValueError(f"unsupported title_option: {title_option!r}")

    # Only the columns the title needs are merged in; a left join keeps one
    # output row per link, in link order.
    answers = surv[["rid", "Q1", "Q3", "Q5"]]
    merged = links.merge(answers, how="left", left_on=responder_col, right_on="rid")
    titles = (
        merged["Q1"]
        + " year old "
        + merged["Q5"].str.lower()
        + " from "
        + merged["Q3"]
    )
    return titles


def create_kaggle_network(
    links,
    surv,
    node_size=5,
    heading="KaggleNet",
    height="500px",
    width="400px",
    bgcolor="#222222",
    font_color="white",
):
    """Build a pyvis network with one node per responder and one edge per link.

    Args:
        links (dataframe): edge table with the columns ``base_rid``, ``rid``
            and ``edge_weight``
        surv (dataframe): survey responses used to build the node titles
        node_size (int): size given to every node
        heading (str): heading passed to the pyvis ``Network``
        height (str): canvas height passed to the pyvis ``Network``
        width (str): canvas width passed to the pyvis ``Network``
        bgcolor (str): background color passed to the pyvis ``Network``
        font_color (str): font color passed to the pyvis ``Network``

    Returns:
        Network: the populated network (created with ``notebook=True``).
    """
    kag_net = Network(
        height=height,
        width=width,
        bgcolor=bgcolor,
        font_color=font_color,
        heading=heading,
        notebook=True,
    )

    src_titles = create_node_title(links, surv, responder_col="base_rid")
    dst_titles = create_node_title(links, surv, responder_col="rid")
    edge_data = zip(
        links["base_rid"], links["rid"], links["edge_weight"], src_titles, dst_titles
    )

    for src, dst, weight, tsrc, tdst in edge_data:
        # The responder id doubles as the node label.
        kag_net.add_node(src, src, title=tsrc, size=node_size)
        kag_net.add_node(dst, dst, title=tdst, size=node_size)
        # Edge width is the link weight scaled down by 10.
        kag_net.add_edge(src, dst, width=weight / 10)

    # Titles may be NaN when an answer was missing; store them as plain text.
    for node in kag_net.nodes:
        node["title"] = str(node["title"])
    return kag_net

def create_kaggle_network_color(
    links,
    surv,
    lcolor, rcolor,
    node_size=5,
    heading="KaggleNet",
    height="500px",
    width="400px",
    bgcolor="#222222",
    font_color="white",
):
    """Build a pyvis network like `create_kaggle_network`, with colored nodes.

    Args:
        links (dataframe): edge table with the columns ``base_rid``, ``rid``
            and ``edge_weight``
        surv (dataframe): survey responses used to build the node titles
        lcolor (iterable): color for the ``base_rid`` node of each link, in
            link order
        rcolor (iterable): color for the ``rid`` node of each link, in link
            order
        node_size (int): NOTE(review): accepted but not applied to the nodes,
            so they are drawn at the pyvis default size. Left unchanged so
            existing graphs render as before.
        heading (str): heading passed to the pyvis ``Network``
        height (str): canvas height passed to the pyvis ``Network``
        width (str): canvas width passed to the pyvis ``Network``
        bgcolor (str): background color passed to the pyvis ``Network``
        font_color (str): font color passed to the pyvis ``Network``

    Returns:
        Network: the populated network (created with ``notebook=True``).
    """
    kag_net = Network(
        height=height,
        width=width,
        bgcolor=bgcolor,
        font_color=font_color,
        heading=heading,
        notebook=True,
    )

    src_titles = create_node_title(links, surv, responder_col="base_rid")
    dst_titles = create_node_title(links, surv, responder_col="rid")
    # zip stops at the shortest input, so lcolor/rcolor need one entry per link.
    edge_data = zip(
        links["base_rid"],
        links["rid"],
        links["edge_weight"],
        src_titles,
        dst_titles,
        lcolor,
        rcolor,
    )

    for src, dst, weight, tsrc, tdst, cl, cr in edge_data:
        # The responder id doubles as the node label.
        kag_net.add_node(src, src, title=tsrc, color=cl)
        kag_net.add_node(dst, dst, title=tdst, color=cr)
        # Edge width is the link weight scaled down by 10.
        kag_net.add_edge(src, dst, width=weight / 10)

    # Titles may be NaN when an answer was missing; store them as plain text.
    for node in kag_net.nodes:
        node["title"] = str(node["title"])
    return kag_net

# Graph the Kaggle Community
**Zoom in and hover over a node to see a description of that kaggler.**

This first graph network includes the entire community. Specifically we are looking at responses to 5 of the first 6 questions on the survey which describe their:
- Age
- Country
- Education
- Job Title
- Years of Experience

Any respondents who had 3 or more questions answered the same are assigned an edge connecting each other. It's interesting to see how clumps of responders form. The largest clump appears to be the younger crowd, and within that clump an even tighter group appears almost as a ball. Zooming in and hovering over some of these responders, I've noticed that this tightest clump appears to be younger students, many from India. They are a strong sub-group of the Kaggle population! At the same time it's cool to see that almost every respondent is somehow connected to others, showing that Kaggle consists of a wide, diverse group of data scientists who are still tightly tied together.

This short video shows an example of how you can explore the graphs:

In [None]:
# Embed the screen recording that demonstrates exploring a graph. The Video
# object is the cell's final expression, which is what makes the notebook
# render it.
from IPython.display import Video
Video("https://i.imgur.com/a0skoUX.mp4")

In [None]:
# Whole-community graph, built from a reproducible sample of 500 responders.
surv, qtext = load_data()
surv_ = surv.sample(n=500, random_state=529)

# Keep only pairs that share at least 3 answers (default question list).
links = create_links(surv_, min_weight=3, parallel=None, chunksize=10)

kag_net = create_kaggle_network(
    links,
    surv_,
    node_size=5,
    height="1000px",
    width="100%",
)
kag_net.barnes_hut()  # use pyvis' Barnes-Hut physics layout
kag_net.show("kagnet.html")

# Respondents from The United States

Next I wanted to look at the connections between kagglers in my home country, the United States. I've expanded the list of questions included in this analysis: it covers the same relationships as before (age, education, title, etc.) plus additional questions, like the coding language they would recommend, the size of their company and their annual compensation.

I find it interesting that:
- The dark red group appears to be those that hold the "data scientist" title.
- A few edge groups emerge. If you zoom in on these groupings of 2-5 responders who aren't connected to the main group you will find: 
    - A pair of statisticians from the older age bracket (50-70)
    - A pair of data analysts who use similar coding tools
    - More edge groups with some similarities that are especially unique in the community. I encourage you to zoom in and check them out!

In [None]:
surv, qtext = load_data()

# Questions compared between responders to build the network.
# Not used here: Q2 (gender), Q3 (country), Q7_Part_1 (prog lang),
# Q11 (compute platform), Q32 (BI tools).
qs = [
    "Q1",   # Age
    "Q4",   # Education
    "Q5",   # Title
    "Q6",   # How long coding
    "Q8",   # Lang recommend
    "Q15",  # Years using ML
    "Q20",  # Size of company
    "Q21",  # DS employees
    "Q24",  # Compensation
    "Q30",  # Big Data product
]

# Fully-answered US responders only, down-sampled reproducibly to 500.
surv_ = (
    surv[["rid", "Q3"] + qs]
    .dropna()
    .query('Q3 == "United States of America"')
    .sample(500, random_state=42)
)

links = create_links(surv_, min_weight=6, qs=qs)
# Shift the weights so the weakest surviving link has weight 1.
weakest = links["edge_weight"].min()
links["edge_weight"] = links["edge_weight"] - weakest + 1

# One color per job title, taken from the full survey's set of titles.
color_col = "Q5"
categories = surv[color_col].unique()
colors = sns.color_palette("deep", len(categories)).as_hex()
colordict = dict(zip(categories, colors))
surv_["color"] = [colordict[title] for title in surv_[color_col]]

# Per-link colors for the source (base_rid) and target (rid) nodes.
node_colors = surv_[["rid", "color"]]
lcolor = links.merge(
    node_colors, how="left", left_on="base_rid", right_on="rid"
)["color"].to_numpy()
rcolor = links.merge(node_colors, how="left", on="rid")["color"].to_numpy()

kag_net = create_kaggle_network_color(
    links,
    surv_,
    lcolor,
    rcolor,
    node_size=50,
    font_color="white",
    height="1000px",
    width="100%",
    heading="US Kagglers Colored by Career",
)
kag_net.barnes_hut()  # use pyvis' Barnes-Hut physics layout
kag_net.show("kagnet_us.html")

# Women of Kaggle Networked
Women are unfortunately underrepresented in the Kaggle community, a gap we all hope becomes more balanced in the future. We can take encouragement from the awesome group of female kagglers who responded to the survey. The graph is again colored by job title, and I find it interesting that this graph seems to show much more balance than the others. Software engineers, data scientists, and analysts are all well represented, and they share a lot of common responses in the survey.

In [None]:
surv, qtext = load_data()

# Questions compared between responders to build the network.
# Not used here: Q2 (gender), Q7_Part_1 (prog lang),
# Q11 (compute platform), Q32 (BI tools).
qs = [
    "Q1",   # Age
    "Q3",   # Country
    "Q4",   # Education
    "Q5",   # Title
    "Q6",   # How long coding
    "Q8",   # Lang recommend
    "Q15",  # Years using ML
    "Q20",  # Size of company
    "Q21",  # DS employees
    "Q24",  # Compensation
    "Q30",  # Big Data product
]

# Every fully-answered response whose gender answer is "Woman" (no sampling).
surv_ = surv[["rid", "Q2"] + qs].dropna().query('Q2 == "Woman"')

links = create_links(surv_, min_weight=6, qs=qs)
# Shift the weights so the weakest surviving link has weight 1.
weakest = links["edge_weight"].min()
links["edge_weight"] = links["edge_weight"] - weakest + 1

# One color per job title, taken from the full survey's set of titles.
color_col = "Q5"
categories = surv[color_col].unique()
colors = sns.color_palette("deep", len(categories)).as_hex()
colordict = dict(zip(categories, colors))
surv_["color"] = [colordict[title] for title in surv_[color_col]]

# Per-link colors for the source (base_rid) and target (rid) nodes.
lcolor = links.merge(
    surv_[["rid", "color"]], how="left", left_on="base_rid", right_on="rid"
)["color"].to_numpy()
rcolor = links.merge(
    surv_[["rid", "color"]], how="left", on="rid"
)["color"].to_numpy()

kag_net = create_kaggle_network_color(
    links,
    surv_,
    lcolor,
    rcolor,
    node_size=50,
    font_color="white",
    height="1000px",
    width="100%",
    heading="Women of Kaggle",
)
kag_net.barnes_hut()  # use pyvis' Barnes-Hut physics layout
kag_net.show("kagnet_women.html")

# More Connections - Colored by Country.
Finally we graph the entire community again, but this time coloring by country. It's inspiring to remember that each dot represents a human being who is engaged with the community enough to want to respond to the survey! What a strong showing!!

A few things to note:
- Several groups of misfits appear again, with clumps of 2-10 responders who share no strong connections to the main group, but are strongly connected to each other.
- India is represented again as a very strong tight knit community in the center (purple colored).
- Explore more yourself and see what interesting groupings you can find.

In [None]:
surv, qtext = load_data()

# Questions compared between responders to build the network.
# Not used here: Q7_Part_1 (prog lang), Q11 (compute platform),
# Q20 (size of company), Q21 (DS employees), Q30 (big data product),
# Q32 (BI tools).
qs = [
    "Q1",   # Age
    "Q2",   # Gender
    "Q3",   # Country
    "Q4",   # Education
    "Q5",   # Title
    "Q6",   # How long coding
    "Q8",   # Lang recommend
    "Q15",  # Years using ML
    "Q24",  # Compensation
]

# Seed the sample so the rendered graph is the same on every run, as in the
# other cells; an unseeded sample draws a different network each time.
surv_ = surv[["rid"] + qs].dropna().sample(500, random_state=42)

links = create_links(surv_, min_weight=6, qs=qs)
# Shift the weights so the weakest surviving link has weight 1 ...
links["edge_weight"] = links["edge_weight"] - links["edge_weight"].min() + 1
# ... then cap them at 1, which gives every edge the same width.
links["edge_weight"] = links["edge_weight"].clip(0, 1)

# One color per country, taken from the full survey's set of countries.
color_col = "Q3"
categories = surv[color_col].unique()
colors = sns.color_palette("pastel", len(categories)).as_hex()
colordict = dict(zip(categories, colors))
surv_["color"] = surv_[color_col].apply(lambda x: colordict[x])

# Per-link colors for the source (base_rid) and target (rid) nodes.
lcolor = links.merge(
    surv_[["rid", "color"]], how="left", left_on="base_rid", right_on="rid"
)["color"].values
rcolor = links.merge(
    surv_[["rid", "color"]], how="left", left_on="rid", right_on="rid"
)["color"].values

kag_net = create_kaggle_network_color(
    links,
    surv_,
    lcolor,
    rcolor,
    font_color="white",
    height="1000px",
    width="100%",
    heading="Kaggler Network Colored by Country",
)
kag_net.barnes_hut()  # use pyvis' Barnes-Hut physics layout
kag_net.show("kagnet_global.html")

# Thanks for reading!

I appreciate you taking the time to read my notebook and hope you found something interesting! Please leave a comment if you have suggestions for things you would like to see.

# References:
- **From DataFrame to Network Graph** https://towardsdatascience.com/from-dataframe-to-network-graph-bbb35c8ab675
- **pyvis tutorial** https://pyvis.readthedocs.io/en/latest/tutorial.html