## Kagglers teams and users graph

The goals of this script are:

* to show how different users form teams,
* to find the centers of community among Kagglers (which may help if you want to compete as a team :)),
* to see the largest teams in the history of Kaggle competitions,
* to look at an overloaded graph :)

In [None]:
# -*- coding: utf-8 -*-
import plotly
plotly.offline.init_notebook_mode(connected=True) # run at the start of every IPython notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import plotly.plotly as py
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs 
import networkx as nx

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default matplotlib figure size as [width, height], in inches.
plt.rcParams['figure.figsize'] = [16,10]

In [None]:
# Open the SQLite database and read the three tables used below into
# DataFrames. The queries run in the order listed here.
conn = sqlite3.connect(r'../input/database.sqlite')
_TABLE_QUERIES = (
    "select * from Teams",
    "select * from Users",
    "select * from TeamMemberships",
)
teams, users, teammembers = (
    pd.read_sql_query(query, conn) for query in _TABLE_QUERIES
)

In [None]:
# Number of membership rows per team; keep only teams with more than one
# participant.
teams_q = teammembers.groupby('TeamId').UserId.count()
teams_q = teams_q[teams_q > 1].reset_index()

# Both frames carry a `UserId` column, so after this merge `UserId_x` is the
# member's user id and `UserId_y` is the member count of the team.
teammembers_cut = teammembers.merge(teams_q, on='TeamId')

# Number of (multi-member) teams per user; keep only users that teamed more
# than once, due to computational limitations.
users_q = teammembers_cut.groupby('UserId_x').TeamId.count()
users_q = users_q[users_q > 1].reset_index()

# Both frames carry a `TeamId` column, so after this merge `TeamId_x` is the
# team id and `TeamId_y` is the number of teams the user belongs to.
teammembers_cut = teammembers_cut.merge(users_q, on='UserId_x')
teammembers_cut = teammembers_cut.sort_values(['UserId_y', 'TeamId_x'])
# NOTE(review): this second merge with `users_q` appears to only add another
# copy of the per-user team count (as column `TeamId`); it is kept so the
# resulting columns stay exactly as before -- confirm before removing.
teammembers_cut = teammembers_cut.merge(users_q, on='UserId_x')
# Attach team attributes and user attributes to every membership row.
teammembers_cut = teammembers_cut.merge(teams, left_on='TeamId_x', right_on='Id')
teammembers_cut = teammembers_cut.merge(users, left_on='UserId_x', right_on='Id')

# Edge list for the graph: one (team node, user node) pair per membership row.
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `teammembers_cut` (avoids SettingWithCopyWarning).
tm4graph = teammembers_cut[['TeamId_x', 'UserId_x']].copy()
tm4graph['TeamId_x'] = 'Team_' + tm4graph['TeamId_x'].astype(str)
tm4graph['UserId_x'] = 'User_' + tm4graph['UserId_x'].astype(str)

In [None]:
## ForceAtlas layout for networkx, adapted from https://github.com/tpoisot/nxfa2/blob/master/forceatlas.py

from scipy.sparse import spdiags, coo_matrix
import scipy as sp
import numpy as np

import matplotlib.pyplot as plt


## Now the layout function
def forceatlas2_layout(G, iterations=10, linlog=False, pos=None, nohubs=False,
                       kr=0.001, k=None, dim=2):
    """
    Compute a force-directed (ForceAtlas-style) layout for the graph ``G``.

    On every iteration each node is pushed away from all other nodes with a
    repulsion of ``k**2 / distance**2`` and pulled towards its neighbours with
    an attraction of ``weight * distance / k``.  Every node is then moved
    along its net force by a step ``t`` (less when the net force is tiny);
    ``t`` starts at 0.1 and decreases linearly after each iteration.

    Parameters
    ----------
    G : networkx graph
        The graph to lay out.  As a side effect the node attributes
        ``'prevcs'`` and ``'currcs'`` are set to 0 on every node.
    iterations : int
        Number of iterations to run.
    linlog : bool
        If true, use ``log(repulsion + 1)`` instead of the raw repulsion.
    pos : numpy.ndarray or None
        Optional initial positions, one row per node in the iteration order
        of ``G`` with ``dim`` columns.  The array is copied, not modified.
        If None, positions are drawn uniformly at random from [0, 1).
    nohubs : bool
        If true, the repulsion acting on a node is divided by the sum of its
        adjacency row plus one.
    kr : float
        Currently unused; kept so existing callers keep working.
    k : float or None
        Distance scale of the forces.  Defaults to ``sqrt(1 / number_of_nodes)``.
    dim : int
        Dimension of the layout.

    Returns
    -------
    dict
        Maps each node of ``G`` to its position (a row of the position array).
        An empty graph yields an empty dict.
    """
    # Nothing to lay out; this also avoids a division by zero for `k` below.
    if len(G) == 0:
        return {}

    # `G.node` was removed in networkx 2.4; newer versions expose `G.nodes`.
    node_attrs = getattr(G, 'node', None)
    if node_attrs is None:
        node_attrs = G.nodes
    # We add attributes to store the current and previous convergence speed
    for n in G:
        node_attrs[n]['prevcs'] = 0
        node_attrs[n]['currcs'] = 0

    # `to_scipy_sparse_matrix` was removed in networkx 3.0 in favour of
    # `to_scipy_sparse_array`; use whichever this networkx version provides.
    to_sparse = getattr(nx, 'to_scipy_sparse_array', None)
    if to_sparse is None:
        to_sparse = nx.to_scipy_sparse_matrix
    A = to_sparse(G, dtype='f')
    nnodes, _ = A.shape

    # LIL format gives cheap access to single rows.
    try:
        A = A.tolil()
    except AttributeError:
        A = coo_matrix(A).tolil()

    if pos is None:
        pos = np.asarray(np.random.random((nnodes, dim)), dtype=A.dtype)
    else:
        pos = pos.astype(A.dtype)
    if k is None:
        k = np.sqrt(1.0 / nnodes)

    # the initial "temperature" is about .1 of domain area (=1x1)
    # this is the largest step allowed in the dynamics.
    t = 0.1
    # simple cooling scheme: linearly step down by dt on each iteration.
    dt = t / float(iterations + 1)
    displacement = np.zeros((dim, nnodes))
    for iteration in range(iterations):
        displacement *= 0
        # loop over rows
        for i in range(nnodes):
            # difference between this row's node position and all others
            delta = (pos[i] - pos).T
            # distance between points
            distance = np.sqrt((delta ** 2).sum(axis=0))
            # enforce minimum distance of 0.01
            distance = np.where(distance < 0.01, 0.01, distance)
            # the adjacency matrix row, as a dense (1, nnodes) array
            Ai = np.asarray(A.getrowview(i).toarray())
            # repulsive part of the displacement "force"
            repulsion = k * k / distance ** 2
            if nohubs:
                # Scalar sum: calling float() on a 1-element array is
                # deprecated in recent NumPy versions.
                repulsion = repulsion / (float(Ai.sum()) + 1.0)
            if linlog:
                repulsion = np.log(repulsion + 1)
            displacement[:, i] += \
                (delta * (repulsion - Ai * distance / k)).sum(axis=1)
        # update positions: move every node by t along its net force
        length = np.sqrt((displacement ** 2).sum(axis=0))
        length = np.where(length < 0.01, 0.01, length)
        pos += (displacement * t / length).T
        # cool temperature
        t -= dt
    # Return the layout
    return dict(zip(G, pos))

In [None]:
# Shared axis settings: hide axis line, zero line, grid, tick labels and title.
axis = dict(
    showline=False,
    zeroline=False,
    showgrid=False,
    showticklabels=False,
    title='',
)

# Figure layout. Nested properties are given as plain dicts/lists instead of
# the deprecated graph_objs helper classes (Font, XAxis, YAxis, Margin,
# Annotations, Annotation).
layout = Layout(
    title="Kaggle teams/users universe",
    font=dict(size=12),
    showlegend=True,
    autosize=False,
    width=700,
    height=700,
    # Separate copies so the two axes never share one dict object.
    xaxis=dict(axis),
    yaxis=dict(axis),
    margin=dict(
        l=40,
        r=40,
        b=85,
        t=100,
    ),
    hovermode='closest',
    annotations=[
        # Empty annotation anchored just below the bottom-left corner of the
        # plotting area (paper coordinates).
        dict(
            showarrow=False,
            text='',
            xref='paper',
            yref='paper',
            x=0,
            y=-0.1,
            xanchor='left',
            yanchor='bottom',
            font=dict(size=14),
        ),
    ],
)

In [None]:
edges_to_use = 5000 # and computational limitations again

# Membership rows behind the plotted edges. `tm4graph` is a column subset of
# `teammembers_cut`, so the first `edges_to_use` rows of both describe the
# same memberships.
edge_rows = teammembers_cut.iloc[:edges_to_use]

# Bipartite graph: every edge links a 'Team_<id>' node to a 'User_<id>' node.
G = nx.Graph()
G.add_edges_from(tm4graph.values[:edges_to_use])
pos = forceatlas2_layout(G, iterations=300, nohubs=True)
N = G.number_of_nodes()
E = G.edges()
labels = G.nodes()

team_nodes = [k for k in labels if "Team" in k]
user_nodes = [k for k in labels if "User" in k]
team_ids = [int(k.replace('Team_', '')) for k in team_nodes]
user_ids = [int(k.replace('User_', '')) for k in user_nodes]

Xv_teams = [pos[k][0] for k in team_nodes]
Yv_teams = [pos[k][1] for k in team_nodes]
Xv_users = [pos[k][0] for k in user_nodes]
Yv_users = [pos[k][1] for k in user_nodes]

# Build per-team and per-user lookups once instead of filtering the whole
# frame again for every node.
first_team_row = edge_rows.drop_duplicates('TeamId_x').set_index('TeamId_x')
first_user_row = edge_rows.drop_duplicates('UserId_x').set_index('UserId_x')
team_name_of = first_team_row['TeamName'].to_dict()
team_size_of = first_team_row['UserId_y'].to_dict()    # members per team
user_name_of = first_user_row['DisplayName'].to_dict()
user_teams_of = first_user_row['TeamId_y'].to_dict()   # teams per user
members_of_team = (edge_rows.groupby('TeamId_x')['DisplayName']
                   .apply(list).to_dict())
teams_of_user = (edge_rows.groupby('UserId_x')['TeamName']
                 .apply(list).to_dict())


def _hover_text(value):
    """Return `value` unchanged, or '' when it is missing (None/NaN)."""
    return '' if pd.isnull(value) else value


labels_team = [team_name_of[team_id] for team_id in team_ids]
labels_users = [user_name_of[user_id] for user_id in user_ids]

# Edge coordinates; the None entries break the line between consecutive edges.
Xed = []
Yed = []
for edge in E:
    Xed.extend([pos[edge[0]][0], pos[edge[1]][0], None])
    Yed.extend([pos[edge[0]][1], pos[edge[1]][1], None])

trace3 = Scatter(x=Xed,
                 y=Yed,
                 mode='lines',
                 line=dict(color='rgb(200,200,200)', width=2),
                 name='Links',
                 hoverinfo='none'
                 )
trace4 = Scatter(x=Xv_teams,
                 y=Yv_teams,
                 mode='markers',
                 name='Teams',
                 # 'circle' replaces 'dot', which is not a valid marker symbol.
                 marker=dict(symbol='circle',
                             # marker size = number of members in the team
                             size=[team_size_of[team_id] for team_id in team_ids],
                             color='rgb(146,209,81)',
                             line=dict(color='rgb(50,50,50)', width=0.5)
                             ),
                 text=['Team: ' + _hover_text(team_name_of[team_id]).strip()
                       + '<br>Users: '
                       + ','.join(_hover_text(m) for m in members_of_team[team_id])
                       + u'<br>'
                       for team_id in team_ids],
                 hoverinfo='text'
                 )
trace5 = Scatter(x=Xv_users,
                 y=Yv_users,
                 mode='markers',
                 name='Users',
                 marker=dict(symbol='circle',
                             # marker size = half the number of teams of the user
                             size=[user_teams_of[user_id] * 0.5 for user_id in user_ids],
                             color='#000000',
                             line=dict(color='rgb(50,50,50)', width=0.5)
                             ),
                 text=['User: ' + _hover_text(user_name_of[user_id]).strip()
                       + '<br>Teams: '
                       + ','.join(_hover_text(t) for t in teams_of_user[user_id])
                       + u'<br>'
                       for user_id in user_ids],
                 hoverinfo='text'
                 )

# A plain list of traces replaces the deprecated `Data` container.
data1 = [trace3, trace4, trace5]
fig1 = Figure(data=data1, layout=layout)
plotly.offline.iplot(fig1)