In [18]:
### Import packages

import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [30]:
### Define nodes

# Colour palette, one colour per node category
NODE_CSV = '#666666'
NODE_BETWEEN = '#39254d'
NODE_PARQUET = '#678238'

# Node labels. The position of a label in this list becomes its node_id,
# so the order matters: CSV inputs first, then the intermediate tables,
# then the parquet outputs.
labels_list = [
    'calendar.csv',
    'listings.csv',
    'neighbourhoods.csv',
    'reviews.csv',
    'merged listings',
    'all hosts',
    'hosts.parquet',
    'host_verification.parquet',
    'neighbourhoods.parquet',
    'listings.parquet',
    'listing_amenities.parquet',
    'listing_complements.parquet',
    'reviews.parquet',
    'calendar.parquet',
]

# One colour per label, in the same order as labels_list:
# 4 CSV inputs, 2 intermediate tables, 8 parquet outputs.
labels_colour_list = [NODE_CSV] * 4 + [NODE_BETWEEN] * 2 + [NODE_PARQUET] * 8

# Node DataFrame: the default integer index is exposed as the 'node_id' column
df_sankey_nodes = (
    pd.DataFrame({'node': labels_list, 'colour': labels_colour_list})
    .reset_index(drop = False)
    .rename(columns = {'index': 'node_id'})
)
df_sankey_nodes

Unnamed: 0,node_id,node,colour
0,0,calendar.csv,#666666
1,1,listings.csv,#666666
2,2,neighbourhoods.csv,#666666
3,3,reviews.csv,#666666
4,4,merged listings,#39254d
5,5,all hosts,#39254d
6,6,hosts.parquet,#678238
7,7,host_verification.parquet,#678238
8,8,neighbourhoods.parquet,#678238
9,9,listings.parquet,#678238


In [38]:
### Define links

# Scale function for data size
def scale_function(num):
    """Return the natural logarithm of ``num``.

    The link sizes passed through this function range from tens to tens of
    millions; taking the log brings them onto a comparable scale.
    """
    log_scaled = np.log(num)
    return log_scaled

# Link sizes, log-scaled
LINK_CALENDAR = scale_function(24322724)
LINK_REVIEW = scale_function(1043004)
LINK_NEIGHBOURHOOD = scale_function(33)
LINK_LISTINGS = scale_function(66641)
# Total flowing out of 'merged listings' = sum of the two links feeding it
LINK_MERGED_LISTS = LINK_LISTINGS + LINK_NEIGHBOURHOOD

# Each entry is [source label, target label, link size].
# The shares leaving 'merged listings' (0.30 + 0.35 + 0.20 + 0.15) and
# 'all hosts' (0.70 + 0.30) each add up to the node's incoming size.
links_list = [
    ['calendar.csv', 'calendar.parquet', LINK_CALENDAR],
    ['reviews.csv', 'reviews.parquet', LINK_REVIEW],
    ['neighbourhoods.csv', 'merged listings', LINK_NEIGHBOURHOOD],
    ['neighbourhoods.csv', 'neighbourhoods.parquet', LINK_NEIGHBOURHOOD],
    ['listings.csv', 'merged listings', LINK_LISTINGS],
    ['merged listings', 'all hosts', LINK_MERGED_LISTS * 0.30],
    ['merged listings', 'listings.parquet', LINK_MERGED_LISTS * 0.35],
    ['merged listings', 'listing_amenities.parquet', LINK_MERGED_LISTS * 0.20],
    ['merged listings', 'listing_complements.parquet', LINK_MERGED_LISTS * 0.15],
    ['all hosts', 'hosts.parquet', LINK_MERGED_LISTS * 0.30 * 0.70],
    ['all hosts', 'host_verification.parquet', LINK_MERGED_LISTS * 0.30 * 0.30],
]

# Build the link table, then left-join the node table twice to attach the
# numeric id of each endpoint (source_id, target_id), and finally add the
# default link colour.
df_sankey_links = (
    pd.DataFrame(links_list, columns = ['source', 'target', 'value'])
    .merge(
        df_sankey_nodes[['node_id', 'node']].rename(
            columns = {'node_id': 'source_id', 'node': 'source'}),
        how = 'left',
        on = 'source')
    .merge(
        df_sankey_nodes[['node_id', 'node']].rename(
            columns = {'node_id': 'target_id', 'node': 'target'}),
        how = 'left',
        on = 'target')
    .assign(colour = '#dbf3fa')
)

# Link colour overrides, applied in order (the later rule wins):
# links leaving a CSV file, then links going into 'all hosts'.
df_sankey_links.loc[df_sankey_links['source'].str.endswith('csv'), 'colour'] = '#fff0d1'
df_sankey_links.loc[df_sankey_links['target'] == 'all hosts', 'colour'] = '#d2b2e5'
df_sankey_links

Unnamed: 0,source,target,value,source_id,target_id,colour
0,calendar.csv,calendar.parquet,17.006922,0,13,#fff0d1
1,reviews.csv,reviews.parquet,13.857616,3,12,#fff0d1
2,neighbourhoods.csv,merged listings,3.496508,2,4,#fff0d1
3,neighbourhoods.csv,neighbourhoods.parquet,3.496508,2,8,#fff0d1
4,listings.csv,merged listings,11.107075,1,4,#fff0d1
5,merged listings,all hosts,4.381075,4,5,#d2b2e5
6,merged listings,listings.parquet,5.111254,4,9,#dbf3fa
7,merged listings,listing_amenities.parquet,2.920717,4,10,#dbf3fa
8,merged listings,listing_complements.parquet,2.190537,4,11,#dbf3fa
9,all hosts,hosts.parquet,3.066752,5,6,#dbf3fa


In [39]:
### Plot Sankey diagram

# Build the Sankey trace from the node table (labels, colours) and the link
# table (endpoint ids, sizes, colours).
fig = go.Figure(data = [go.Sankey(
    # Link values are log-scaled sizes, not user counts: show two decimals
    # in the hover text and label the number accordingly.
    valueformat = '.2f',
    valuesuffix = ' (log-scaled size)',
    # Define nodes
    node = dict(
        pad = 15,
        thickness = 15,
        line = dict(color = "black", width = 0.5),
        label = df_sankey_nodes['node'],
        color = df_sankey_nodes['colour']
    ),
    # Add links; source/target are positions in the node label list
    link = dict(
        source = df_sankey_links['source_id'],
        target = df_sankey_links['target_id'],
        value = df_sankey_links['value'],
        color = df_sankey_links['colour']
    )
)])

# Centre the title above the diagram
fig.update_layout(title_text = 'Data flow from sources to parquet files',
                  title_x = 0.5,
                  font_size = 14)
fig.show()