In [12]:
import pyspark

MAX_MEMORY = "8g"  # 24 gives OOM here.

# Session settings, applied to the builder in exactly this order:
# the Delta package and its SQL extension/catalog, then the memory limits.
_spark_settings = [
    ("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.executor.memory", MAX_MEMORY),
    ("spark.driver.memory", MAX_MEMORY),
]

_builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)

# getOrCreate() hands back an already-running session if there is one.
spark = _builder.enableHiveSupport().getOrCreate()
spark

In [13]:
import os
import subprocess

# IMDb "name.basics" dump; wget stores it in the working directory under
# the URL's base name.
IMDB_NAMES_URL = "https://datasets.imdbws.com/name.basics.tsv.gz"
IMDB_NAMES_FILE = "name.basics.tsv.gz"

# Download only when the archive is not already present, so re-running the
# cell is cheap. An argv list avoids going through a shell, and check=True
# turns a failed download into an immediate error instead of a confusing
# "path does not exist" from Spark further down.
if not os.path.exists(IMDB_NAMES_FILE):
    subprocess.run(["wget", "-nc", IMDB_NAMES_URL], check=True)
#dbutils.fs.mkdirs('/tmp/imdb/')
#dbutils.fs.cp('file:///databricks/driver/name.basics.tsv.gz','dbfs:/FileStore/imdb/name.basics.tsv.gz')

# Tab-separated file with a header row; every column is read as a string
# because no schema is supplied or inferred.
df = spark.read.option("delimiter", "\t").option('header',True).csv(IMDB_NAMES_FILE)
display(df.limit(5).toPandas())

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0072308,tt0031983"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0075213,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0057345,tt0049189,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0050986,tt0060827,tt0069467"


In [14]:
# Upgrade pip itself, then quietly install ordered_set, which a later cell
# imports as OrderedSet. %pip targets the environment of the running kernel.
%pip install --upgrade pip 
%pip install -q ordered_set

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [15]:
import re
import colorsys
import random
import json
from ordered_set import OrderedSet

# Cache mapping a node key to the colour string assigned to it; filled
# lazily by get_color_for_node() so a node keeps its colour across calls.
node_colors = {}

def randcolor():
  """Return a random light colour as a ``#rrggbb`` hex string.

  Hue is drawn uniformly from [0, 1), lightness is fixed at 0.8 and
  saturation is drawn from [0.8, 1.0). Two values are consumed from the
  module-level ``random`` generator: hue first, then saturation.
  """
  hue = random.random()
  saturation = random.random() * .2 + .8
  channels = colorsys.hls_to_rgb(hue, 0.8, saturation)
  hex_pairs = [format(int(255 * channel), '02x') for channel in channels]
  return "#" + ''.join(hex_pairs)

def get_color_for_node(node):
  """Return the colour remembered for ``node``, assigning one on first use.

  Looks ``node`` up in the module-level ``node_colors`` dict. When there is
  no (truthy) entry yet, a fresh colour from ``randcolor()`` is stored under
  ``node`` and returned.
  """
  cached = node_colors.get(node)
  if cached:
    return cached
  fresh = randcolor()
  node_colors[node] = fresh
  return fresh

def show_linkchart(dots,engine='neato'):
    """Build a standalone HTML page that renders a sequence of DOT graphs.

    The page loads d3 and d3-graphviz from public CDNs, draws ``dots[0]``
    immediately and offers two buttons: "more" renders the next graph in
    ``dots`` (wrapping around at the end) and "less" resets the zoom and
    goes back to the first graph.

    Args:
        dots: sequence of DOT source strings; the first one is shown on load.
        engine: Graphviz layout engine name passed to d3-graphviz
            (for example 'neato' or 'dot').

    Returns:
        The complete HTML document as a string.
    """
    # Plain (non-f) string, so JavaScript braces are written literally.
    jscript = '''
    var graphviz = d3.select("#graph").graphviz()
        .width("100%")
        .height(600)
        .zoomScaleExtent([0.01,100])
        .fit(1)
        .engine(engine).transition(function() {
            return d3.transition()
                .delay(100)
                .duration(1000);
        }).renderDot(d0);

    function render() {
      var dot = dots[dotIndex];
      graphviz
        .renderDot(dot)
        .on("end", function () {
            dotIndex = (dotIndex + 1) % dots.length;
        });
   }
  '''
    # Serialise the Python values as JSON, which is valid JavaScript literal
    # syntax, instead of relying on repr(). Escaping "</" keeps a DOT string
    # that happens to contain "</script>" from closing the script element.
    dots_js = json.dumps(list(dots)).replace("</", "<\\/")
    engine_js = json.dumps(engine).replace("</", "<\\/")
    h=f'''<!DOCTYPE html>
<meta charset="utf-8">
<body>
<script src="https://d3js.org/d3.v5.min.js"></script>
<script src="https://unpkg.com/@hpcc-js/wasm@0.3.11/dist/index.min.js"></script>
<script src="https://unpkg.com/d3-graphviz@3.0.5/build/d3-graphviz.js"></script>
<button onClick="dotIndex=0;console.log(graphviz); graphviz.resetZoom();render()">less</button>
<button onClick="render()">more</button>
<div id="graph" style="text-align: center; width:100%; height:604px; border: 1px solid #dddddd;"></div>
<script>
var dots = {dots_js};
var engine = {engine_js};
var dotIndex = dots.length > 1 ? 1 : 0;
var d0 = dots[0];
{jscript}
</script>
'''
    return h
  #with open("/dbfs/FileStore/ramayer/tmp/1.html",'w') as f: f.write(h)

#nodes,edges = query_links('alvin lovett')
#links = query_links('anthony reed')
#links = query_by_sql("select q,u from tmp_flattened_interesting_search_terms where q='anthony reed'")
#dot = to_dot(links)
#print(dot)



#l0 = query_by_sql('''select q,u from tmp_flattened_interesting_search_terms where q = 'alvin lovett' limit 3''')
#l1 = query_by_sql('''select q,u from tmp_flattened_interesting_search_terms where q = 'alvin lovett' limit 5''')
#l2 = query_by_sql('''select q,u from tmp_flattened_interesting_search_terms where q = 'alvin lovett' limit 99''')
#displayHTML(show_linkchart2([l0,l1,l2],engine='neato'))


In [19]:
import math
import graphviz
import itertools
def make_dot(n):
  """Generate a pseudo-random demo graph with ``n`` nodes as DOT source.

  Node ``i`` (for ``i`` in ``0..n-1``) gets exactly one edge linking it to a
  node drawn uniformly from ``0..int(i/10)``. Each edge is emitted undirected
  (``dir="none"``) with a random orientation and random ``len``/``minlen``
  hints bounded by the degree of its less-connected endpoint.

  The module-level ``random`` generator is re-seeded with 0 on every call,
  so the structure is reproducible for a given ``n``. Node colours come from
  ``get_color_for_node``.

  Args:
      n: number of nodes to generate.

  Returns:
      The DOT source of the graph as a string.
  """
  node_ids = list(range(n))
  random.seed(0)
  # The partner bound depends on the node's own index (not on the total n),
  # so low-numbered nodes end up as hubs.
  edges = [(random.randint(0, int(i/10)), i) for i in range(n)]
  d = graphviz.Digraph(filename='rank_same.gv')
  d.attr(rankdir='LR', size='8,5',splines="spline",ranksep="0.25",overlap="prism",nodesep="0.25")
  #	graph [ranksep=0.25, overlap=prism, nodesep=0.25, splines=true];
  #  node [style="filled"; dir="none"];

  # Per-node edge counts: edges_out by first endpoint, edges_in by second.
  edges_out = {}
  edges_in = {}
  for src, dst in edges:
    edges_out[src] = edges_out.get(src, 0) + 1
    edges_in[dst] = edges_in.get(dst, 0) + 1

  def degree(node):
    return edges_in.get(node, 0) + edges_out.get(node, 0)

  for node in node_ids:
    d.node(str(node),shape="octagon", style='filled', color=get_color_for_node(node))

  for src, dst in sorted(edges):
    # Every edge adds 1 to both endpoints' degree, so busyness >= 1 and the
    # randint(1, busyness) calls below always have a valid range.
    busyness = min(degree(src), degree(dst))
    # Draw order matters for reproducibility: orientation, then len, then minlen.
    if random.random() > 0.5:
      tail, head = src, dst
    else:
      tail, head = dst, src
    d.edge(str(tail), str(head),dir="none",len=str(random.randint(1,busyness)),minlen=str(random.randint(1,busyness)))

  #d = d.unflatten(5,4,5)
  return str(d)

# Four demo graphs of growing size: 5, 10, 15 and 20 nodes.
dots = [make_dot(5 * step) for step in range(1, 5)]


In [20]:
# this fails
# HTML(src_html)
# NOTE(review): presumably the page's <script> tags do not run when injected
# straight into the notebook output; confirm. The page is therefore shipped
# as a base64 data: URL and shown in an iframe.

from IPython.display import HTML, IFrame
import base64

src_html = show_linkchart(dots,engine='dot')
b64 = base64.b64encode(src_html.encode('utf-8'))
src = f"data:text/html;base64,{b64.decode('utf-8')}"
# IFrame emits a correctly closed <iframe> element; the previous hand-written
# markup closed the iframe with a stray </a>.
IFrame(src, width="100%", height=600)


