In [1]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

In [105]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext(appName="YourTest", master="local[*]")

ValueError: ignored

In [3]:
import requests
import re
import pandas as pd
from lxml import html

In [59]:
def grab_team_key(team_url):
    "Returns team key from the bball reference team url"
    return team_url[-4:-1]
    
def grab_player_key(player_url):
    "Returns player key from the bball reference player url"
    return re.search('(?<=/)[^/]+(?=.html)', player_url).group(0)  

In [18]:
url = "https://www.basketball-reference.com/teams/CHA/"
url[-4:-1]

'CHA'

In [13]:
grab_team_key("https://www.basketball-reference.com/teams/CHA/")

'www.basketball-reference.com'

In [54]:
def create_team_yearly_list(team_url):
    
    # Read html, find appropriate table
    page = requests.get(team_url)
    tree = html.fromstring(page.content)
    team_key = grab_team_key(team_url)
    team_table = tree.xpath('//*[@id="{}"]/*/tr'.format(team_key))

    # each element of per_game_cols will contain the name of the column, and the list of values in said column
    yearly_urls = []

    base_url = "https://www.basketball-reference.com"
    for row_index in range(1, len(team_table)):
        row_element = team_table[row_index]

        #Iterate through each element of the row
        column_index = 0

        element = row_element[0]
        links = element.iterlinks()
        for link_tuple in links:
          yearly_urls.append(base_url + link_tuple[2])

        #yearly_urls.append(data)

    return yearly_urls


In [55]:
create_team_yearly_list("https://www.basketball-reference.com/teams/CHA/")

['https://www.basketball-reference.com/teams/CHO/2021.html',
 'https://www.basketball-reference.com/teams/CHO/2020.html',
 'https://www.basketball-reference.com/teams/CHO/2019.html',
 'https://www.basketball-reference.com/teams/CHO/2018.html',
 'https://www.basketball-reference.com/teams/CHO/2017.html',
 'https://www.basketball-reference.com/teams/CHO/2016.html',
 'https://www.basketball-reference.com/teams/CHO/2015.html',
 'https://www.basketball-reference.com/teams/CHA/2014.html',
 'https://www.basketball-reference.com/teams/CHA/2013.html',
 'https://www.basketball-reference.com/teams/CHA/2012.html',
 'https://www.basketball-reference.com/teams/CHA/2011.html',
 'https://www.basketball-reference.com/teams/CHA/2010.html',
 'https://www.basketball-reference.com/teams/CHA/2009.html',
 'https://www.basketball-reference.com/teams/CHA/2008.html',
 'https://www.basketball-reference.com/teams/CHA/2007.html',
 'https://www.basketball-reference.com/teams/CHA/2006.html',
 'https://www.basketball

In [57]:
team_yearly_urls = []
with open("team_urls.txt", 'r') as f:
  filelines = f.readlines()
  team_urls = []
  for url in filelines:
    url = url.strip()
    team_urls.append(url)

for url in team_urls:
  yearly_urls = create_team_yearly_list(url)
  team_yearly_urls = team_yearly_urls + yearly_urls


In [58]:
with open("team_yearly_urls.txt", 'w') as f:
  for url in team_yearly_urls:
    f.write("%s\n" % url)
    

In [66]:
def get_roster_list(team_year_url):
    
    # Read html, find appropriate table
    page = requests.get(team_year_url)
    tree = html.fromstring(page.content)
    team_key = grab_team_key(team_url)
    roster_table = tree.xpath('//*[@id="roster"]/*/tr')

    # each element of per_game_cols will contain the name of the column, and the list of values in said column
    players = []

    for row_index in range(1, len(roster_table)):
        row_element = roster_table[row_index]

        #Iterate through each element of the row
        column_index = 0

        name_element = row_element[1]
        links = name_element.iterlinks()
        for link_tuple in links:
          player_url = link_tuple[2]

        player_key = grab_player_key(player_url)
        player_name = name_element.text_content()

        players.append( (player_name, player_key) )


    return players




In [67]:
get_roster_list("https://www.basketball-reference.com/teams/DEN/2012.html")

[('Arron Afflalo', 'afflaar01'),
 ('Chris Andersen', 'anderch01'),
 ('Corey Brewer', 'breweco01'),
 ('DeMarre Carroll', 'carrode01'),
 ('Wilson Chandler', 'chandwi01'),
 ('Kenneth Faried', 'farieke01'),
 ('Rudy Fernández', 'fernaru01'),
 ('Danilo Gallinari', 'gallida01'),
 ('Jordan Hamilton', 'hamiljo02'),
 ('Al Harrington', 'harrial01'),
 ('Nenê Hilário', 'hilarne01'),
 ('Kosta Koufos', 'koufoko01'),
 ('Ty Lawson', 'lawsoty01'),
 ('JaVale McGee', 'mcgeeja01'),
 ('Andre Miller', 'millean02'),
 ('Timofey Mozgov', 'mozgoti01'),
 ('Julyan Stone', 'stoneju01')]

In [76]:
team_yearly_rdd = sc.textFile("team_yearly_urls.txt").map(lambda x: x.strip())
team_yearly_rdd.take(1)
team_roster_rdd = team_yearly_rdd.map(lambda x: (x, get_roster_list(x)))
team_roster_rdd.take(1)
team_roster_rdd.saveAsTextFile("roster_lists.txt")

In [96]:
import ast
roster_lists =  sc.textFile('/content/drive/MyDrive/CS631-Project/roster_lists/part-00*')
roster_lists = roster_lists.map(lambda x:  ast.literal_eval(x))

[('Bill Bridges', 'bridgbi01'),
 ('Mel Counts', 'countme01'),
 ('Gail Goodrich', 'goodrga01'),
 ('Travis Grant', 'granttr01'),
 ('Happy Hairston', 'hairsha01'),
 ('Connie Hawkins', 'hawkico01'),
 ('Nate Hawthorne', 'hawthna01'),
 ('Stan Love', 'lovest01'),
 ('Jim Price', 'priceji01'),
 ('Pat Riley', 'rileypa01'),
 ('Elmore Smith', 'smithel01'),
 ('Kermit Washington', 'washike01'),
 ('Jerry West', 'westje01')]

In [None]:


def create_roster_edges(roster):
  edges = []
  for player_tuple in roster[1]:
    player_key = player_tuple[1]
    for other_player_tuple in roster[1]:
      other_player_key = other_player_tuple[1]
      if other_player_key != player_key:
        edges.append( (player_key, set([other_player_key])) )

  return edges


roster_graph = roster_lists.flatMap(lambda x: create_roster_edges(x))
roster_graph = roster_graph.reduceByKey(lambda x, y: x.union(y))
roster_graph.take(1)

In [101]:
roster_graph.saveAsTextFile("/content/drive/MyDrive/CS631-Project/roster_graph")

In [79]:
roster_lists.take(1)

["('https://www.basketball-reference.com/teams/LAL/1974.html', [('Bill Bridges', 'bridgbi01'), ('Mel Counts', 'countme01'), ('Gail Goodrich', 'goodrga01'), ('Travis Grant', 'granttr01'), ('Happy Hairston', 'hairsha01'), ('Connie Hawkins', 'hawkico01'), ('Nate Hawthorne', 'hawthna01'), ('Stan Love', 'lovest01'), ('Jim Price', 'priceji01'), ('Pat Riley', 'rileypa01'), ('Elmore Smith', 'smithel01'), ('Kermit Washington', 'washike01'), ('Jerry West', 'westje01')])"]

**Making the Graph**

In [115]:
roster_graph = sc.textFile("/content/drive/MyDrive/CS631-Project/roster_graph/part-00*")
roster_graph = roster_graph.map(lambda x: ast.literal_eval(x))
roster_graph = roster_graph.map(lambda x: (x[0], list(x[1])))

In [116]:
roster_graph.take(1)[0][1][1]

'lehmage01'

['bridgbi01',
 'goodrga01',
 'hairsha01',
 'priceji01',
 'rileypa01',
 'smithel01',
 'washike01',
 'westje01',
 'ellisle01',
 'mcmilji01']

In [174]:
def shortest_path(node, adj_list):
  vertex_rdd = adj_list.map(lambda x: x[0])
  vertex_list = vertex_rdd.collect()
  num_vertices = len(vertex_list)

  def initialize_tuple(node_tuple, main_node):
    if node_tuple[0] == main_node:
      return (node_tuple[0], [node_tuple[1], 0, set()] )
    else:
      return (node_tuple[0], [node_tuple[1], None, set()] )

  formatted_graph = adj_list.map(lambda x: initialize_tuple(x, node) )
  # formatted [node_id, [adj_list, distance_to_node, vertices_visited]]
  # vertices_visited will always be an empty set unless node_id is node

  def visiting_mapper(node_tuple, main_node):
    # in the mapper, will send tuples formatted [node_id, [adj_list, distance_to_node, vertices_visited]]
    # adj_list will only be non-empty in one tuple
    # vertices_visited will only be non-empty in the node_id
    if node_tuple[1][1] is None:
      return [node_tuple]

    new_node_tuples = []
    nodes_visited = set()
    for neighbour in node_tuple[1][0]:
      distance = node_tuple[1][1] + 1
      adj_list = []

      new_node_tuples.append( (neighbour, [adj_list, distance, set()]) )
      new_node_tuples.append( (main_node, [adj_list, None, set([neighbour])]) )

    new_node_tuples.append(node_tuple)

    return new_node_tuples

  def distance_reducer(node_tuple1, node_tuple2):
      if node_tuple1[1] is None:
        distance = node_tuple2[1]
      elif node_tuple2[1] is None:
        distance = node_tuple1[1]
      else:
        distance = min(node_tuple1[1], node_tuple2[1])

      adj_list = node_tuple1[0] + node_tuple2[0]
      vertices_visited = node_tuple1[2].union(node_tuple2[2])

      return [adj_list, distance, vertices_visited]

  last_num_visited = 0
  iter = 0
  while True:    
    formatted_graph_mapped = formatted_graph.flatMap(lambda x: visiting_mapper(x, node))
    formatted_graph_reduced = formatted_graph_mapped.reduceByKey(lambda x,y: distance_reducer(x, y))

    main_node_tuple = formatted_graph_reduced.filter(lambda x: x[0] == node)
    total_nodes_visited = len(main_node_tuple.collect()[0][1][2])
    if total_nodes_visited == num_vertices or total_nodes_visited == last_num_visited:
      print("Num Visited: {}".format(total_nodes_visited))
      iter += 1
      break
    else:
      formatted_graph = formatted_graph_reduced
      last_num_visited = total_nodes_visited
      
      iter += 1
      

  shortest_paths = formatted_graph_reduced.map(lambda x: (x[0], x[1][1]))
  print("Iterations: {}".format(iter))
  return shortest_paths


In [194]:
import time
tic = time.time()
shaq = shortest_path('onealsh01', roster_graph)
toc = time.time()

print(toc-tic)

6
109.91211318969727


In [197]:
a.filter(lambda x: x[0] == "curryst01").collect()

[('curryst01', 2)]

In [199]:
roster_graph_list = roster_graph.collect()

In [208]:
id = 0
player_node_lookup = {}
for adj_list_row in roster_graph_list:
  player_id = adj_list_row[0]
  player_node_lookup[player_id] = player_id
  id += 1

import networkx as nx
G = nx.Graph()
for adj_list_row in roster_graph_list:
  player_id = adj_list_row[0]
  node_id = player_node_lookup[player_id]
  G.add_nodes_from([
    (node_id, {'id': player_id}),
])

for adj_list_row in roster_graph_list:
  player_id = adj_list_row[0]
  node_id = player_node_lookup[player_id]
  for neighbour in adj_list_row[1]:
    neighbour_node = player_node_lookup[neighbour]
    G.add_edge(node_id, neighbour_node)


In [210]:
nx.write_gexf(G, path = "played_with_graph.nx")