<h2>Reworked superstars to match the format of Undervalued</h2>

In [0]:
# Column name -> position of that column in a row of the loaded table.
ATTRIBUTES = {
  'game_ID' : 1,
  'date' : 2,
  'player_ID' : 3,
  'player' : 4,
  'team' : 5,
  'position' : 9,
  'salary' : 11,
  'points' : 13
}

# Column positions to keep, in ascending order. Sorting makes the order explicit
# instead of relying on dict iteration order; the projected rows are later
# addressed as positions 0..7 in this same order.
INDICES = sorted(ATTRIBUTES.values())

# Cut-off date; compared against the 'date' column as a string.
CURRENT_DATE = '2019-10-05'

# Count per position code -- presumably the number of roster slots per position;
# not used in the cells visible here (TODO confirm).
COMPOSITION = {
  'C' : 2,
  'G' : 2,
  'D' : 2,
  'W' : 3
}

# Presumably the total salary cap for a team; not used in the cells visible here (TODO confirm).
BUDGET = 50000

# Starts empty; not filled in by the cells visible here.
TEAM = []

In [0]:
# Source table: path and format
file_location = "/FileStore/tables/DataTable.csv"
file_type = "csv"

# Reader settings: infer column types, treat the first row as the header,
# split fields on commas
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Load the table into a Spark DataFrame
df = (
  spark.read.format(file_type)
  .option("inferSchema", infer_schema)
  .option("header", first_row_is_header)
  .option("sep", delimiter)
  .load(file_location)
)

#display(df)

In [0]:
# Turn every DataFrame row into a plain Python list and keep the result cached,
# since all later cells derive from this RDD.
RDD = df.rdd.map(lambda record: list(record))
RDD.cache()

In [0]:
# Project every row onto the columns listed in INDICES (in that order)
RDD_filtered_by_column = RDD.map(lambda row: [row[index] for index in INDICES])

# The projected rows have only eight columns, so the name -> position mapping
# is rebound to the new positions 0..7.
ATTRIBUTES = dict(
  game_ID=0,
  date=1,
  player_ID=2,
  player=3,
  team=4,
  position=5,
  salary=6,
  points=7,
)

In [0]:
# Keep only rows whose 'position', 'points' and 'salary' values are all
# different from the "#N/A" placeholder.
RDD_cleaned = RDD_filtered_by_column.filter(
  lambda row: all(row[ATTRIBUTES[field]] != "#N/A"
                  for field in ('position', 'points', 'salary'))
)
#RDD_cleaned.collect()

In [0]:
# Key each row by its 'player_ID' ...
RDD_paired_by_player = RDD_cleaned.map(lambda row: [row[ATTRIBUTES['player_ID']], row])

# ... then gather every player's rows into one [player_ID, rows] list
rows_per_player = RDD_paired_by_player.groupByKey().mapValues(list)
RDD_grouped_by_player = rows_per_player.map(list)
#RDD_grouped_by_player.collect()

In [0]:
def sort_player_rows_by_date(entry):
  """Returns [key, rows] with the rows ordered by ascending 'date' value."""
  player_rows = list(entry[1])
  player_rows.sort(key = lambda row: row[ATTRIBUTES['date']])
  return [entry[0], player_rows]

# order every player's rows by 'date'
RDD_sorted_by_date = RDD_grouped_by_player.map(sort_player_rows_by_date)
#RDD_sorted_by_date.collect()

In [0]:
def compute_averagePoints(entry, attributes=None):
  """
    Adds a running average of points to each game row of a single player.

    Parameters:

      - entry: a pair [ID, rows]; every row is a list indexed according to
        `attributes`
      - attributes: optional column-name -> index mapping providing 'points'
        and 'salary'; defaults to the module-level ATTRIBUTES

    Returns [ID, new_rows]. Each new row is a copy of the input row in which:

      - the 'salary' value has been converted to int
      - the average of 'points' over this row and all rows before it has been
        appended as the last element

    The input rows are not modified, so re-evaluating the same input always
    gives the same result.

    ** Assumes that entries for each player are sorted by date.

  """
  columns = ATTRIBUTES if attributes is None else attributes
  points_index = columns['points']
  salary_index = columns['salary']

  ID, rows = tuple(entry)
  total_points = 0.0
  averaged_rows = []

  for count, row in enumerate(rows, start=1):
    total_points += float(row[points_index])
    # work on a copy so the caller's row keeps its original length and values
    new_row = list(row)
    new_row[salary_index] = int(row[salary_index])
    new_row.append(total_points / count)
    averaged_rows.append(new_row)

  return [ID, averaged_rows]

# attach the running average to every player's rows
RDD_average_points = RDD_sorted_by_date.map(compute_averagePoints)

# the running average is appended after the eight existing columns, i.e. at position 8
ATTRIBUTES.update(average_points=8)
#RDD_average_points.collect()


In [0]:
def filter_by_current_date(entry, current_date=None, attributes=None):
  """
    Picks, for one player, the last row dated strictly before the cut-off date.

    Parameters:

      - entry: a pair [ID, rows]
      - current_date: optional cut-off; defaults to the module-level CURRENT_DATE
      - attributes: optional column-name -> index mapping providing 'date';
        defaults to the module-level ATTRIBUTES

    Returns [ID, row] where row is the last row (in the given order) whose
    'date' is smaller than the cut-off, or [ID, []] when there is no such row.

    ** "Last" is by position in `rows`, so the rows should already be sorted by date.
    ** Dates are compared with '<' as they are; this presumably relies on the
       column holding 'YYYY-MM-DD' strings -- TODO confirm.

  """
  cutoff = CURRENT_DATE if current_date is None else current_date
  date_index = (ATTRIBUTES if attributes is None else attributes)['date']

  ID, rows = tuple(entry)

  # remember only the most recent match instead of collecting all of them
  latest_row = []
  for row in rows:
    if row[date_index] < cutoff:
      latest_row = row

  return [ID, latest_row]
  
# For each player keep the latest row dated before CURRENT_DATE,
# then drop the players that have no such row.
latest_rows_before_cutoff = RDD_average_points.map(filter_by_current_date)
RDD_filtered_by_date_super_stars = latest_rows_before_cutoff.filter(lambda pair: pair[1] != [])
#print(RDD_filtered_by_date_super_stars.collect())

<h2>Order by average points</h2>

In [0]:
def find_super_stars(entry):
  """
    Keeps the leading tenth of a group's rows.

    Takes a pair [ID, rows] and returns [ID, first_rows], where first_rows
    holds the first len(rows) // 10 elements of rows in their given order.
    Groups with fewer than ten rows therefore yield an empty list.

  """
  ID, rows = tuple(entry)
  cutoff = len(rows) // 10
  return [ID, list(rows[:cutoff])]

    

def rank_by_average_points(group):
  """Returns [position, players] with the players ordered by descending running average."""
  ranked_players = list(group[1])
  ranked_players.sort(key = lambda player: -player[1][ATTRIBUTES['average_points']])
  return [group[0], ranked_players]

# key every [ID, row] pair by the row's 'position' and gather the players of each position
RDD_paired_by_position_super_stars = RDD_filtered_by_date_super_stars.map(
  lambda player: [player[1][ATTRIBUTES['position']], player])
players_per_position = RDD_paired_by_position_super_stars.groupByKey().mapValues(list)
RDD_grouped_by_position_super_stars = players_per_position.map(list)

# best running average first within each position
RDD_sorted_by_average_points_scored = RDD_grouped_by_position_super_stars.map(rank_by_average_points)

# leading tenth of each position
RDD_super_stars = RDD_sorted_by_average_points_scored.map(find_super_stars)
RDD_super_stars.collect()

In [0]:
def find_available_players(entry, current_date=None, attributes=None):
  """
    Tells whether a player has at least one row dated exactly on the given date.

    Parameters:

      - entry: a pair [ID, rows]
      - current_date: optional date to look for; defaults to the module-level
        CURRENT_DATE
      - attributes: optional column-name -> index mapping providing 'date';
        defaults to the module-level ATTRIBUTES

    Returns True if any row's 'date' equals the date, False otherwise
    (including when there are no rows).

  """
  target_date = CURRENT_DATE if current_date is None else current_date
  date_index = (ATTRIBUTES if attributes is None else attributes)['date']

  ID, rows = tuple(entry)

  return any(row[date_index] == target_date for row in rows)
  
# IDs of the players that have a row dated exactly CURRENT_DATE
players_on_current_date = RDD_grouped_by_player.filter(find_available_players)
RDD_available_superstars = players_on_current_date.map(lambda player_entry: player_entry[0])

In [0]:
# Bring the available player IDs to the driver as a plain set. An RDD cannot be
# used inside another RDD's transformation, so the predicate below tests
# membership in this set instead of querying RDD_available_superstars.
AVAILABLE_PLAYER_IDS = frozenset(RDD_available_superstars.collect())

def filter_by_availability(entry, available_ids=None):
  """
    Tells whether a position group contains at least one available player.

    Parameters:

      - entry: a pair [position, players]; every player is a pair [ID, row]
      - available_ids: optional collection of available player IDs; defaults
        to the module-level AVAILABLE_PLAYER_IDS

    Returns True as soon as one player's ID is found among the available IDs,
    False otherwise.

    NOTE(review): this keeps or drops a whole position group; it does not
    remove individual unavailable players from the group. Confirm whether
    per-player filtering is wanted.

  """
  position, players = tuple(entry)
  ids = AVAILABLE_PLAYER_IDS if available_ids is None else available_ids

  for player in players:
    ID, _ = tuple(player)
    if ID in ids:
      return True

  return False

# apply the predicate defined above (the previous predicate let every entry through)
RDD_super_stars_filtered_by_availability = RDD_sorted_by_average_points_scored.filter(filter_by_availability)
#RDD_super_stars_filtered_by_availability.collect()

<h2>Find the available superstars</h2>

In [0]:
# NOTE(review): this repeats the find_super_stars definition from an earlier
# cell and shadows it; the two bodies compute the same result.
def find_super_stars(entry):
  """
    Returns [ID, top_rows] for a pair [ID, rows], where top_rows are the first
    int(len(rows) / 10) rows in their given order (empty for fewer than ten rows).
  """
  ID, rows = tuple(entry)
  nStars = int(len(rows) / 10)
  return [ID, [rows[i] for i in range(nStars)]]

# Leading tenth of each availability-filtered position group.
# NOTE(review): rebinds RDD_super_stars, which an earlier cell already assigned.
RDD_super_stars = RDD_super_stars_filtered_by_availability.map(
  find_super_stars
)
RDD_super_stars.collect()