In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name without any ``@domain`` suffix.

    The login name comes from ``getpass.getuser()``; everything from the
    first ``@`` to the end of the line is removed.
    """

    login = getpass.getuser()
    return re.sub('@.*', '', login)


def dict_to_html(d):
    """Render a dictionary as a two-column HTML table (key, value).

    Args:
        d (dict): mapping to render; one table row is produced per item, in
            the dictionary's iteration order. Keys and values are inserted
            into the markup as-is (no HTML escaping is applied).

    Returns:
        str: the table markup as a single string.
    """

    rows = ''.join(
        f'<tr><td style="text-align:left;">{key}</td><td>{value}</td></tr>'
        for key, value in d.items()
    )

    return '<table width="100%" style="width:100%; font-family: monospace;">' + rows + '</table>'


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe using the pandas notebook rendering.

    The dataframe is limited to ``n`` rows, converted to a pandas dataframe
    and passed to ``display``.

    Args:
        df: spark dataframe to show
        n (int): number of rows to show (default: 20)
    """

    first_rows = df.limit(n)
    as_pandas = first_rows.toPandas()
    display(as_pandas)

    
def display_spark():
    """Display an HTML status panel for the notebook's Spark session.

    The session counts as active when both ``spark`` and ``sc`` exist as
    module globals. In that case the panel shows links to the Spark UIs, the
    full Spark configuration as a table, and usage notes. Otherwise it shows
    a "stopped" notice with a link to the Spark UI.
    """

    spark_ui_link = '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'
    session_is_active = 'spark' in globals() and 'sc' in globals()

    if not session_is_active:
        # Same application name that start_spark() registers with the master.
        app = username() + " (jupyter)"
        parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{app}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            spark_ui_link,
            '</ul>',
        ]
        display(HTML(''.join(parts)))
        return

    conf = sc.getConf()
    name = conf.get("spark.app.name")

    parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        spark_ui_link,
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(conf.getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(parts)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The total core count is ``executor_instances * executor_cores`` and the
    number of shuffle partitions is set to four per core. The application is
    named ``"<username> (jupyter)"`` and its status is displayed afterwards.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes (default: 1)
        master_memory (float): driver memory in gigabytes (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4
    # Pick a UI port in 4001-4999 at random -- presumably so that sessions started
    # by different users on the same host are unlikely to ask for the same port.
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        # Spark rejects fractional size strings such as "1.5g", so the documented
        # float inputs are formatted by _memory_string() instead of f"{value}g".
        .config("spark.executor.memory", _memory_string(worker_memory))
        .config("spark.driver.memory", _memory_string(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()


def _memory_string(gigabytes):
    """Format a memory size given in gigabytes as a Spark size string.

    Whole amounts keep the ``"<n>g"`` form (``4`` and ``4.0`` both give
    ``"4g"``). Fractional amounts are converted to whole megabytes
    (``1.5`` gives ``"1536m"``) because Spark does not accept fractional
    values in size strings.
    """

    if float(gigabytes).is_integer():
        return f"{int(float(gigabytes))}g"
    return f"{int(float(gigabytes) * 1024)}m"

    
def stop_spark():
    """Stop the active Spark session, remove its globals, and display the resulting status.

    When both ``spark`` and ``sc`` exist as module globals the session is
    stopped and both names are deleted. The status is displayed in either case.
    """

    global spark
    global sc

    session_is_active = 'spark' in globals() and 'sc' in globals()
    if session_is_active:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

# The rules are collected as a list of strings and joined into one <style> element,
# which is displayed so that it is added to the notebook page.
html = [
    '<style>',
    # Preformatted text keeps its line layout instead of wrapping.
    'pre { white-space: pre !important; }',
    # Cells of tables with class "dataframe" stay on a single line.
    'table.dataframe td { white-space: nowrap !important; }',
    # Hide the first header cell and the row header cells of "dataframe" tables.
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))


# Print function docstrings

# help() prints each function's signature and docstring to the cell output,
# so the usage of the helpers defined above is visible without reading the code.
help(start_spark)
help(stop_spark)
help(display_spark)
help(show_as_html)

Help on function start_spark in module __main__:

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)
    Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)

Help on function stop_spark in module __main__:

stop_spark()
    Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).

Help on function display_spark in module __main__:

display_spark()
    Display the status of the active Spark session if one is currently running.

Help on function show_as_html in module __main__:

show_as_html(df, n=20)
    Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n 

In [2]:
# Run this cell to start a spark session in this notebook

# Requests 4 executors with 4 cores each (16 cores in total) and a memory setting
# of 4 for both the executors and the driver. start_spark() sets the number of
# shuffle partitions to 4 per core, i.e. 64 here.
start_spark(executor_instances=4, executor_cores=4, worker_memory=4, master_memory=4)

0,1
spark.sql.shuffle.partitions,64
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.sql.warehouse.dir,file:/users/home/mda205/DATA420%20Assignment%202/spark-warehouse
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.app.id,app-20230608192650-0487
spark.cores.max,16
spark.driver.port,46185


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StandardScaler

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

from pyspark.ml.evaluation import RankingEvaluator
from pyspark.ml.recommendation import ALS

import json
import numpy as np

from pretty import SparkPretty  # download pretty.py from LEARN and put it in your M:\ or home directory
pretty = SparkPretty(limit=5)

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib notebook

# Data Processing

## Q1

This isn't an answer for Q1, just a note to you: I know you commented on my style and readability in the last assignment, and because of that I am taking COSC480 next semester (all my knowledge of actually writing code is just duct-taped together). The good news is that the majority of this assignment is copied and pasted from your code, so much less of it is in my bad style :D

### Q1b

In [4]:
# Determine ideal number of partitions

# Read the executor layout back from the running Spark configuration and
# allow four partitions per core (N executors x M cores per executor).
conf = sc.getConf()

N, M = (
    int(conf.get(key))
    for key in ("spark.executor.instances", "spark.executor.cores")
)
partitions = 4 * N * M

print(f'ideal # partitions = {partitions}')

ideal # partitions = 64


### Q1 c

In [5]:
def get_Datasets_counts():
    """Print the column, row and distinct track counts of every audio feature dataset.

    Reads the module-level names ``datasets`` (dataset names) and ``lookup``
    (attribute type name -> Spark type); both must be defined before this
    function is called. For each dataset the attribute file supplies the
    column names and types used as the schema of the feature file, and the
    last column is renamed to ``track_id``.
    """

    # Every attribute file is read with the same two-column layout, so build it once.
    attribute_schema = StructType([
        StructField("name", StringType()),
        StructField("type", StringType()),
    ])

    for dataset in datasets:

        attributes = spark.read.csv(
            f'/data/msd/audio/attributes/{dataset}.attributes.csv', schema=attribute_schema)

        # One nullable field per (column name, type name) row of the attribute file.
        fields = []
        for column, typename in attributes.collect():
            fields.append(StructField(column, lookup[typename], True))
        feature_schema = StructType(fields)

        last_column = feature_schema[-1].name
        features = spark.read.csv(
            f'/data/msd/audio/features/{dataset}.csv', schema=feature_schema, quote="'")
        features = features.withColumnRenamed(last_column, "track_id")

        print(dataset+" has:")
        print(f"{len(features.columns)} columns")
        print(f"{features.count()} rows")
        unique_songs = features.select(F.countDistinct("track_id").alias("total unique songs"))
        unique_songs.show()

In [6]:
# get_Datasets_counts()

## Q2 (All you)

In [7]:
# !hdfs dfs -ls -h hdfs:///data/msd/genre/

Found 3 items
-rw-r--r--   8 jsw93 supergroup     11.1 M 2021-09-29 10:35 hdfs:///data/msd/genre/msd-MAGD-genreAssignment.tsv
-rw-r--r--   8 jsw93 supergroup      8.4 M 2021-09-29 10:35 hdfs:///data/msd/genre/msd-MASD-styleAssignment.tsv
-rw-r--r--   8 jsw93 supergroup     10.6 M 2021-09-29 10:35 hdfs:///data/msd/genre/msd-topMAGD-genreAssignment.tsv


In [8]:
# !hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt | head

9d8
< ERROR: <SOFQHZM12A8C142342 TRMWMFG128F92FFEF2> Josipa Lisac  -  razloga  !=  Lisac Josipa  -  1000 razloga
19d17
< ERROR: <SODXUTF12AB018A3DA TRMWPCD12903CCE5ED> Lutan Fyah  -  Nuh Matter the Crisis Feat. Midnite  !=  Midnite  -  Nah Matter the Crisis
29d26
< ERROR: <SOASCRF12A8C1372E6 TRMHIPJ128F426A2E2> Gaetano Donizetti  -  L'Elisir d'Amore: Act Two: Come sen va contento!  !=  Gianandrea Gavazzeni_ Orchestra E Coro Del Maggio Musicale Fiorentino_ Carlo Bergonzi_ Renata Scotto  -  L'Elisir D'Amore_ Act 2: Come Sen Va Contento (Adina) (Donizetti)
33d29
< ERROR: <SOITDUN12A58A7AACA TRMHXGK128F42446AB> C.J. Chenier  -  Ay, Ai Ai  !=  Clifton Chenier  -  Ay_ Ai Ai
52d47
< ERROR: <SOLZXUM12AB018BE39 TRMRSOF12903CCF516> 許志安  -  男人最痛  !=  Andy Hui  -  Nan Ren Zui Tong
cat: Unable to write to output stream.


In [9]:
# !hdfs dfs -cat /data/msd/tasteprofile/mismatches/sid_mismatches.txt | head

ERROR: <SOUMNSI12AB0182807 TRMMGKQ128F9325E10> Digital Underground  -  The Way We Swing  !=  Linkwood  -  Whats up with the Underground
ERROR: <SOCMRBE12AB018C546 TRMMREB12903CEB1B1> Jimmy Reed  -  The Sun Is Shining (Digitally Remastered)  !=  Slim Harpo  -  I Got Love If You Want It
ERROR: <SOLPHZY12AC468ABA8 TRMMBOC12903CEB46E> Africa HiTech  -  Footstep  !=  Marcus Worgull  -  Drumstern (BONUS TRACK)
ERROR: <SONGHTM12A8C1374EF TRMMITP128F425D8D0> Death in Vegas  -  Anita Berber  !=  Valen Hsu  -  Shi Yi
ERROR: <SONGXCA12A8C13E82E TRMMAYZ128F429ECE6> Grupo Exterminador  -  El Triunfador  !=  I Ribelli  -  Lei M'Ama
ERROR: <SOMBCRC12A67ADA435 TRMMNVU128EF343EED> Fading Friend  -  Get us out!  !=  Masterboy  -  Feel The Heat 2000
ERROR: <SOTDWDK12A8C13617B TRMMNCZ128F426FF0E> Daevid Allen  -  Past Lives  !=  Bhimsen Joshi  -  Raga - Shuddha Sarang_ Aalap
ERROR: <SOEBURP12AB018C2FB TRMMPBS12903CE90E1> Cristian Paduraru  -  Born Again  !=  Yespiring  -  Journey Stages
ERROR: <SO

In [10]:
# Both mismatch files describe (song, track) pairs, so they share one schema.
mismatches_schema = StructType([
    StructField("song_id", StringType(), True),
    StructField("song_artist", StringType(), True),
    StructField("song_title", StringType(), True),
    StructField("track_id", StringType(), True),
    StructField("track_artist", StringType(), True),
    StructField("track_title", StringType(), True)
])


def parse_mismatch_line(line, prefix):
    """Parse one line of a mismatch file into a tuple matching ``mismatches_schema``.

    Expected layout after ``prefix``:
    ``<SONG_ID TRACK_ID> song artist  -  song title  !=  track artist  -  track title``
    where both ids are 18 characters long.

    Args:
        line (str): raw line from the file, with or without a trailing newline.
        prefix (str): text that marks a record line, e.g. ``"ERROR: "``.

    Returns:
        tuple or None: ``(song_id, song_artist, song_title, track_id,
        track_artist, track_title)``, or ``None`` when the line does not start
        with ``prefix``.

    Raises:
        ValueError: if a record line does not contain exactly one ``"  !=  "``
        separator with exactly one ``"  -  "`` separator on each side.
    """
    if not line.startswith(prefix):
        return None

    start = len(prefix) + 1  # skip the "<" that opens the id pair
    song_id = line[start:start + 18]
    track_id = line[start + 19:start + 37]
    # Strip only the newline, so a final line without one keeps its last character.
    description = line[start + 39:].rstrip("\n")

    song_text, track_text = description.split("  !=  ")
    song_artist, song_title = song_text.split("  -  ")
    track_artist, track_title = track_text.split("  -  ")

    return (song_id, song_artist, song_title, track_id, track_artist, track_title)


def read_mismatch_file(path, prefix):
    """Return the parsed records of every line in ``path`` that starts with ``prefix``."""
    # The files contain non-ASCII artist names and titles; decode them explicitly
    # (assumed to be UTF-8) instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as handle:
        parsed = (parse_mismatch_line(line, prefix) for line in handle)
        return [record for record in parsed if record is not None]


path = "/scratch-network/courses/2023/DATA420-23S1/data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt"
sid_matches_manually_accepted = read_mismatch_file(path, "< ERROR: ")

matches_manually_accepted = spark.createDataFrame(sc.parallelize(sid_matches_manually_accepted, 8), schema=mismatches_schema)
# show_as_html(matches_manually_accepted)

path = "/scratch-network/courses/2023/DATA420-23S1/data/msd/tasteprofile/mismatches/sid_mismatches.txt"
sid_mismatches = read_mismatch_file(path, "ERROR: ")

mismatches = spark.createDataFrame(sc.parallelize(sid_mismatches, 64), schema=mismatches_schema)
# show_as_html(mismatches)

In [11]:
# Load and parse triplets in spark

# One row per (user, song) pair with the number of plays
triplets_schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("song_id", StringType(), True)
    .add("plays", IntegerType(), True)
)

# Headerless, tab-separated input; the result is marked for caching because
# it is reused by later cells.
triplets = (
    spark.read
    .schema(triplets_schema)
    .options(header="false", delimiter="\t", codec="gzip")
    .format("csv")
    .load("hdfs:///data/msd/tasteprofile/triplets.tsv/")
    .cache()
)

In [12]:
# Anti join mismatches to manually accepted, and anti join the remaining mismatches to triplets

# A left_anti join keeps only the rows of the left frame whose song_id has no match
# in the right frame: first drop manually accepted songs from the mismatches, then
# drop the remaining mismatched songs from the triplets.
mismatches_not_accepted = mismatches.join(matches_manually_accepted, on="song_id", how="left_anti")
triplets_not_mismatched = triplets.join(mismatches_not_accepted, on="song_id", how="left_anti")

# Each count() below is a Spark action; the labels are padded so the values line up.
print(f"matches_manually_accepted = {matches_manually_accepted.count()}")
print(f"mismatches                = {mismatches.count()}")
print(f"triplets                  = {triplets.count()}")
print(f"triplets_not_mismatched   = {triplets_not_mismatched.count()}")

matches_manually_accepted = 488
mismatches                = 19094
triplets                  = 48373586
triplets_not_mismatched   = 45795111


### Q2 b

In [13]:
# Dataset names and attribute type mapping

attributes_path = 'hdfs:///data/msd/audio/attributes/'

# List the attributes directory through the Hadoop FileSystem classes reached via
# the SparkContext's JVM gateway (sc._jvm is a private PySpark attribute).
hadoop = sc._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
# NOTE: `conf` and `path` are rebound here; earlier cells used them for the
# SparkConf object and for a local file path string respectively.
conf = hadoop.conf.Configuration()
path = hadoop.fs.Path(attributes_path)
getfile=fs.get(conf).listStatus(path)

datasets=[]
modified_filenames=[]

for f in getfile:
    
    filename = f.getPath().getName()
    # Drop the last 15 characters (the length of ".attributes.csv") to get the dataset name.
    datasets.append(filename[:-15])
    
    # Variant with "-" replaced by "_" and the last 17 characters dropped, i.e. two more
    # than ".attributes.csv" -- presumably to also remove a trailing ".0"; TODO confirm.
    mod_filename = filename.replace("-","_")
    modified_filenames.append(mod_filename[:-17])
    

# Maps the type names read from the attribute files to Spark SQL types;
# every numeric spelling listed here becomes DoubleType.
lookup = {
    'real': DoubleType(),
    'NUMERIC': DoubleType(),
    'float': DoubleType(),
    'string': StringType(),
    'STRING': StringType(),
}

In [14]:
# Choose a dataset name, load attribute names, and define schemas based on attribute names

def SingleDataset(dataname):
    """Load one audio feature dataset and return it with an optional summary printer.

    The attribute file of the dataset supplies the column names and types
    (translated through the module-level ``lookup``) used as the schema of
    the feature file. The last column is renamed to ``track_id``.

    Args:
        dataname: dataset name, i.e. the file name without ``.attributes.csv`` / ``.csv``.

    Returns:
        tuple: ``(features, counts)`` where ``features`` is the spark dataframe
        and ``counts`` is a zero-argument callable that prints the number of
        columns, rows and distinct ``track_id`` values when called.
    """

    dataset = str(dataname)

    attribute_schema = StructType([
        StructField("name", StringType()),
        StructField("type", StringType())])

    attributes = spark.read.csv(f'/data/msd/audio/attributes/{dataset}.attributes.csv', schema=attribute_schema)

    # One nullable field per (column name, type name) row of the attribute file.
    fields = []
    for column, typename in attributes.collect():
        fields.append(StructField(column, lookup[typename], True))
    feature_schema = StructType(fields)

    last_column = feature_schema[-1].name
    features = spark.read.csv(f'/data/msd/audio/features/{dataset}.csv',
                              schema=feature_schema, quote="'")
    features = features.withColumnRenamed(last_column, "track_id")

    def counts():
        # Printing this information is optional: nothing runs until counts() is called.
        print(dataset+" has:")
        print(f"{len(features.columns)} columns")
        print(f"{features.count()} rows")
        unique_songs = features.select(F.countDistinct("track_id").alias("total unique songs"))
        unique_songs.show()

    return features, counts


This whole notebook pretty much just defines functions to use later (maybe), I guess; at least, that is my contribution to it.

In [15]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

# stop_spark() stops the session, deletes the `spark` and `sc` globals and then displays the status.
stop_spark()