In [1]:
import pickle
import pandas as pd
import glob
import configparser
import os
import altair as alt
import numpy as np


In [2]:
# Resolve the data warehouse directory from the Spark configuration file.
# SPARK_CONF_DIR is honoured when set (Spark cluster mode); otherwise the
# local 'conf' directory is used (local mode).
config_options = configparser.ConfigParser()
conf_dir = os.environ.get('SPARK_CONF_DIR') or 'conf'
conf_file = '{}/spark.conf'.format(conf_dir)  # Entries defined in the 'spark-start' shell script

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check it and fail with a clear message here
# rather than with an obscure NoSectionError on the next line.
if not config_options.read(conf_file):
    raise FileNotFoundError(
        "Spark configuration file could not be read: {}".format(conf_file))

dataBaseDirectory = dict(config_options.items("SPARK_APP_CONFIGS")).get('spark.sql.warehouse.dir')

# Without this entry every later path would be built as 'None/...'.
if dataBaseDirectory is None:
    raise KeyError(
        "'spark.sql.warehouse.dir' is not defined under [SPARK_APP_CONFIGS] in {}".format(conf_file))
print(dataBaseDirectory)

/scratch/siads699s23_class_root/siads699s23_class/shared_data/team_16_algorhythms/data/spark_table_warehouse


In [3]:
# Location of the pickled {index: track record} dictionary inside the warehouse
trackIndexDictFile = 'version_20230804_030210/dict_of_index_to_track_uri_and_names_20230804_030210.pkl'
pathtodict = "{}/saved_files/data_representations/{}".format(dataBaseDirectory, trackIndexDictFile)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project's pipeline.
with open(pathtodict, 'rb') as pickle_handle:
    indexTrackDict = pickle.load(pickle_handle)

# One row per dictionary key; the whole record (3 elements per the original
# author's note) lands in a single temporary 'track' column.
indexTrackDictDf = pd.DataFrame.from_dict(indexTrackDict, orient='index', columns=['track'])

# Pull each positional element of the record out into its own named column
for position, column_name in enumerate(['track_uri', 'track_name', 'artist_name']):
    indexTrackDictDf[column_name] = indexTrackDictDf['track'].str[position]

# The packed record is no longer needed once it has been split
indexTrackDictDf = indexTrackDictDf.drop(columns=['track'])

In [4]:
# Show 15 randomly chosen rows of the index -> track lookup table
indexTrackDictDf.sample(n=15)

Unnamed: 0,track_uri,track_name,artist_name
1228452,spotify:track:5C5NpFojn0PdAz0ZqiXGEt,Cotton Tail,Ben Webster
937420,spotify:track:6vJT4fxvmjcXComT2nRRE7,Grab A Body - B-Side,My Morning Jacket
1787407,spotify:track:62syCi5Lpy3ey4uLqU3gMU,Rock Love,Lando Chill
516284,spotify:track:2AQh8PzzaRk9p9ymJReXwu,Did You Wanna Die,Youth Brigade
2114652,spotify:track:1M92atDuShDTaAOEkNXGr4,Como Te Perdi,Grupo Maravilla De Robin Revilla
2122893,spotify:track:5gvEU8LlX3KhXMqKWiLasg,Chinedum,Flavor
1133189,spotify:track:6CVGaRH8P1OQlFrCk0Vrqa,Strange Noises in the Dark,Austin Lounge Lizards
1133177,spotify:track:381N0PSxC20dP6qAse9O5X,Para Que Regrese,Mi Sonora
29218,spotify:track:5OpnvBUT2qqtd2eTy9TMTz,Columbian Exchange,Fxxxxy
598628,spotify:track:1aq0Fs2Qkivhu8dn7TskfE,Viva Louremil,Mestre Acordeon


In [5]:
# CSV of Spotify feature data covering ALL tracks, stored at the root of the warehouse directory
spotifyCsvFile = "{}/{}".format(dataBaseDirectory, 'spotifyapi_tracks_all.csv')
print("Spotify feature data CSV is : {}".format(spotifyCsvFile))

# Load the features and discard every row that has at least one missing value
featureDf = pd.read_csv(spotifyCsvFile).dropna(axis=0, how='any')

Spotify feature data CSV is : /scratch/siads699s23_class_root/siads699s23_class/shared_data/team_16_algorhythms/data/spark_table_warehouse/spotifyapi_tracks_all.csv


In [6]:
# Preview 15 random feature rows, keyed by track URI for readability
featureDf.sample(n=15).set_index(keys='track_uri')

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
track_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
spotify:track:0WLEwoxMXs9IFUgb2Jgevk,0.432,0.0198,7,-30.029,1,0.04,0.958,0.0,0.232,0.196,104.434,23027.0,5
spotify:track:1BWb2tnk5LYKvPb4sifpjD,0.701,0.74,5,-5.689,0,0.125,0.11,0.0,0.0967,0.7,127.888,201562.0,4
spotify:track:79UErtK3HJstQf7XykSmI3,0.764,0.505,1,-12.281,1,0.0829,0.0223,0.000266,0.32,0.0951,161.05,208187.0,4
spotify:track:5jDbwyetPAbzWU5YoTnHQ4,0.361,0.136,0,-19.613,1,0.035,0.979,0.913,0.228,0.277,130.897,913278.0,4
spotify:track:3WbYOiTaVcFfq9YETajLHz,0.62,0.573,6,-8.323,1,0.295,0.484,0.0,0.392,0.558,92.729,446955.0,4
spotify:track:3Kbh07ClPQUAFnf6e1mIum,0.894,0.442,0,-8.527,1,0.373,0.0911,0.0,0.0676,0.657,98.037,218442.0,4
spotify:track:2pCLewVg8ZoOKzLgurnrBQ,0.386,0.957,2,-3.946,1,0.125,0.000166,0.00225,0.0967,0.194,80.949,281560.0,4
spotify:track:6culLN522QzyF4JTwnIBT8,0.748,0.554,1,-10.454,0,0.909,0.936,0.0,0.597,0.827,124.27,69172.0,5
spotify:track:6fWJFI8MXTBhMWr2Q0mayH,0.46,0.588,1,-10.269,0,0.0414,0.279,0.291,0.0791,0.414,101.027,326008.0,4
spotify:track:7pfm2SPFWlx3YhKPpFJjaM,0.672,0.712,11,-7.561,1,0.0484,0.0568,0.000653,0.0564,0.794,171.948,231667.0,4


#### Prerequisite:
Before running the next cell, make sure you have run `4_gather_node_degree_information.py`, either directly from the command line as a Python script or via `batch-job-4-gather-node-degree-info.sh`, which runs the same Python program on a Slurm cluster.

In [7]:
# Collect every parquet part-file in the node-degree bucket directory.
# Sorting makes the read order deterministic (glob order is arbitrary).
node_degree_file = sorted(glob.glob('{}/*.parquet'.format(
    dataBaseDirectory + "/spotify_track_node_degree_buckets")))
print("Number of parquet files for node_degree_file were found to be : {}".format(len(node_degree_file)))

# Fail with an actionable message when the prerequisite job has not produced any output yet.
if not node_degree_file:
    raise FileNotFoundError(
        "No parquet files found under {}/spotify_track_node_degree_buckets; "
        "run 4_gather_node_degree_information.py first.".format(dataBaseDirectory))

# pd.read_parquet documents a single path (or file-like object) as input, so read
# each part-file on its own and stack the pieces instead of passing the list.
nodeDegreeCountsDf = pd.concat(
    [pd.read_parquet(parquet_path) for parquet_path in node_degree_file],
    ignore_index=True,
)

Number of parquet files for node_degree_file were found to be : 1


In [8]:
# Preview the first 10 bucket/count rows
nodeDegreeCountsDf.head(n=10)

Unnamed: 0,buckets,count
0,5000-10000,44995
1,2000-5000,112286
2,1000-2000,144926
3,50-100,325434
4,100-200,572169
5,0-10,8388
6,500-1000,224884
7,>10000,40393
8,10-50,219655
9,200-500,569061


In [9]:
# Order in which the degree buckets should appear on the x-axis (ascending degree)
bucketsOrder = [
    '0-10', '10-50', '50-100', '100-200', '200-500',
    '500-1000', '1000-2000', '2000-5000', '5000-10000', '>10000',
]

# x: bucket label, ':O' marks it as ordinal (categorical) data shown in the custom order
degree_axis = alt.X(
    'buckets:O',
    sort=bucketsOrder,
    axis=alt.Axis(title='Degree of node', labelFontSize=12),
)

# y: number of tracks per bucket, ':Q' marks it as quantitative (numerical) data
track_count_axis = alt.Y(
    'count:Q',
    axis=alt.Axis(title='Number of tracks', labelFontSize=12),
)

# Chart title rendered at font size 20
degree_chart_title = alt.TitleParams(
    text='Degree Distribution of Nodes(songs) in the Network',
    fontSize=20,
)

# Assemble the bar chart with Altair
chart = (
    alt.Chart(nodeDegreeCountsDf)
    .mark_bar()
    .encode(x=degree_axis, y=track_count_axis)
    .properties(title=degree_chart_title, width=400)
    .configure_axis(labelFontSize=12)  # Font size of the axis labels
)

# Render the chart as the cell output
chart

#### Prerequisite:
Before running the next cell, make sure you have run `6_pytorch_geometric_data_obj_generator.py`, either directly from the command line as a Python script or via `batch-job-6-pytorch-geometric-data-generator.sh`, which runs the same Python program on a Slurm cluster.

In [10]:
# After running the batch-6 script the data representations are available;
# pick out the generated folder name and use it as dataVersion below.
dataVersion = 'version_20230804_030210'
representationsDirectory = dataBaseDirectory + "/saved_files/data_representations"
versionDirectoryName = '{}/{}'.format(representationsDirectory, dataVersion)

# By design each version directory holds exactly one data object pickle;
# sorting keeps the choice deterministic should there ever be more than one.
search_path = os.path.join(versionDirectoryName, 'data_obj_*.pkl')
matching_files = sorted(glob.glob(search_path))

# Stop here with an actionable error instead of printing a message and then
# failing on an undefined 'matching_file' a few lines later.
if not matching_files:
    raise FileNotFoundError("""
    Error: Check if folder you provided as argument exists inside 'saved_files/data_representations'.
    Path to review the contents is : {}.
    """.format(representationsDirectory))

matching_file = matching_files[0]
print(matching_file)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project's pipeline.
with open(matching_file, 'rb') as file:
    data = pickle.load(file)
print("The entire 'data' object has the below representation : \n{}".format(data))


/scratch/siads699s23_class_root/siads699s23_class/shared_data/team_16_algorhythms/data/spark_table_warehouse/saved_files/data_representations/version_20230804_030210/data_obj_2262191_nodes_2297244876_edges_20230804_030210.pkl
The entire 'data' object has the below representation : 
Data(x=[2262191, 29], edge_index=[2, 2297244876], edge_weight=[2297244876], num_nodes=2262191)


In [11]:
# Bin edges used to bucket the edge weights along the x-axis
xRanges = [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Count how many edge weights fall between each pair of consecutive bin edges
hist, edges = np.histogram(data.edge_weight.numpy(), bins=xRanges)

# Pair every "lower-upper" label with the number of edges counted in that bin
bucket_counts = [
    (f"{lower}-{upper}", int(bin_count))
    for lower, upper, bin_count in zip(xRanges[:-1], xRanges[1:], hist)
]

# Altair consumes a pandas dataframe
edgeWeightRangesDf = pd.DataFrame(bucket_counts, columns=['Range', 'Count'])

# Keep the bars in the same order as the bin edges
bucketsOrder = edgeWeightRangesDf['Range'].tolist()

# x: weight range as ordinal data in the order defined above
weight_range_axis = alt.X(
    'Range:O',
    sort=bucketsOrder,
    axis=alt.Axis(title='Edge weight of edge', labelFontSize=12),
)

# y: edge count on a symlog scale
edge_count_axis = alt.Y(
    'Count:Q',
    scale=alt.Scale(type='symlog'),
    axis=alt.Axis(title='Number of edges (in log scale)', labelFontSize=12),
)

edge_weight_chart_title = alt.TitleParams(
    text='Log-Scaled Distribution of Edge weights in the Network',
    fontSize=20,
)

# Assemble the bar chart
chart = (
    alt.Chart(edgeWeightRangesDf)
    .mark_bar()
    .encode(x=weight_range_axis, y=edge_count_axis)
    .properties(title=edge_weight_chart_title, width=400)
    .configure_axis(labelFontSize=12)  # Font size of the axis labels
)

# Render the chart as the cell output
chart