In [4]:
# Import dependencies
import findspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Initialize findspark before the Spark session is created
findspark.init()

# Get the active Spark session, or create one named "SK_model" if none exists
spark = (
    SparkSession.builder
    .appName("SK_model")
    .getOrCreate()
)

In [6]:
# Read the pre-processed CSV into a Spark DataFrame.
# The first row is treated as the header and column types are inferred from the data.
df = spark.read.csv(
    "../Resources/filtered_encoded_genre_unbinned.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

# Preview the loaded rows
df.show()

+-----------------+--------------------+----------+-----------+--------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+------------------+----------------+----------------+----------------+----------------+----------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   primary_artist|          track_name|popularity|track_genre|explicit|danceability|energy|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|      duration_min|time_signature_0|time_signature_1|time_signature_3|time_signature_4|time_signature_5|key_0|key_1|key_2|key_3|key_4|key_5|key_6|key_7|key_8|key_9|key_10|key_11|num_artists_binned_1|num_artists_binned_2|num_artists_binned_3|num_artists_binned_4|num_artists_binned_5|num_artists_binned_6|
+-----------------+--------------------+----

In [7]:
# Columns to remove before modelling.
# NOTE: the preview printed by the load cell lists only "primary_artist" and
# "track_name" among these; Spark's drop() ignores names that are not in the
# schema, so the remaining entries are harmless no-ops for this file.
unused_columns = ["primary_artist", "track_name", "artists", "track_id", "album_name"]
df = df.drop(*unused_columns)

# Preview the remaining columns
df.show()

+----------+-----------+--------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+------------------+----------------+----------------+----------------+----------------+----------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|popularity|track_genre|explicit|danceability|energy|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|      duration_min|time_signature_0|time_signature_1|time_signature_3|time_signature_4|time_signature_5|key_0|key_1|key_2|key_3|key_4|key_5|key_6|key_7|key_8|key_9|key_10|key_11|num_artists_binned_1|num_artists_binned_2|num_artists_binned_3|num_artists_binned_4|num_artists_binned_5|num_artists_binned_6|
+----------+-----------+--------+------------+------+--------+----+-----------+------------+----------------+--------+----

In [8]:
# Convert the Spark DataFrame to a pandas DataFrame so the pandas / scikit-learn
# steps in the following cells can operate on it
p_df = df.toPandas()
# Show the first rows of the converted frame
p_df.head()

Unnamed: 0,popularity,track_genre,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,key_8,key_9,key_10,key_11,num_artists_binned_1,num_artists_binned_2,num_artists_binned_3,num_artists_binned_4,num_artists_binned_5,num_artists_binned_6
0,73,acoustic,0,0.676,0.461,-6.746,0,0.143,0.0322,1e-06,...,0,0,0,0,1,0,0,0,0,0
1,55,acoustic,0,0.42,0.166,-17.235,1,0.0763,0.924,6e-06,...,0,0,0,0,1,0,0,0,0,0
2,57,acoustic,0,0.438,0.359,-9.734,1,0.0557,0.21,0.0,...,0,0,0,0,0,1,0,0,0,0
3,71,acoustic,0,0.266,0.0596,-18.515,1,0.0363,0.905,7.1e-05,...,0,0,0,0,1,0,0,0,0,0
4,82,acoustic,0,0.618,0.443,-9.681,1,0.0526,0.469,0.0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Names of the six "num_artists_binned_<n>" columns (n = 1..6) to remove
columns_to_drop = [f"num_artists_binned_{n}" for n in range(1, 7)]

# drop() returns a new DataFrame; p_df itself is left unchanged
p_df_drop = p_df.drop(columns=columns_to_drop)

# Show the first rows of the reduced DataFrame
p_df_drop.head()

Unnamed: 0,popularity,track_genre,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,73,acoustic,0,0.676,0.461,-6.746,0,0.143,0.0322,1e-06,...,0,0,0,0,0,0,0,0,0,0
1,55,acoustic,0,0.42,0.166,-17.235,1,0.0763,0.924,6e-06,...,0,0,0,0,0,0,0,0,0,0
2,57,acoustic,0,0.438,0.359,-9.734,1,0.0557,0.21,0.0,...,0,0,0,0,0,0,0,0,0,0
3,71,acoustic,0,0.266,0.0596,-18.515,1,0.0363,0.905,7.1e-05,...,0,0,0,0,0,0,0,0,0,0
4,82,acoustic,0,0.618,0.443,-9.681,1,0.0526,0.469,0.0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Collect the remaining column names into a plain list and print them
columns_list = list(p_df_drop.columns)
print(columns_list)

['popularity', 'track_genre', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_min', 'time_signature_0', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11']


In [11]:
# Create a min-max scaler that rescales values into the [0, 1] range.
# This single object is shared by the scaling cells below, each of which
# calls fit_transform on it for a different column.
scaler = MinMaxScaler(feature_range=(0, 1))

In [12]:
# Scale 'popularity' to [0, 1].
# Selecting with a list of one column yields the 2-D (n_rows, 1) array the scaler expects.
popularity_values = p_df_drop[['popularity']].to_numpy()
# fit_transform refits the shared scaler on this column only
scaled_popularity = scaler.fit_transform(popularity_values)
# Write the single scaled column back over the original values
p_df_drop['popularity'] = scaled_popularity[:, 0]

In [13]:
# Scale 'loudness' to [0, 1].
# Selecting with a list of one column yields the 2-D (n_rows, 1) array the scaler expects.
loudness_values = p_df_drop[['loudness']].to_numpy()
# fit_transform refits the shared scaler, replacing whatever it was fitted on before
scaled_loudness = scaler.fit_transform(loudness_values)
# Write the single scaled column back over the original values
p_df_drop['loudness'] = scaled_loudness[:, 0]

In [14]:
# Scale 'tempo' to [0, 1].
# Selecting with a list of one column yields the 2-D (n_rows, 1) array the scaler expects.
tempo_values = p_df_drop[['tempo']].to_numpy()
# fit_transform refits the shared scaler, so after this cell it holds the 'tempo' fit only
scaled_tempo = scaler.fit_transform(tempo_values)
# Write the single scaled column back over the original values
p_df_drop['tempo'] = scaled_tempo[:, 0]

In [15]:
# Display the DataFrame after scaling 'popularity', 'loudness' and 'tempo'.
# Each scaling cell above already wrote its result back into p_df_drop, so the
# former duplicate 'popularity' assignment here was redundant and has been removed.
p_df_drop

Unnamed: 0,popularity,track_genre,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,0.73,acoustic,0,0.676,0.4610,0.791392,0,0.1430,0.0322,0.000001,...,0,0,0,0,0,0,0,0,0,0
1,0.55,acoustic,0,0.420,0.1660,0.597377,1,0.0763,0.9240,0.000006,...,0,0,0,0,0,0,0,0,0,0
2,0.57,acoustic,0,0.438,0.3590,0.736123,1,0.0557,0.2100,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,0.71,acoustic,0,0.266,0.0596,0.573701,1,0.0363,0.9050,0.000071,...,0,0,0,0,0,0,0,0,0,0
4,0.82,acoustic,0,0.618,0.4430,0.737103,1,0.0526,0.4690,0.000000,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.21,world-music,0,0.172,0.2350,0.612952,1,0.0422,0.6400,0.928000,...,0,0,0,1,0,0,0,0,0,0
113545,0.22,world-music,0,0.174,0.1170,0.577345,0,0.0401,0.9940,0.976000,...,0,0,0,0,0,0,0,0,0,0
113546,0.22,world-music,0,0.629,0.3290,0.714648,0,0.0420,0.8670,0.000000,...,0,0,0,0,0,0,0,0,0,0
113547,0.41,world-music,0,0.587,0.5060,0.714759,1,0.0297,0.3810,0.000000,...,0,0,0,0,0,1,0,0,0,0


In [16]:
# Collect the distinct values of the 'track_genre' column, in order of first appearance
genre_column = p_df_drop['track_genre']
unique_genres = genre_column.unique()

# Print the array of distinct genres
print(unique_genres)

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [17]:
# Count how many distinct genres appear in 'track_genre'
genre_column = p_df_drop['track_genre']
unique_genre_count = genre_column.nunique()

# Report the count (the printed label says "Sum", but the value is a count)
print("Sum of unique genres:", unique_genre_count)

Sum of unique genres: 114


In [18]:
# Fixed, ordered list of genre labels; a genre's position in this list is its numeric code
genres = [
    'acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime',
    'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british',
    'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club',
    'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house',
    'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep',
    'edm', 'electro', 'electronic', 'emo', 'folk', 'forro',
    'french', 'funk', 'garage', 'german', 'gospel', 'goth',
    'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard-rock',
    'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'honky-tonk', 'house',
    'idm', 'indian', 'indie-pop', 'indie', 'industrial', 'iranian',
    'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 'k-pop',
    'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal',
    'metalcore', 'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode',
    'party', 'piano', 'pop-film', 'pop', 'power-pop', 'progressive-house',
    'psych-rock', 'punk-rock', 'punk', 'r-n-b', 'reggae', 'reggaeton',
    'rock-n-roll', 'rock', 'rockabilly', 'romance', 'sad', 'salsa',
    'samba', 'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep',
    'songwriter', 'soul', 'spanish', 'study', 'swedish', 'synth-pop',
    'tango', 'techno', 'trance', 'trip-hop', 'turkish', 'world-music',
]

# Map each genre name to its zero-based position in the list
genre_mapping = dict(zip(genres, range(len(genres))))

# Print one "<genre> <code>" line per entry
for name, index in genre_mapping.items():
    print(f"{name} {index}")

acoustic 0
afrobeat 1
alt-rock 2
alternative 3
ambient 4
anime 5
black-metal 6
bluegrass 7
blues 8
brazil 9
breakbeat 10
british 11
cantopop 12
chicago-house 13
children 14
chill 15
classical 16
club 17
comedy 18
country 19
dance 20
dancehall 21
death-metal 22
deep-house 23
detroit-techno 24
disco 25
disney 26
drum-and-bass 27
dub 28
dubstep 29
edm 30
electro 31
electronic 32
emo 33
folk 34
forro 35
french 36
funk 37
garage 38
german 39
gospel 40
goth 41
grindcore 42
groove 43
grunge 44
guitar 45
happy 46
hard-rock 47
hardcore 48
hardstyle 49
heavy-metal 50
hip-hop 51
honky-tonk 52
house 53
idm 54
indian 55
indie-pop 56
indie 57
industrial 58
iranian 59
j-dance 60
j-idol 61
j-pop 62
j-rock 63
jazz 64
k-pop 65
kids 66
latin 67
latino 68
malay 69
mandopop 70
metal 71
metalcore 72
minimal-techno 73
mpb 74
new-age 75
opera 76
pagode 77
party 78
piano 79
pop-film 80
pop 81
power-pop 82
progressive-house 83
psych-rock 84
punk-rock 85
punk 86
r-n-b 87
reggae 88
reggaeton 89
rock-n-roll 90
roc

In [19]:
# Build a two-column lookup table: genre name -> numeric label
genre_mapping_df = pd.DataFrame({
    'genre': list(genre_mapping.keys()),
    'label': list(genre_mapping.values()),
})

# Persist the lookup table (without the index column) so the codes can be decoded later
genre_mapping_df.to_csv('genre_mapping.csv', index=False)

In [20]:
# Translate each row's genre name into its numeric code.
# Series.map yields NaN for any value that is not a key of genre_mapping, which
# would silently turn 'genre_code' into a float column with missing labels.
genre_codes = p_df_drop['track_genre'].map(genre_mapping)

# Fail loudly if the data contains a genre that the hard-coded mapping does not cover
unmapped_genres = p_df_drop.loc[genre_codes.isna(), 'track_genre'].unique().tolist()
if unmapped_genres:
    raise ValueError(f"Genres missing from genre_mapping: {unmapped_genres}")

# Add the validated codes as a new 'genre_code' column and display the result
p_df_drop['genre_code'] = genre_codes
p_df_drop

Unnamed: 0,popularity,track_genre,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,genre_code
0,0.73,acoustic,0,0.676,0.4610,0.791392,0,0.1430,0.0322,0.000001,...,0,0,0,0,0,0,0,0,0,0
1,0.55,acoustic,0,0.420,0.1660,0.597377,1,0.0763,0.9240,0.000006,...,0,0,0,0,0,0,0,0,0,0
2,0.57,acoustic,0,0.438,0.3590,0.736123,1,0.0557,0.2100,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,0.71,acoustic,0,0.266,0.0596,0.573701,1,0.0363,0.9050,0.000071,...,0,0,0,0,0,0,0,0,0,0
4,0.82,acoustic,0,0.618,0.4430,0.737103,1,0.0526,0.4690,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.21,world-music,0,0.172,0.2350,0.612952,1,0.0422,0.6400,0.928000,...,0,0,1,0,0,0,0,0,0,113
113545,0.22,world-music,0,0.174,0.1170,0.577345,0,0.0401,0.9940,0.976000,...,0,0,0,0,0,0,0,0,0,113
113546,0.22,world-music,0,0.629,0.3290,0.714648,0,0.0420,0.8670,0.000000,...,0,0,0,0,0,0,0,0,0,113
113547,0.41,world-music,0,0.587,0.5060,0.714759,1,0.0297,0.3810,0.000000,...,0,0,0,0,1,0,0,0,0,113


In [21]:
# Save the scaled DataFrame (still containing both 'track_genre' and 'genre_code')
# to CSV without the index column
p_df_drop.to_csv('scaled_data.csv', index=False)

In [25]:
# The text genre column is dropped here; 'genre_code' stays in the frame
columns_to_drop = ["track_genre"]

# drop() returns a new DataFrame; p_df_drop itself keeps 'track_genre'
p_df_drop_genre = p_df_drop.drop(columns_to_drop, axis="columns")

# Show the first rows of the result
p_df_drop_genre.head()

Unnamed: 0,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,genre_code
0,0.73,0,0.676,0.461,0.791392,0,0.143,0.0322,1e-06,0.358,...,0,0,0,0,0,0,0,0,0,0
1,0.55,0,0.42,0.166,0.597377,1,0.0763,0.924,6e-06,0.101,...,0,0,0,0,0,0,0,0,0,0
2,0.57,0,0.438,0.359,0.736123,1,0.0557,0.21,0.0,0.117,...,0,0,0,0,0,0,0,0,0,0
3,0.71,0,0.266,0.0596,0.573701,1,0.0363,0.905,7.1e-05,0.132,...,0,0,0,0,0,0,0,0,0,0
4,0.82,0,0.618,0.443,0.737103,1,0.0526,0.469,0.0,0.0829,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Save the DataFrame without the 'track_genre' text column to CSV, omitting the index column
p_df_drop_genre.to_csv('dropped_genre_data.csv', index=False)

In [None]:
# Print the column names of p_df_drop (which still includes 'track_genre'
# and now also 'genre_code')
columns_list = list(p_df_drop.columns)
print(columns_list)