In [175]:
# import statements

import pandas as pd
import plotly.express as px  # (version 4.7.0)
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


In [176]:
# Raw song table; every later cell copies from this frame instead of mutating it.
top_songs = pd.read_csv("top_songs.csv")

# Per-year averages of the numeric columns. numeric_only=True keeps the text
# columns (artist names, genre lists) out of the mean; without it pandas >= 2.0
# raises a TypeError when it meets a string column.
top_songs_test = top_songs.groupby('year', as_index=False).mean(numeric_only=True)

In [177]:
# Yearly averages of energy and danceability, drawn as one line per feature.
px.line(
    data_frame=top_songs_test,
    x="year",
    y=["energy", "danceability"],
)

In [178]:
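# TODO: rescale the audio features so that their values lie between 0 and 1.
# NOTE: the draft below is not a 0-1 rescale -- it is a z-score standardisation
# (mean 0, std 1) -- and it reads from `sr_df`, which is not defined in the visible cells.
#  top_songs[cols] = sr_df[cols].apply(lambda x: ((x-x.mean()) / (x.std())))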

# Sorted Bar Chart Viz

In [179]:
# The most popular genres throughout the years: top 10 genres sorted by song count.
# Planned, not implemented in this cell:
# - selection/slider feature for year (different time frames, "Select all" option)
line_copy = top_songs.copy()

# `artist_genre` is handled as text (presumably the repr of a list, e.g.
# "['pop', 'dance pop']" -- confirm against the CSV). Remove the brackets and
# quotes, split on commas, then strip the blank left after each comma so that
# " pop" and "pop" are counted as the same genre. Empty tokens (from "[]") and
# missing values become an empty list instead of a '' genre.
line_copy['artist_genre'] = (
    line_copy['artist_genre']
    .str.replace(r"[\[\]']", '', regex=True)
    .str.split(',')
    .apply(lambda parts: [p.strip() for p in parts if p.strip()] if isinstance(parts, list) else [])
)

# One row per (song, genre) pair; value_counts sorts by count, descending.
genre_counts = line_copy['artist_genre'].explode().value_counts()

# genre -> number of songs, kept as a plain dict under the original name.
genres = genre_counts.to_dict()

# Take the ten most frequent genres. The previous [1:10] slice skipped the
# first-ranked key and returned only nine entries.
top_genres = genre_counts.head(10)
top_genres_keys = top_genres.index.tolist()
top_genres_values = top_genres.tolist()

fig = px.bar(x=top_genres_keys, y=top_genres_values, title = 'Top Genres', labels={'x':'Artist Genre', 'y':'Song Count'})
fig

In [180]:
# The artists with the most top songs throughout the years: 10 artists sorted
# by their count of charting songs.
# Planned, not implemented in this cell:
# - selection/slider feature for year (different time frames, "Select all" option)
line_copy = top_songs.copy()

# `artist_name` is handled as text (presumably the repr of a list of names --
# confirm against the CSV). Remove brackets and quotes, split on commas, and
# strip the blank left after each comma so " Drake" and "Drake" are one artist.
# NOTE(review): names that themselves contain an apostrophe or a comma are
# altered/split by this parsing, exactly as they were before this change.
line_copy['artist_name'] = (
    line_copy['artist_name']
    .str.replace(r"[\[\]']", '', regex=True)
    .str.split(',')
    .apply(lambda parts: [p.strip() for p in parts if p.strip()] if isinstance(parts, list) else [])
)

# One row per (song, artist) pair; value_counts sorts by count, descending.
artist_counts = line_copy['artist_name'].explode().value_counts()

# artist -> number of songs, kept as a plain dict under the original name.
artists_dict = artist_counts.to_dict()

# Take the ten most frequent artists. The previous [1:10] slice dropped the
# first-ranked artist and returned only nine entries.
top_artists = artist_counts.head(10)
top_artists_keys = top_artists.index.tolist()
top_artists_values = top_artists.tolist()

fig = px.bar(x=top_artists_keys, y=top_artists_values, title = 'Top Artists', labels={'x':'Artist Name', 'y':'Song Count'})

fig

In [188]:
def create_data_genre(selected_genres, features, songs_df=None):
    """Label songs with a broad genre and keep only the requested feature columns.

    A song belongs to a broad genre when that genre's name occurs as a
    substring of the song's ``artist_genre`` text (so 'rock' also matches
    'pop rock'). A song matching several selected genres is kept once, under
    the genre that comes first in the fixed list of broad genres below.

    Parameters
    ----------
    selected_genres : sequence of str
        Broad genres to keep. Any number is accepted; names that are not in
        the fixed list of broad genres contribute no rows.
    features : sequence of str
        Extra column names to return after the identifying columns.
    songs_df : pandas.DataFrame, optional
        Song table to read from. Defaults to the notebook-level ``top_songs``.
        It needs the columns ``artist_genre``, ``songs_id``, ``songs_name``,
        ``artist_name``, ``year`` and every name in ``features``.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre, songs_id, songs_name, artist_name, year`` followed by
        ``features``; one row per distinct ``songs_id``. The index holds the
        row positions from before de-duplication, so it can have gaps.
    """
    main_genres_options = ['r&b', 'hip hop', 'country', 'rock', 'metal', 'edm', 'indie', 'pop']
    source = top_songs if songs_df is None else songs_df

    # Collect one labelled frame per requested genre and concatenate once,
    # instead of growing a frame inside the loop.
    frames = []
    for genre in main_genres_options:
        if genre not in selected_genres:
            continue
        # Literal substring match; rows with a missing genre never match.
        matches = source['artist_genre'].str.contains(genre, regex=False, na=False)
        frames.append(source.loc[matches].assign(genre=genre))
    if not frames:
        # Nothing requested from the known genres: return an empty, well-formed frame.
        frames.append(source.iloc[:0].assign(genre=''))

    output = pd.concat(frames).reset_index(drop=True)
    # Keep the first occurrence of each song; frames were appended in
    # main_genres_options order, which decides the label of multi-genre songs.
    output = output.drop_duplicates(subset='songs_id')

    columns = ['genre', 'songs_id', 'songs_name', 'artist_name', 'year'] + list(features)
    return output[columns]
# Rock vs. hip hop songs, described by loudness and acousticness.
songs = create_data_genre(
    selected_genres=['rock', 'hip hop'],
    features=['loudness', 'acousticness'],
)

def knn(df):
    """Fit a k-nearest-neighbours genre classifier and plot its test-set predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``genre`` column holding the class labels; the LAST TWO
        columns are used as the two numeric features.

    Returns
    -------
    plotly figure
        Scatter of the held-out 20% of rows in min-max-scaled feature space.
        Marker colour is the predicted probability of the second class,
        marker symbol is the true genre.

    Raises
    ------
    ValueError
        If the training split contains fewer than two distinct genres.
    """
    X = df.iloc[:, -2:]  # Get the two selected features

    # Scale the quantitative variables to be between 0 and 1. Keep df's index on
    # the scaled frame so X and y stay aligned by label as well as by position
    # (df's index can have gaps, and X_test / y_test are later handed to plotly
    # together).
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling; fit on the training split only if a
    # strict hold-out evaluation is required.
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    y = df['genre']  # Label is the genre

    # Split into train/test (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y.astype(str), test_size=0.20, random_state=0)

    # Fit the model on training data, predict on test data
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)

    classes = clf.classes_
    if len(classes) < 2:
        raise ValueError(
            'knn needs at least two genres in the training data, got: {0}'.format(list(classes))
        )

    # Column 1 of predict_proba is the probability of classes[1]. The legend
    # and title below therefore use clf.classes_ rather than the order in
    # which genres happen to appear in the test split.
    y_score = clf.predict_proba(X_test)[:, 1]

    cols = X_test.columns
    fig = px.scatter(
        X_test, x=cols[0], y=cols[1],
        color=y_score, color_continuous_scale='RdBu',
        symbol=y_test, symbol_map={classes[0]: 'circle-dot', classes[1]: 'square-dot'},
        labels={'symbol': 'genre', 'color': 'Prediction Score: 0 ({0}) - 1 ({1})'.format(classes[0], classes[1])},
        title='Classifier Results {0} and {1} ({2},{3})'.format(classes[0], classes[1], cols[0], cols[1])
    )
    fig.update_traces(marker_size=11, marker_line_width=1.5)
    fig.update_layout(legend_orientation='h')
    return fig

# Classifier scatter for the selected genres; the bare name on the last line
# makes the notebook render the figure.
plt = knn(df=songs)

plt