# Importing Libraries



In [0]:
# Importing Libraries
import pickle
import time
from pathlib import Path

import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Loading Data



In [0]:
# Location of the raw dataset, relative to the notebook
data_path = 'data/music.csv'

# Read the CSV and keep only the rows that have no null value
music_df = pd.read_csv(data_path).dropna()

# Show which columns are available
music_df.columns

# Data Cleaning



In [0]:
# DATA CLEANING
# Builds the cleaned frame in `music_df`: flags and imputes missing tempo values,
# derives the 'has_feat' indicator, removes rows with unusable duration or
# popularity, and drops the columns that are not used as features.

# Missing tempo values are stored as the string '?' and are imputed with 120
TEMPO_PLACEHOLDER = '?'
TEMPO_FILL_VALUE = 120

# Markers that introduce a featured artist in a lower-cased track name.
# The word boundaries stop 'ft.' / 'feat.' from matching ordinary words that
# merely end in those letters (e.g. "left." or "defeat.").
FEATURE_PATTERN = r'\bft\.|\(with |\bfeat\.'

# Impute tempo column
tempo_missing = music_df['tempo'].eq(TEMPO_PLACEHOLDER)
featured = music_df['track_name'].str.lower().str.contains(FEATURE_PATTERN, regex=True)

music_df = music_df.assign(
    # 1 when the tempo value is imputed, 0 when it was present in the raw data
    tempo_mod=tempo_missing.astype(int),
    # Replace the placeholder, then convert the whole column to float
    tempo=music_df['tempo'].replace({TEMPO_PLACEHOLDER: TEMPO_FILL_VALUE}).astype(float),
    # 1 when the track name mentions a featured artist, 0 otherwise
    has_feat=featured.astype(int),
)

# We attempted to impute duration but did not find improved results, so rows with
# a duration of -1 are dropped. Popularity is the target and is never imputed, so
# rows with a popularity of 0 are dropped as well.
valid_rows = music_df['duration_ms'].ne(-1.) & music_df['popularity'].ne(0)
music_df = music_df.loc[valid_rows]

# Dropped useless columns
columns_to_drop = ['instance_id','track_name', 'obtained_date', 'artist_name']
music_df = music_df.drop(columns=columns_to_drop)

# Exploratory Data Analysis



In [0]:
# Search for outliers in numerical data (disabled; uncomment to render a histogram and a box plot per numeric column)
#num_data = ['acousticness', 'danceability', 'duration_ms', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
#for num_col in num_data:
#    fig = px.histogram(music_df, x=num_col)
#    fig.show()
#    fig = px.box(music_df, x=num_col)
#    fig.show()

In [0]:
# Correlation heatmap of the numeric columns.
# The text columns are still present at this stage, so the numeric columns are
# selected explicitly: DataFrame.corr() no longer drops non-numeric columns
# silently in pandas 2.0+ and raises on strings instead.
# There are no strong correlations between numerical variables and the popularity column
numeric_corr = music_df.select_dtypes(include='number').corr()
fig = px.imshow(numeric_corr, text_auto=False, height=700, width=888)
fig.update_layout(title={
    'text': "Music Data Correlation Matrix",
    'font': {'size': 25},
    'y': 0.97})
fig.show()
#fig.write_html("correlation_heatmap.html")

In [0]:
# Univariate distribution of the target column
fig = px.histogram(
    music_df,
    x='popularity',
    title='Popularity Distribution',
    height=450,
    width=700,
)
fig.update_layout(
    title=dict(text="Popularity Distribution", font=dict(size=25), y=0.9),
)
fig.show()
#fig.write_html("html_files/popularity_uni.html")

In [0]:
# Number of tracks per genre; bars are coloured by genre, so the legend is hidden
fig = px.histogram(
    music_df,
    x='music_genre',
    color="music_genre",
    title='Music Genres',
    height=450,
    width=700,
)
fig.update_layout(
    showlegend=False,
    title=dict(text="Music Genres", font=dict(size=25), y=0.9),
)
fig.show()
#fig.write_html("html_files/genres_uni.html")

In [0]:
# Popularity spread within each genre; the legend is hidden
fig = px.box(
    music_df,
    x='music_genre',
    y='popularity',
    color='music_genre',
    title='Popularity vs Music Genre',
    height=450,
    width=700,
)
# Only the title font and position are restyled; the title text set above is kept
fig.update_layout(
    showlegend=False,
    title=dict(font=dict(size=25), y=0.9),
)
fig.show()
#fig.write_html("html_files/popularity_genre.html")

In [0]:
# Popularity spread for each mode; the legend is hidden
fig = px.box(
    music_df,
    x='mode',
    y='popularity',
    color='mode',
    title='Popularity vs Mode',
    height=450,
    width=700,
)
# Only the title font and position are restyled; the title text set above is kept
fig.update_layout(
    showlegend=False,
    title=dict(font=dict(size=25), y=0.9),
)
fig.show()
#fig.write_html("html_files/popularity_mode.html")

In [0]:
# Popularity spread for each key, with the keys shown in a fixed order on the x axis
key_order = ["A", "A#", "B", "C", "C#", "D", "D#","E", "F", "F#", "G", "G#"]
fig = px.box(
    music_df,
    x='key',
    y='popularity',
    color='key',
    title='Popularity vs Key',
    height=450,
    width=700,
    category_orders={"key": key_order},
)
# Only the title font and position are restyled; the title text set above is kept
fig.update_layout(
    showlegend=False,
    title=dict(font=dict(size=25), y=0.9),
)
fig.show()
#fig.write_html("html_files/popularity_key.html")

# Data Preprocessing



In [0]:
# DATA PREPROCESSING
# Encodes the categorical columns, separates the target from the inputs, creates
# a 90/10 train/test split and standardises the inputs.

# Categorical variable encoding
# 'mode' and 'key' are label encoded. The result is cast to int explicitly instead
# of relying on replace() to downcast an object column (deprecated pandas behaviour):
# if these columns stayed non-numeric, get_dummies below would one-hot encode them too.
# A value that is not in the mapping is left unchanged, so the int cast then fails
# loudly instead of slipping through. Values that are already codes pass through,
# which keeps the cell safe to re-run.
mode_codes = {'Major':1, 'Minor':0}
key_codes = {'A':0,'A#':1,'B':2,'C':3,'C#':4,'D':5,'D#':6,'E':7,'F':8,'F#':9,'G':10, 'G#':11}
music_df = music_df.assign(
    mode=music_df['mode'].map(lambda value: mode_codes.get(value, value)).astype(int),
    key=music_df['key'].map(lambda value: key_codes.get(value, value)).astype(int),
)
music_df = pd.get_dummies(music_df) # one hot encoding of the remaining non-numeric columns

# Divide data for supervised machine learning
target = music_df['popularity']
inputs = music_df.drop(columns='popularity')
x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=0.1, random_state=42)

# Scale the data
# The scaler is fitted on the training inputs only and then applied to the test inputs.
# The original index is kept on the scaled frames so their rows stay aligned with
# y_train / y_test in any index-based pandas operation.
scaler = StandardScaler()
x_train = pd.DataFrame(data=scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(data=scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [0]:
#filename = 'scaler.sav'
#pickle.dump(scaler, open(filename, 'wb'))

# Machine Learning



## Ridge Regressor



In [0]:
# Grid search over the Ridge regularisation strength, timed with a wall-clock counter
start_time = time.perf_counter()

ridge_grid = dict(alpha=[0.001, 0.01, 0.1, 1, 10])
ridge_model = Ridge()
ridge_search = GridSearchCV(estimator=ridge_model, param_grid=ridge_grid, verbose=1)
ridge_search.fit(x_train, y_train)

stop_time = time.perf_counter()

# Elapsed seconds first, then the best parameter combination
elapsed_seconds = stop_time - start_time
print(elapsed_seconds)
print(ridge_search.best_params_)

In [0]:
# Score the Ridge search on the test set: R^2 first, then mean absolute error
y_pred_ridge = ridge_search.predict(x_test)
for metric in (r2_score, mean_absolute_error):
    print(metric(y_test, y_pred_ridge))

In [0]:
# Predicted vs. actual popularity for the Ridge model on the test set
fig = px.scatter(x=y_pred_ridge, y=y_test)
fig.update_layout(
    width=600,
    height=400,
    title={
        'text': "Ridge Regression Test Results",
        'font': {'size': 25},
        'y': 0.93
    },
    xaxis_title="Predicted Values",
    yaxis_title="Actual Values"
)
fig.show()

# Make sure the output folder exists so the export does not fail on a fresh checkout
Path("html_files").mkdir(parents=True, exist_ok=True)
fig.write_html("html_files/test_ridge.html")

## LGBM Regressor



In [0]:
# Grid search over the LightGBM learning rate and tree depth, timed with a wall-clock counter
start_time = time.perf_counter()

lgbm_grid = dict(
    learning_rate=[0.001, 0.01, 0.1, 1.],
    max_depth=[-1, 3, 5, 7, 9, 11],
)
lgbm_model = LGBMRegressor()
lgbm_search = GridSearchCV(estimator=lgbm_model, param_grid=lgbm_grid, verbose=1)
lgbm_search.fit(x_train, y_train)

stop_time = time.perf_counter()

# Elapsed seconds first, then the best parameter combination
elapsed_seconds = stop_time - start_time
print(elapsed_seconds)
print(lgbm_search.best_params_)

In [0]:
# Score the LGBM search on the test set: R^2 first, then mean absolute error
y_pred_lgbm = lgbm_search.predict(x_test)
for metric in (r2_score, mean_absolute_error):
    print(metric(y_test, y_pred_lgbm))

In [0]:
# Predicted vs. actual popularity for the LGBM model on the test set
fig = px.scatter(x=y_pred_lgbm, y=y_test)
fig.update_layout(
    width=600,
    height=400,
    title={
        'text': "LGBM Regression Test Results",
        'font': {'size': 25},
        'y': 0.93
    },
    xaxis_title="Predicted Values",
    yaxis_title="Actual Values"
)
fig.show()

# Make sure the output folder exists so the export does not fail on a fresh checkout
Path("html_files").mkdir(parents=True, exist_ok=True)
fig.write_html("html_files/test_lgbm.html")

In [0]:
#filename = 'lgbm_model.sav'
#pickle.dump(lgbm_search, open(filename, 'wb'))

## SGD Regressor



In [0]:
# Grid search over the SGD loss, learning-rate schedule and regularisation strength,
# timed with a wall-clock counter
start_time = time.perf_counter()
sgd_grid = {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'learning_rate': ['invscaling', 'constant', 'optimal', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1]
}
# SGD shuffles the training data, so the estimator is seeded (same seed as the
# train/test split) to make the selected parameters and scores reproducible.
sgd_model = SGDRegressor(random_state=42)
sgd_search = GridSearchCV(sgd_model, sgd_grid, verbose = 1)
sgd_search.fit(x_train, y_train)
stop_time = time.perf_counter()

# Elapsed seconds first, then the best parameter combination
print(stop_time - start_time)
print(sgd_search.best_params_)

In [0]:
# Score the SGD search on the test set: R^2 first, then mean absolute error
y_pred_sgd = sgd_search.predict(x_test)
for metric in (r2_score, mean_absolute_error):
    print(metric(y_test, y_pred_sgd))

In [0]:
# Predicted vs. actual popularity for the SGD model on the test set
fig = px.scatter(x=y_pred_sgd, y=y_test)
fig.update_layout(
    width=600,
    height=400,
    title={
        'text': "SGD Regression Test Results",
        'font': {'size': 25},
        'y': 0.93
    },
    xaxis_title="Predicted Values",
    yaxis_title="Actual Values"
)
fig.show()

# Make sure the output folder exists so the export does not fail on a fresh checkout
Path("html_files").mkdir(parents=True, exist_ok=True)
fig.write_html("html_files/test_sgd.html")