# Predicting 2020 audio features as a baseline

In [1]:
import os

import numpy as np
import pandas as pd
import scipy.special
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

In [2]:
# Project root directory (must end with a path separator, because later cells
# build file names by string concatenation: file_path + 'data/...').
# Defaults to the original author's checkout; set the SPOTIFY_PROJECT_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
file_path = os.environ.get(
    'SPOTIFY_PROJECT_DIR',
    '/Users/sebastian/Documents/Uni/Sheffield (MSc)/2. Semester/Data Analysis and Viz/spotify_audio_feature_analysis/',
)
if not file_path.endswith('/'):
    file_path += '/'

In [3]:
# Read the input table (data/5_reduced_weighted_data.csv under the project root).
input_csv = file_path + 'data/5_reduced_weighted_data.csv'
data = pd.read_csv(input_csv)

# Uncomment to inspect the column dtypes:
# data.dtypes

Unnamed: 0                 int64
Country                   object
Date                      object
Week                       int64
valence                  float64
danceability             float64
energy                   float64
tempo                    float64
Rank                       int64
Song.ID                   object
reversed_rank              int64
weighted_valence         float64
weighted_danceability    float64
weighted_energy          float64
weighted_tempo           float64
dtype: object

In [4]:
# Country and Week are treated as categorical predictors; Date is parsed to a
# datetime so it can be compared against a cutoff date when splitting.
for column in ("Country", "Week"):
    data[column] = pd.Categorical(data[column])
data["Date"] = pd.to_datetime(data["Date"])

# Uncomment for summary statistics:
# data.describe()

In [5]:
# The split has to happen before one-hot encoding, because the encoded feature
# matrix no longer contains the Date column.
# Rows dated before 2020-01-01 are used for training; rows from that date
# onwards form the test set.
split_date = "2020-01-01"
training_data = data.loc[data["Date"] < split_date]
test_data = data.loc[data["Date"] >= split_date]

In [6]:
# Predictors: Country and Week (in that order; the encoding step addresses
# them as columns 0 and 1).
# Selected by name instead of by position, so the cell does not silently pick
# the wrong columns if the column order of the CSV changes.
feature_columns = ["Country", "Week"]
x_train = training_data[feature_columns].to_numpy()

x_test = test_data[feature_columns].to_numpy()

In [7]:
# for y select valence [11], danceability [12], energy [13], tempo [14]
y_train_valence = training_data.iloc[:, 11].values
y_train_danceability = training_data.iloc[:, 12].values
y_train_energy = training_data.iloc[:, 13].values
y_train_tempo = training_data.iloc[:, 14].values

y_test_valence = training_data.iloc[:, 11].values
y_test_danceability = training_data.iloc[:, 12].values
y_test_energy = training_data.iloc[:, 13].values
y_test_tempo = training_data.iloc[:, 14].values

In [8]:
# OneHotEncoding country and week variable
ct = ColumnTransformer(transformers = [("encoder", OneHotEncoder(), [0, 1])], remainder = "passthrough")
x_train = ct.fit_transform(np.array(x_train))
x_test = ct.fit_transform(np.array(x_test))

In [9]:
from sklearn.preprocessing import PolynomialFeatures
import scipy.special

poly = PolynomialFeatures(interaction_only = True)
x_train = poly.fit_transform(x_train)
x_test = poly.fit_transform(x_test)

In [30]:
# Training the linear model
regressor = LinearRegression()
regressor.fit(x_train, y_test_valence) # for each dependent variable (valene, danceability, energy, tempo)

LinearRegression()

In [31]:
# Predicting the Test set results
y_pred_valence = regressor.predict(x_test) # for each dependent variable (valene, danceability, energy, tempo)

In [40]:
# Collect the four prediction arrays into one table, one column per feature.
predicted_columns = {
    'weighted_valence_pred': y_pred_valence,
    'weighted_danceability_pred': y_pred_danceability,
    'weighted_energy_pred': y_pred_energy,
    'weighted_tempo_pred': y_pred_tempo,
}
df = pd.DataFrame(predicted_columns)

In [41]:
# Sanity check: (number of predicted rows, number of predicted features).
df.shape

(755044, 4)

In [42]:
# Write the predictions to the project's data folder (the DataFrame index is
# written as the first CSV column, as before).
output_csv = file_path + 'data/6_weighted_audio_features_pred.csv'
df.to_csv(output_csv)