# Predicting 2020 audio features as a baseline

In [5]:
import os

import numpy as np
import pandas as pd
import scipy.special
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

In [11]:
# set file path
# The project folder can be overridden with the SPOTIFY_PROJECT_DIR environment
# variable so the notebook also runs on other machines; without it the original
# local checkout location is used.
# os.path.join(..., '') guarantees a trailing path separator, which matters because
# other cells build file names by plain concatenation (file_path + 'data/...').
file_path = os.path.join(
    os.environ.get(
        'SPOTIFY_PROJECT_DIR',
        '/Users/sebastian/Documents/Uni/Sheffield (MSc)/2. Semester/Data Analysis and Viz/spotify_audio_feature_analysis/',
    ),
    '',
)

In [13]:
# Read data/5_clean_data.csv from the project folder into a DataFrame.
clean_data_csv = file_path + 'data/5_clean_data.csv'
data = pd.read_csv(clean_data_csv)

# data.dtypes

In [14]:
# Preview the loaded frame; pandas abbreviates the repr to the leading and trailing rows.
data

Unnamed: 0,song,artist,country,date,week,streams,rank,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,song_id
0,Chantaje (feat. Maluma),Shakira,Argentina,2017-01-06,1,1321075.0,1,0.852,0.773,8,-2.921,0,0.0776,0.18700,0.000030,0.1590,0.907,6mICuAdrwEjh6Y6lroV2Kg
1,Reggaetón Lento (Bailemos),CNCO,Argentina,2017-01-06,1,1293092.0,2,0.761,0.838,4,-3.073,0,0.0502,0.40000,0.000000,0.1760,0.710,3AEZUABDXNtecAOSC1qTfo
2,Cuando Se Pone a Bailar,Rombai,Argentina,2017-01-06,1,942751.0,3,0.588,0.682,11,-7.169,0,0.1730,0.08510,0.000027,0.0840,0.937,1MpKZi1zTXpERKwxmOu1PH
3,Vente Pa' Ca (feat. Maluma),Ricky Martin,Argentina,2017-01-06,1,940972.0,4,0.663,0.920,11,-4.070,0,0.2260,0.00431,0.000017,0.1010,0.533,7DM4BPaS7uofFul3ywMe46
4,Safari,"J Balvin, Pharrell Williams, BIA, Sky",Argentina,2017-01-06,1,822844.0,5,0.508,0.687,0,-4.361,1,0.3260,0.55100,0.000003,0.1260,0.555,6rQSrBHf7HlZjtcMZ4S4bO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2574047,Lời Yêu Em,Vũ.,Vietnam,2021-02-05,5,26915.0,196,0.520,0.248,6,-16.656,0,0.0601,0.95600,0.000041,0.1190,0.355,7zKDCM8YoeC3tqHV5ZmFuy
2574048,eight(Prod.&Feat. SUGA of BTS),IU,Vietnam,2021-02-05,5,26895.0,197,0.676,0.869,1,-1.573,1,0.0423,0.11500,0.000000,0.1320,0.594,0pYacDCZuRhcrwGUA5nTBe
2574049,Therefore I Am,Billie Eilish,Vietnam,2021-02-05,5,26866.0,198,0.889,0.340,11,-7.773,0,0.0697,0.21800,0.130000,0.0550,0.716,54bFM56PmE4YLRnqpW6Tha
2574050,They Said,"Binz, Touliver",Vietnam,2021-02-05,5,26683.0,199,0.771,0.767,9,-3.111,0,0.1520,0.43200,0.000170,0.0938,0.548,6Yr49px8UOb8LEwY2eORNV


In [4]:
# Country and Week are treated as categorical variables, Date as a timestamp.
# NOTE(review): the frame previewed earlier in this notebook shows lower-case column
# names (country, date, week) - confirm that the loaded file really uses the
# capitalised names referenced here.
for categorical_column in ("Country", "Week"):
    data[categorical_column] = pd.Categorical(data[categorical_column])
data["Date"] = pd.to_datetime(data["Date"])

# data.describe()

In [5]:
# Split chronologically before the OneHotEncoding step, because the date variable
# is not carried through that step: rows before the split date are used for
# training, rows on or after it for testing.
split_date = "2020-01-01"
is_before_split = data["Date"] < split_date
is_from_split_on = data["Date"] >= split_date
training_data = data[is_before_split]
test_data = data[is_from_split_on]

In [6]:
# Predictors: Country and Week, picked by column position.
# NOTE(review): positions 1 and 3 are assumed to be Country and Week; they do not
# obviously line up with the frame previewed earlier in this notebook - confirm
# against the file that is actually loaded.
predictor_positions = [1, 3]
x_train = training_data.iloc[:, predictor_positions].values

x_test = test_data.iloc[:, predictor_positions].values

In [7]:
# for y select valence [11], danceability [12], energy [13], tempo [14]
y_train_valence = training_data.iloc[:, 11].values
y_train_danceability = training_data.iloc[:, 12].values
y_train_energy = training_data.iloc[:, 13].values
y_train_tempo = training_data.iloc[:, 14].values

y_test_valence = training_data.iloc[:, 11].values
y_test_danceability = training_data.iloc[:, 12].values
y_test_energy = training_data.iloc[:, 13].values
y_test_tempo = training_data.iloc[:, 14].values

In [8]:
# OneHotEncoding country and week variable
ct = ColumnTransformer(transformers = [("encoder", OneHotEncoder(), [0, 1])], remainder = "passthrough")
x_train = ct.fit_transform(np.array(x_train))
x_test = ct.fit_transform(np.array(x_test))

In [9]:
from sklearn.preprocessing import PolynomialFeatures
import scipy.special

poly = PolynomialFeatures(interaction_only = True)
x_train = poly.fit_transform(x_train)
x_test = poly.fit_transform(x_test)

In [30]:
# Training the linear model
regressor = LinearRegression()
regressor.fit(x_train, y_test_valence) # for each dependent variable (valene, danceability, energy, tempo)

LinearRegression()

In [31]:
# Predicting the Test set results
y_pred_valence = regressor.predict(x_test) # for each dependent variable (valene, danceability, energy, tempo)

In [40]:
# Collect the four prediction arrays into one frame, one column per audio feature.
prediction_labels = ['weighted_valence_pred',
                     'weighted_danceability_pred',
                     'weighted_energy_pred',
                     'weighted_tempo_pred']
prediction_arrays = [y_pred_valence, y_pred_danceability, y_pred_energy, y_pred_tempo]
df = pd.DataFrame(dict(zip(prediction_labels, prediction_arrays)))

In [41]:
# Sanity check: there must be exactly one row of predictions per test row.
assert len(df) == len(test_data), "number of prediction rows differs from number of test rows"
# (rows, columns) of the prediction frame
df.shape

(755044, 4)

In [42]:
# Write the prediction frame (including its index column) to the project's data folder.
predictions_csv = file_path + 'data/6_weighted_audio_features_pred.csv'
df.to_csv(predictions_csv)