In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import linear_model
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import KFold, train_test_split

In [26]:
import glob
import os

# setting the path for joining multiple files
files = os.path.join("../data/", "*.csv")

# list of merged files returned
files = glob.glob(files)

print("Resultant CSV after joining all CSV files at a particular location...");

# joining files with concat and read_csv
data = pd.concat(map(pd.read_csv, files), ignore_index=True)
data.shape

Resultant CSV after joining all CSV files at a particular location...


(26752, 22)

In [27]:
data.columns = ['Artist_Name', 'Track_Name', 'Popularity', 'Genres', 'Playlist',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature']

df = data.drop_duplicates(subset =["Track_Name", "Artist_Name"],keep = 'first')
df.shape

(18554, 22)

In [28]:
## No missing values 
## Step 1 - create dummies for variables without any order
df_new = pd.get_dummies(df, columns=['time_signature', 'key', 'mode'])
df_new['duration_min'] = df['duration_ms']/60000
pd.options.display.max_columns = None

In [29]:
X_other, X_test, y_other, y_test = train_test_split(df_new.drop(['Popularity'],axis=1), df_new['Popularity'], \
                                                    test_size= 0.2, random_state=42)
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
result = next(kf.split(X_other), None)
X_train = X_other.iloc[result[0]]
X_val =  X_other.iloc[result[1]]
y_train = y_other.iloc[result[0]]
y_val =  y_other.iloc[result[1]]

X_train.shape, X_val.shape, X_test.shape

((11874, 38), (2969, 38), (3711, 38))

In [30]:
df_new[['duration_min', 'duration_ms']]

Unnamed: 0,duration_min,duration_ms
0,3.116217,186973
1,2.430883,145853
2,4.193783,251627
3,4.934700,296082
4,2.972217,178333
...,...,...
26745,4.302283,258137
26747,3.796883,227813
26748,4.605550,276333
26749,3.053567,183214


In [31]:
df_new['loudness'].describe()

count    18554.00000
mean        -7.48425
std          3.50200
min        -34.82500
25%         -9.11300
50%         -6.81700
75%         -5.08225
max          1.35500
Name: loudness, dtype: float64

In [32]:
col_names = ['loudness']
features = X_train[col_names]
scaler = StandardScaler()
features = scaler.fit_transform(features.values)
X_train[col_names] = features



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [33]:
df_new['loudness'].describe()

count    18554.00000
mean        -7.48425
std          3.50200
min        -34.82500
25%         -9.11300
50%         -6.81700
75%         -5.08225
max          1.35500
Name: loudness, dtype: float64

In [34]:
df_new.describe()

Unnamed: 0,Popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,mode_0,mode_1,duration_min
count,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0,18554.0
mean,43.306295,0.550663,0.685337,-7.48425,0.086661,0.209958,0.120403,0.199014,0.486302,123.240495,236068.1,5.4e-05,0.005875,0.064676,0.917646,0.011749,0.109141,0.100949,0.108117,0.02738,0.086396,0.074647,0.067263,0.113884,0.061011,0.106716,0.058478,0.086019,0.383906,0.616094,3.934469
std,17.671985,0.169173,0.219051,3.502,0.090623,0.279603,0.258035,0.159602,0.237177,29.606416,84795.71,0.007341,0.076423,0.24596,0.274911,0.107759,0.311824,0.301269,0.310536,0.163191,0.280956,0.262828,0.250484,0.317679,0.239357,0.30876,0.234651,0.2804,0.486349,0.486349,1.413262
min,0.0,0.0,2e-05,-34.825,0.0,0.0,0.0,0.0119,0.0,0.0,13793.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229883
25%,32.0,0.436,0.537,-9.113,0.0356,0.0031,1e-06,0.0976,0.301,99.95225,188600.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.143333
50%,43.0,0.551,0.716,-6.817,0.0503,0.0612,0.000313,0.132,0.481,120.9935,220574.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.676233
75%,55.0,0.67,0.872,-5.08225,0.092475,0.339,0.042475,0.265,0.669,142.177,262490.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.374842
max,100.0,0.989,1.0,1.355,0.96,0.996,0.996,0.992,0.986,249.438,1561133.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,26.018883


In [35]:
df_new.shape

(18554, 39)