### imports

In [3]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### loading the dataset that will be scaled up

In [4]:
# Load the aggregated song features stored under the 'data' key and show
# their shape and first rows.
raw_features = pd.read_hdf('../data/aggregated_song_features.h5', key='data')
print('current size of the dataset', raw_features.shape)
print(raw_features.head())

# The scaling tests do not use the 'title' column, so it is removed here.
df = raw_features.drop(columns=['title'])

current size of the dataset (10000, 4)
       bpm  loudness  year             title
0   92.198   -11.197     0  I Didn't Mean To
1  121.274    -9.843  1969         Soul Deep
2  100.070    -9.689     0   Amor De Cabaret
3  119.293    -9.013  1982   Something Girls
4  129.738    -4.501  2007    Face the Ashes


In [5]:
def randomize_loudness(df):
    """Replace every positive ``loudness`` value with a random negative integer.

    Rows whose ``loudness`` is greater than 0 receive a value drawn from
    ``np.random.randint(-40, 0)``, i.e. an integer in -40..-1 (the upper
    bound is exclusive). Rows with ``loudness <= 0`` are left unchanged.

    Args:
        df: DataFrame with a numeric ``loudness`` column.

    Returns:
        The same DataFrame object; its ``loudness`` column is updated in
        place, so ``df = randomize_loudness(df)`` keeps working.
    """
    loudness = df['loudness'].to_numpy(copy=True)
    needs_fix = loudness > 0
    # One vectorized draw instead of a Python-level lambda call per row.
    loudness[needs_fix] = np.random.randint(-40, 0, size=int(needs_fix.sum()))
    # Assigning a plain array is positional, so duplicate index labels
    # (e.g. from pd.concat) cannot cause misalignment.
    df['loudness'] = loudness
    return df

def make_bpm_less_than_300(df):
    """Replace every ``bpm`` value above 300 with a random integer below 300.

    Rows whose ``bpm`` is greater than 300 receive a value drawn from
    ``np.random.randint(100, 300)``, i.e. an integer in 100..299 (the upper
    bound is exclusive). Rows with ``bpm <= 300`` are left unchanged.

    Args:
        df: DataFrame with a numeric ``bpm`` column.

    Returns:
        The same DataFrame object; its ``bpm`` column is updated in place,
        so ``df = make_bpm_less_than_300(df)`` keeps working.
    """
    bpm = df['bpm'].to_numpy(copy=True)
    too_fast = bpm > 300
    # One vectorized draw instead of a Python-level lambda call per row.
    bpm[too_fast] = np.random.randint(100, 300, size=int(too_fast.sum()))
    # Assigning a plain array is positional, so duplicate index labels
    # (e.g. from pd.concat) cannot cause misalignment.
    df['bpm'] = bpm
    return df

def _shifted_union(base, offsets, keep_base=False):
    """Stack copies of ``base`` in which every column is shifted by an offset.

    One copy ``base + offset`` is made per entry in ``offsets``; when
    ``keep_base`` is true the unshifted ``base`` is placed first. Because the
    shift also moves ``year == 0`` to ``offset``, every ``year`` equal to one
    of the offsets is set back to 0 in the stacked result.
    """
    shifts = list(offsets)
    pieces = [base] if keep_base else []
    pieces.extend(base + shift for shift in shifts)
    stacked = pd.concat(pieces)
    stacked['year'] = stacked['year'].replace(shifts, 0)
    return stacked


# NOTE(review): the 2x and 4x frames are written without randomize_loudness,
# so any shifted loudness above 0 is kept there -- confirm this is intended.

# 2x: two shifted copies of the original frame (shifting avoids exact duplicates).
df_double = _shifted_union(df, [1, 2])
df_double.to_csv('../data/aggregated_song_features_double.csv', index=False)

# 4x: two shifted copies of the doubled frame.
df_quadruple = _shifted_union(df_double, [3, 4])
df_quadruple.to_csv('../data/aggregated_song_features_quadruple.csv', index=False)

# 10x: five shifted copies of the doubled frame.
df_10 = _shifted_union(df_double, [5, 6, 7, 8, 9])
df_10 = randomize_loudness(df_10)
df_10.to_csv('../data/aggregated_song_features_10x.csv', index=False)

# 100x: the 10x frame plus nine shifted copies of it.
df_100 = _shifted_union(df_10, range(1, 10), keep_base=True)
df_100 = randomize_loudness(df_100)
df_100 = make_bpm_less_than_300(df_100)
df_100.to_csv('../data/aggregated_song_features_100x.csv', index=False)

# 1000x: the 100x frame plus nine shifted copies of it.
df_1000 = _shifted_union(df_100, range(11, 20), keep_base=True)
df_1000 = randomize_loudness(df_1000)
df_1000 = make_bpm_less_than_300(df_1000)
df_1000.to_csv('../data/aggregated_song_features_1000x.csv', index=False)

# 2000x: two shifted copies of the 1000x frame.
df_2000 = _shifted_union(df_1000, [21, 22])
df_2000 = randomize_loudness(df_2000)
df_2000 = make_bpm_less_than_300(df_2000)
df_2000.to_csv('../data/aggregated_song_features_2000x.csv', index=False)

# 4000x: two shifted copies of the 2000x frame.
df_4000 = _shifted_union(df_2000, [23, 24])
df_4000 = randomize_loudness(df_4000)
df_4000 = make_bpm_less_than_300(df_4000)
df_4000.to_csv('../data/aggregated_song_features_4000x.csv', index=False)