In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Load the four raw Spotify track exports. Later cells derive new frames from
# these *_init frames instead of modifying them.
df_lewagon_init, df_georg_init, df_mahar_init, df_tom_init = map(
    pd.read_csv,
    [
        '../data/lewagon-spotify-data.csv',
        '../data/data-georgemcintire.csv',
        '../data/data-maharshipandya.csv',
        '../data/data-tomigelo-2019-04.csv',
    ],
)

## Check the layout of each dataset

In [3]:
# Peek at the first row to see which columns the Le Wagon export provides.
df_lewagon_init.head(1)

Unnamed: 0,name,artists,popularity,danceability,valence,energy,explicit,key,liveness,loudness,speechiness,tempo
0,We're For The Dark - Remastered 2010,['Badfinger'],22,0.678,0.559,0.432,0,3,0.0727,-12.696,0.0334,117.674


In [4]:
# Peek at the first row to see which columns the Georg export provides.
df_georg_init.head(1)

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4.0,0.286,1,Mask Off,Future


In [5]:
# Peek at the first row to see which columns the Mahar export provides.
df_mahar_init.head(1)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic


In [6]:
# Peek at the first row to see which columns the Tom export provides.
df_tom_init.head(1)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


## Establishing features to include

In [7]:
# Column names that appear in every one of the four source frames.
dfs = [df_lewagon_init, df_georg_init, df_mahar_init, df_tom_init]
columns_common = set.intersection(*(set(frame.columns) for frame in dfs))

In [8]:
# Column names shared by the three sources other than Le Wagon.
columns_common_nolewagon = (
    set(df_georg_init.columns)
    & set(df_mahar_init.columns)
    & set(df_tom_init.columns)
)

In [9]:
# Columns the three other sources share but that are not common to all four,
# i.e. the ones the Le Wagon export is missing.
columns_rest = columns_common_nolewagon - columns_common
# duration_ms and time_signature are deliberately left out of the feature set.
columns_rest = columns_rest - {'duration_ms', 'time_signature'}

In [10]:
# Full feature list: columns common to all sources plus the extra ones,
# sorted so that every frame ends up with the same column order.
audio_features = sorted(columns_common | columns_rest)

In [11]:
# Show the extra features that are not common to all four sources.
list(columns_rest)

['acousticness', 'mode', 'instrumentalness']

In [12]:
# Show the sorted feature list that is selected from every source below.
audio_features

['acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'speechiness',
 'tempo',
 'valence']

## Format the dataframes to a uniform style

In [13]:
def semicolonize(s):
    """Turn a stringified list of artists into a ';'-separated string.

    Example: ``"['Ingrid Michaelson', 'ZAYN']"`` -> ``'Ingrid Michaelson;ZAYN'``.

    The text is parsed as a Python literal so that names rendered with double
    quotes (which is how ``str(list)`` shows a name containing an apostrophe)
    and names that begin or end with ``[``, ``]`` or ``'`` are kept intact.
    Text that is not a list/tuple literal of strings goes through the original
    strip-and-split heuristic.

    Args:
        s: Text such as ``"['Badfinger']"``.

    Returns:
        The artist names joined with ``';'``.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a bare artist name).
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(
            isinstance(name, str) for name in parsed):
        return ';'.join(parsed)
    # Legacy heuristic for anything that is not a clean list literal.
    return ';'.join(s.strip("[]'").split("', '"))

def listify(s):
    """Return the string form of the list obtained by splitting ``s`` on ';'.

    Example: ``'Ingrid Michaelson;ZAYN'`` -> ``"['Ingrid Michaelson', 'ZAYN']"``.
    """
    return str(s.split(';'))

### Le Wagon

In [14]:
# Work on a copy so the raw Le Wagon frame stays untouched.
df_lewagon = df_lewagon_init.copy()

In [15]:
# Add the features in columns_rest as all-NaN placeholder columns so the
# full feature list can be selected from this frame.
df_lewagon[list(columns_rest)] = np.nan

In [16]:
# Target layout shared by every source: identifying columns first, then the
# sorted audio features.
columns = ['artists', 'song_title', *audio_features]
df_lewagon = df_lewagon.rename(columns={'name': 'song_title'})[columns]

In [17]:
# Check the reformatted Le Wagon frame.
df_lewagon.head()

Unnamed: 0,artists,song_title,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,['Badfinger'],We're For The Dark - Remastered 2010,,0.678,0.432,,3,0.0727,-12.696,,0.0334,117.674,0.559
1,['Elton John'],Sixty Years On - Piano Demo,,0.456,0.368,,6,0.156,-10.692,,0.028,143.783,0.259
2,['The Guess Who'],Got to Find Another Way,,0.433,0.724,,0,0.17,-9.803,,0.0378,84.341,0.833
3,['Joe Cocker'],Feelin' Alright - Live At The Fillmore East/1970,,0.436,0.914,,5,0.855,-6.955,,0.061,174.005,0.87
4,['Van Morrison'],Caravan - Take 7,,0.669,0.412,,7,0.401,-13.095,,0.0679,78.716,0.564


### Georg

In [18]:
# Bring the Georg export into the shared layout: drop the columns that are
# not needed, rename the artist column, then select the target columns.
df_georg = (
    df_georg_init
    .drop(columns=['Unnamed: 0', 'target', 'time_signature', 'duration_ms'])
    .rename(columns={'artist': 'artists'})
    [columns]
)

In [19]:
# Check the reformatted Georg frame.
df_georg.head()

Unnamed: 0,artists,song_title,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Future,Mask Off,0.0102,0.833,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,0.286
1,Childish Gambino,Redbone,0.199,0.743,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,0.588
2,Future,Xanny Family,0.0344,0.838,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,0.173
3,Beach House,Master Of None,0.604,0.494,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,0.23
4,Junior Boys,Parallel Lines,0.18,0.678,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,0.904


### Mahar

In [20]:
#list(df_mahar_init.columns)

In [21]:
# Bring the Mahar export into the shared layout and discard rows that have a
# missing value in any of the selected columns.
df_mahar = (
    df_mahar_init
    .rename(columns={'track_name': 'song_title'})
    [columns]
    .dropna()
)

In [22]:
# Check the reformatted Mahar frame.
df_mahar.head()

Unnamed: 0,artists,song_title,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Gen Hoshino,Comedy,0.0322,0.676,0.461,1e-06,1,0.358,-6.746,0,0.143,87.917,0.715
1,Ben Woodward,Ghost - Acoustic,0.924,0.42,0.166,6e-06,1,0.101,-17.235,1,0.0763,77.489,0.267
2,Ingrid Michaelson;ZAYN,To Begin Again,0.21,0.438,0.359,0.0,0,0.117,-9.734,1,0.0557,76.332,0.12
3,Kina Grannis,Can't Help Falling In Love,0.905,0.266,0.0596,7.1e-05,0,0.132,-18.515,1,0.0363,181.74,0.143
4,Chord Overstreet,Hold On,0.469,0.618,0.443,0.0,2,0.0829,-9.681,1,0.0526,119.949,0.167


In [23]:
def count_semicolons(x):
    """Return how many ';' characters ``x`` contains."""
    return x.count(';')


# Distribution of the number of ';' separators per artists entry.
df_mahar['artists'].map(count_semicolons).value_counts()

0     83924
1     21112
2      6246
3      1706
4       512
5       221
6       118
7        51
9        24
8        24
10       12
16        9
11        8
17        6
12        5
14        4
13        4
22        3
18        2
29        2
30        1
25        1
19        1
21        1
31        1
37        1
Name: artists, dtype: int64

### Tom

In [24]:
# Bring the Tom export into the shared layout.
df_tom = (
    df_tom_init
    .rename(columns={'track_name': 'song_title', 'artist_name': 'artists'})
    [columns]
)

In [25]:
# Check the reformatted Tom frame.
df_tom.head()

Unnamed: 0,artists,song_title,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,YG,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,0.118
1,YG,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,0.371
2,R3HAB,Radio Silence,0.025,0.603,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,0.382
3,Chris Cooq,Lactose,0.0294,0.8,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,0.641
4,Chris Cooq,Same - Original mix,3.5e-05,0.783,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,0.928


## Merge dataframes

In [26]:
# Stack the three non-Le-Wagon sources, remove exact duplicate rows and
# renumber the rows from 0.
df_no_wagon = (
    pd.concat([df_georg, df_mahar, df_tom])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [27]:
# Store artists as a stringified list, the same form the Le Wagon frame uses.
# NOTE: not idempotent - running this cell a second time would apply listify
# to the already-listified strings and nest them.
df_no_wagon['artists'] = df_no_wagon['artists'].map(listify)

In [28]:
# Put the Le Wagon rows on top of the other sources, remove exact duplicate
# rows and renumber the rows from 0.
df = (
    pd.concat([df_lewagon, df_no_wagon])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [29]:
# Replace missing acousticness / instrumentalness values with each column's
# median. 'mode' is not touched here.
df[['acousticness', 'instrumentalness']] = (
    df[['acousticness', 'instrumentalness']]
    .pipe(lambda features: features.fillna(features.median()))
)

In [30]:
# Display only: the result is not assigned, so df itself keeps these columns.
df.drop(columns=['acousticness', 'instrumentalness', 'mode'])

Unnamed: 0,artists,song_title,danceability,energy,key,liveness,loudness,speechiness,tempo,valence
0,['Badfinger'],We're For The Dark - Remastered 2010,0.678,0.432,3,0.0727,-12.696,0.0334,117.674,0.5590
1,['Elton John'],Sixty Years On - Piano Demo,0.456,0.368,6,0.1560,-10.692,0.0280,143.783,0.2590
2,['The Guess Who'],Got to Find Another Way,0.433,0.724,0,0.1700,-9.803,0.0378,84.341,0.8330
3,['Joe Cocker'],Feelin' Alright - Live At The Fillmore East/1970,0.436,0.914,5,0.8550,-6.955,0.0610,174.005,0.8700
4,['Van Morrison'],Caravan - Take 7,0.669,0.412,7,0.4010,-13.095,0.0679,78.716,0.5640
...,...,...,...,...,...,...,...,...,...,...
226312,['Calum Scott'],Come Back Home,0.601,0.801,11,0.0991,-5.174,0.0323,131.049,0.2890
226313,['Saint Claire'],Enough for You,0.387,0.249,9,0.1030,-13.233,0.0437,94.039,0.3460
226314,['Mike Stud'],Do It,0.717,0.532,8,0.0997,-8.351,0.2060,156.977,0.5460
226315,['D Savage'],No Smoke,0.772,0.510,4,0.1310,-9.670,0.1200,120.049,0.0755


## Preprocessing function

In [24]:
def semicolonize(s: str) -> str:
    """Turn a stringified list of artists into a ';'-separated string.

    Example: ``"['Ingrid Michaelson', 'ZAYN']"`` -> ``'Ingrid Michaelson;ZAYN'``.

    The text is parsed as a Python literal so that names rendered with double
    quotes (which is how ``str(list)`` shows a name containing an apostrophe)
    and names that begin or end with ``[``, ``]`` or ``'`` are kept intact.
    Text that is not a list/tuple literal of strings goes through the original
    strip-and-split heuristic.

    Args:
        s: Text such as ``"['Badfinger']"``.

    Returns:
        The artist names joined with ``';'``.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a bare artist name).
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(
            isinstance(name, str) for name in parsed):
        return ';'.join(parsed)
    # Legacy heuristic for anything that is not a clean list literal.
    return ';'.join(s.strip("[]'").split("', '"))


def listify(s: str) -> str:
    """Return the string form of the list obtained by splitting ``s`` on ';'.

    Example: ``'Ingrid Michaelson;ZAYN'`` -> ``"['Ingrid Michaelson', 'ZAYN']"``.
    """
    return str(s.split(';'))


def preprocess_data() -> pd.DataFrame:
    """Merge the four source CSVs and write ``../data/all-songs.csv``.

    Keeps ``artists``, ``song_title`` and the columns that all four raw files
    have in common, drops rows with missing values and exact duplicate rows,
    and stores ``artists`` as a stringified list. Relies on the notebook's
    ``semicolonize`` and ``listify`` helpers.

    Returns:
        The merged frame that was written to disk.
    """
    df_lewagon_init = pd.read_csv('../data/lewagon-spotify-data.csv')
    df_georg_init = pd.read_csv('../data/data-georgemcintire.csv')
    df_mahar_init = pd.read_csv('../data/data-maharshipandya.csv')
    df_tom_init = pd.read_csv('../data/data-tomigelo-2019-04.csv')

    # Features are the columns shared by the raw files, computed before any
    # renaming.
    dfs_init = [df_lewagon_init, df_georg_init, df_mahar_init, df_tom_init]
    columns_common_set = set.intersection(
        *(set(frame.columns) for frame in dfs_init))
    audio_features = sorted(columns_common_set)
    columns = ['artists', 'song_title', *audio_features]

    # Le Wagon stores artists as a stringified list; convert it to the
    # ';'-separated form first so listify can treat every source alike.
    df_lewagon = (
        df_lewagon_init
        .rename(columns={'name': 'song_title'})
        .assign(artists=lambda frame: frame['artists'].map(semicolonize))
    )
    df_georg = df_georg_init.rename(columns={'artist': 'artists'})
    df_mahar = df_mahar_init.rename(columns={'track_name': 'song_title'})
    df_tom = df_tom_init.rename(
        columns={'track_name': 'song_title', 'artist_name': 'artists'})

    df = (
        pd.concat([
            df_lewagon[columns],
            df_georg[columns],
            df_mahar[columns],
            df_tom[columns],
        ])
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

    df['artists'] = df['artists'].map(listify)
    # NOTE: the row index is written as well (pandas default), which is why
    # the file has an unnamed leading column. Left unchanged so the file
    # format stays the same for existing readers.
    df.to_csv('../data/all-songs.csv')
    return df

In [25]:
# Rebuild ../data/all-songs.csv from the raw source files.
preprocess_data()

In [26]:
# Read the exported file back in. Note that this rebinds df, replacing the
# merged frame built earlier in the notebook.
df = pd.read_csv('../data/all-songs.csv')

In [27]:
# Display the reloaded table. The 'Unnamed: 0' column is the row index that
# to_csv wrote into the file.
df

Unnamed: 0.1,Unnamed: 0,artists,song_title,danceability,energy,key,liveness,loudness,speechiness,tempo,valence
0,0,['Badfinger'],We're For The Dark - Remastered 2010,0.678,0.432,3,0.0727,-12.696,0.0334,117.674,0.5590
1,1,['Elton John'],Sixty Years On - Piano Demo,0.456,0.368,6,0.1560,-10.692,0.0280,143.783,0.2590
2,2,['The Guess Who'],Got to Find Another Way,0.433,0.724,0,0.1700,-9.803,0.0378,84.341,0.8330
3,3,['Joe Cocker'],Feelin' Alright - Live At The Fillmore East/1970,0.436,0.914,5,0.8550,-6.955,0.0610,174.005,0.8700
4,4,['Van Morrison'],Caravan - Take 7,0.669,0.412,7,0.4010,-13.095,0.0679,78.716,0.5640
...,...,...,...,...,...,...,...,...,...,...,...
226017,226017,['Calum Scott'],Come Back Home,0.601,0.801,11,0.0991,-5.174,0.0323,131.049,0.2890
226018,226018,['Saint Claire'],Enough for You,0.387,0.249,9,0.1030,-13.233,0.0437,94.039,0.3460
226019,226019,['Mike Stud'],Do It,0.717,0.532,8,0.0997,-8.351,0.2060,156.977,0.5460
226020,226020,['D Savage'],No Smoke,0.772,0.510,4,0.1310,-9.670,0.1200,120.049,0.0755


In [None]:
""