In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.mlab as mlab
from matplotlib import rcParams
from IPython.display import display, Markdown, Latex
from pylab import *
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import pickle as pk

## Getting Data Ready

In [2]:
# Load Data

data = pd.read_csv('cleaned_data.csv', low_memory=False) 

In [3]:
# Defining playlists that are successful (by id)

target_playlists = ['6FfOZSAN3N6u7v81uS7mxZ', '37i9dQZF1DX5uokaTN4FTR', '37i9dQZF1DWVTKDs2aOkxut', '37i9dQZF1DX4W3aJJYCDfV']

In [4]:
# Splitting by Binary Classifier

data['Success'] = np.where(data.playlist_id.isin(target_playlists), 1, 0) 
successful = pd.DataFrame(data.groupby('artist_name').Success.sum())
successful['Success'] = successful['Success'].apply(lambda x: 1 if x>0 else 0)
print('rows of successful:',len(successful[successful.Success == 1]))
print('rows of not successful:',len(successful[successful.Success == 0]))

rows of successful: 61
rows of not successful: 600


## Get DataFrame Ready for Feature Engineering

The idea is to minimise as much feature and data leakage as possible. Hence, all rows from the main dataframe and all playlists with over 10k streams will be ignored. We have preprocessed and come up with the playlists to ignore. We will drop rows that include these playlists. 

In [5]:
# Load and copy data

# Copying, to avoid complications
data2 = data.copy()

# Loading
playlists_to_ignore = pd.read_csv('playlists_to_ignore_PCA.csv')

In [6]:
# Making necessary transformations

data2 = data2[~data2.playlist_id.isin(playlists_to_ignore.playlist_id.tolist())]

In [7]:
# Changing names to make it easier to apply to Ghea's code

df_user = data2.copy()

### Gender

In [8]:
# Creating gender table

df_user.loc[df_user.gender=="male","gender_binary"] = 1 #create new column and denote '1' if male
df_user.loc[df_user.gender=="female","gender_binary"] = 0 # denote '0' if female
df_gender = df_user.groupby(['artist_name']).gender_binary.mean() #'mean' method gives percentage of male
df_gender = DataFrame(df_gender)
df_gender = df_gender.rename(columns = {'gender_binary':'percentage of males'}) 
df_gender = df_gender.reset_index()
df_gender.head()

Unnamed: 0,artist_name,percentage of males
0,#90s Update,0.5625
1,17 Memphis,0.333333
2,2D,1.0
3,3JS,0.8
4,99 Percent,0.322074


### Other User Features

In [9]:
# Age

df_user['birth_year'] = 2022 - df_user['birth_year']
df_user.rename(columns = {'birth_year':'age'}, inplace = True)
df_user['age'].dropna(inplace = True)

#Creating age bins

age_bins_df = df_user[["artist_name", "customer_id", "age"]]
age_bins_df = age_bins_df.drop_duplicates(subset = ['customer_id'])
bins = [0, 18, 25, 40, 70]
group_names = ['children', 'teenager', 'adult', 'senior']

# create bins out of intervals
age_bins_df['age category'] = pd.cut(x=age_bins_df['age'], bins = bins, labels = group_names)

# turn into artist name level dataframe
age_bins_df = age_bins_df.set_index('artist_name') 

# find number of children listeners per artist
children = age_bins_df[age_bins_df['age category']=='children']
children_count = DataFrame(children.groupby('artist_name')['age category'].count())
children_count = children_count.rename(columns = {'age category':'number of children'})

# find number of teenager listeners per artist
teenager = age_bins_df[age_bins_df['age category']=='teenager']
teenager_count = DataFrame(teenager.groupby('artist_name')['age category'].count())
teenager_count = teenager_count.rename(columns = {'age category':'number of teenager'})

# find number of adult listeners per artist
adult = age_bins_df[age_bins_df['age category']=='adult']
adult_count = DataFrame(adult.groupby('artist_name')['age category'].count())
adult_count = adult_count.rename(columns = {'age category':'number of adult'})

# find number of senior listeners per artist
senior = age_bins_df[age_bins_df['age category']=='senior']
senior_count = DataFrame(senior.groupby('artist_name')['age category'].count())
senior_count = senior_count.rename(columns = {'age category':'number of senior'})

# merge into one dataframe
age_merge_df = pd.concat([children_count, teenager_count, adult_count, senior_count], axis = 1, sort = 'True').fillna(0)
age_merge_df.head()

# calculate as % of total listeners
age_merge_df['% of children'] = age_merge_df['number of children']/(age_merge_df['number of children'] + 
                                                                    age_merge_df['number of teenager'] + 
                                                                    age_merge_df['number of adult'] + 
                                                                    age_merge_df['number of senior'])
age_merge_df['% of teenager'] = age_merge_df['number of teenager']/(age_merge_df['number of children'] + 
                                                                    age_merge_df['number of teenager'] + 
                                                                    age_merge_df['number of adult'] + 
                                                                    age_merge_df['number of senior'])
age_merge_df['% of adult'] = age_merge_df['number of adult']/(age_merge_df['number of children'] + 
                                                              age_merge_df['number of teenager'] + 
                                                              age_merge_df['number of adult'] + 
                                                              age_merge_df['number of senior'])
age_merge_df['% of senior'] = age_merge_df['number of senior']/(age_merge_df['number of children'] + 
                                                                age_merge_df['number of teenager'] + 
                                                                age_merge_df['number of adult'] + 
                                                                age_merge_df['number of senior'])

# create a dataframe
listener_age_proportion = age_merge_df[['% of children', '% of teenager', '% of adult', '% of senior']]
listener_age_proportion.head()

Unnamed: 0_level_0,% of children,% of teenager,% of adult,% of senior
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
#90s Update,0.0,0.076923,0.692308,0.230769
17 Memphis,0.0,0.2,0.7,0.1
2D,0.0,1.0,0.0,0.0
3JS,0.0,0.0,0.5,0.5
99 Percent,0.001014,0.477688,0.366126,0.155172


In [10]:
listener_age_proportion = listener_age_proportion.reset_index()

## Artist Features

In [11]:
# Direct Artist Features

newartist = pd.read_csv('newartists2015onwards.csv', low_memory=False) 
df_artist = newartist[['DISPLAY_NAME', 'COUNTRY_OF_ORIGIN_CODE', 'MAJOR_GENRE_CODE']].copy()
df_artist.rename(columns={"DISPLAY_NAME": "artist_name"}, inplace = True)
df_artist.rename(columns={"COUNTRY_OF_ORIGIN_CODE": "country_origin",'MAJOR_GENRE_CODE':'genre'} , inplace = True)

In [12]:
# Unique listeners and streams

df_1 = data2[['artist_name', 'customer_id']].copy()
df_1 = df_1.drop_duplicates()
df_1a = df_1.groupby(by=["artist_name"]).count()
df_2 = data2[['artist_name']].copy()
df_2a = df_2.groupby(['artist_name']).size().reset_index(name='counts')
df_3 = pd.merge(df_1a, df_2a, how = 'outer', left_on = ['artist_name'], right_on = ['artist_name'])
df_3 = df_3.rename(columns={"customer_id":"unique_listener", "counts": "listened_count"})

# Artist passion score
df_3['artist_passion_score'] = (df_3['listened_count']/df_3['unique_listener'])

df_3.head()

Unnamed: 0,artist_name,unique_listener,listened_count,artist_passion_score
0,#90s Update,15,16,1.066667
1,17 Memphis,12,12,1.0
2,2D,1,1,1.0
3,3JS,4,5,1.25
4,99 Percent,1189,1291,1.085786


## Playlist Features:

In [13]:
# Getting features ready

df_playlist = data2.copy()
playlist_df = df_playlist[['playlist_id', 'artist_name', 'customer_id']]
playlist_df_1 = DataFrame(playlist_df.groupby('artist_name').playlist_id.value_counts())

# count prior playlist stream count
playlist_df_1 = playlist_df_1.rename(columns = {'playlist_id':'Prior Playlist Stream Counts'})

# count prior playlist unique user
playlist_df_2 = DataFrame(playlist_df.groupby('playlist_id').customer_id.nunique())

#rename column
playlist_df_2 = playlist_df_2.rename(columns = {'customer_id':'number of unique streamers'})

In [14]:
# Getting passion score for playlists

# merge user and stream count dataframe
playlist_df_passion = pd.merge(playlist_df_1, playlist_df_2, right_index = True, left_index = True)

# create new column for playlist passion score
playlist_df_passion['Playlist Passion Score'] = (playlist_df_passion['Prior Playlist Stream Counts']/
                                                 playlist_df_passion['number of unique streamers'])

playlist_df_passion.sort_values(by = 'Playlist Passion Score',ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Prior Playlist Stream Counts,number of unique streamers,Playlist Passion Score
artist_name,playlist_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kehlani,4OTLViTKWjpaOdqRI5V36g,9,1,9.0
gnash,0rMCs9w1IqYbHZ9s9uk34t,25,3,8.333333
Lil Uzi Vert,6gyAHpLmSy0ZgPNxKopw1h,7,1,7.0
A Boogie Wit da Hoodie,6FItHoZkAegELfKw5LFztU,5,1,5.0
Lil Uzi Vert,5dExtyCpYBZhJM0GKU52RK,13,3,4.333333
Kiiara,1IqznySq6bND671tZyFi33,4,1,4.0
M City JR,6rViSD8FIDmeLpeHiY1Nfo,4,1,4.0
Zion & Lennox,3eOvoHXZWJCXqtR8KC6Zhu,4,1,4.0
Lil Uzi Vert,133Km3RMwdxnVhuenreYUD,4,1,4.0
Lil Uzi Vert,4P98v77PtNgzIqaGcKDxid,4,1,4.0


In [15]:
# since the score is on a playlist level, we need to average the score to find the mean for each artist

# quantified playlist features dataframe
playlist_df_final = playlist_df_passion.groupby('artist_name').agg({'Playlist Passion Score':np.mean}).reset_index()
playlist_df_final.sort_values(by = 'Playlist Passion Score',ascending=False).head()

Unnamed: 0,artist_name,Playlist Passion Score
233,Lele Marchitelli,1.689716
154,Havana Maestros,1.5
409,Teresa Cristina,1.5
323,PRO8L3M,1.166667
456,WEDNESDAY CAMPANELLA,1.15


**Comment:** Merge on Artist and fill na with 0

In [16]:
main_df = df_gender.merge(listener_age_proportion, 
                          # First merge
                          on='artist_name', 
                          how='outer').merge(df_3, 
                                             # Second Merge
                                             on='artist_name', 
                                             how='outer').merge(playlist_df_final, 
                                                                on='artist_name', 
                                                                how='outer').merge(successful, 
                                                                                   on='artist_name',
                                                                                   how='outer').fillna(0)
main_df.head()

Unnamed: 0,artist_name,percentage of males,% of children,% of teenager,% of adult,% of senior,unique_listener,listened_count,artist_passion_score,Playlist Passion Score,Success
0,#90s Update,0.5625,0.0,0.076923,0.692308,0.230769,15,16,1.066667,0.051013,0
1,17 Memphis,0.333333,0.0,0.2,0.7,0.1,12,12,1.0,0.081551,0
2,2D,1.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,0
3,3JS,0.8,0.0,0.0,0.5,0.5,4,5,1.25,1.0,0
4,99 Percent,0.322074,0.001014,0.477688,0.366126,0.155172,1189,1291,1.085786,0.479537,0


In [18]:
# STOPPER

STOPPER

## PCA Primary Steps:
1. Get the basic stuff that do not need PCA. 
2. Split the dataframe into train and test set based on artists and success_status
3. Split train into another train and 'validation' set --> T: 60%, V: 20%, t: 20%
4. Run PCA on region, stream, audio features and lyrics on train ONLY after reorganising along artist

In [43]:
# Creating Train_Test Split

# Setting up X and y
X = main_df.copy().drop('Success', axis=1)
y = main_df.copy()['Success']

# Split 1
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.72, random_state=101, stratify=y) # Test size = 30% of overall dataset

# Split 2 - Creating Train_Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.60, random_state=101) # Validation size = 20% of overall dataset

### Sizes

In [44]:
len(y_train)

285

In [45]:
len(y_val)

190

In [46]:
len(y_test)

186

### Number of Successes

In [50]:
y_train.sum()

26

In [51]:
y_val.sum()

18

In [52]:
y_test.sum()

17

In [53]:
# Saving

pd.concat([X_train, y_train], axis=1).to_csv('./Train_Validation_Test/train.csv', index=False)
pd.concat([X_val, y_val], axis=1).to_csv('./Train_Validation_Test/val.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('./Train_Validation_Test/test.csv', index=False)

In [54]:
# Load

train = pd.read_csv('./Train_Validation_Test/train.csv')
val = pd.read_csv('./Train_Validation_Test/val.csv')
test = pd.read_csv('./Train_Validation_Test/test.csv')

**Comment:** We do not touch y

### Ghea's PCA Columns

Artist region

In [55]:
# Getting regions ready for PCA

by_artist_region = df_user.groupby(['artist_name', 'region_code'])
region_pca_df = by_artist_region.size().unstack().fillna(0).reset_index()

In [130]:
region_pca_df.to_csv('./Interpretability_PCA/region_pca_with_columns_intact.csv', index=False)

Playlist streams

In [56]:
# Getting streams by playlist ready for PCA

by_artist_playlist = df_playlist.groupby(['artist_name', 'playlist_id'])
playlist_pca_df = by_artist_playlist.size().unstack().fillna(0).reset_index()

In [128]:
# playlist_pca_df.to_csv('./Interpretability_PCA/pca_with_columns_intact.csv', index=False)

### Sharaf's PCA Columns

Audio Features

In [58]:
# Don't Run Cell Below
STOPPER

In [59]:
# Getting audio features ready for PCA

# Loading and processing data
audio_features = pd.read_csv('audio_features.csv')
audio_features.drop('Unnamed: 0', axis=1, inplace=True)

# Perhaps we don't need some columns (probably unveiled in EDA section)
audio_features.drop(['duration_ms', 'key'], axis=1, inplace=True)

# Getting joined table ready
audio_joined = data2[['track_uri', 'playlist_id', 'artist_name', 'playlist_name']].copy().merge(audio_features, how='left', on='track_uri')
audio_joined.drop_duplicates(inplace=True)

# Get pivot tables ready

danceability_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='danceability', aggfunc=np.mean).fillna(0)

energy_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='energy', aggfunc=np.mean).fillna(0)

loudness_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='loudness', aggfunc=np.mean).fillna(0)

mode_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='mode', aggfunc=np.mean).fillna(0)

speechiness_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='speechiness', aggfunc=np.mean).fillna(0)

acousticness_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='acousticness', aggfunc=np.mean).fillna(0)

instrumentalness_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='instrumentalness', aggfunc=np.mean).fillna(0)

liveness_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='liveness', aggfunc=np.mean).fillna(0)

valence_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='valence', aggfunc=np.mean).fillna(0)

tempo_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='tempo', aggfunc=np.mean).fillna(0)

time_signature_pivot = pd.pivot_table(audio_joined, index='artist_name', 
                                    columns='playlist_id', values='time_signature', aggfunc=np.median).fillna(0)

# Full join on all pivot tables
final_audio_pivot = pd.concat([danceability_pivot, energy_pivot, loudness_pivot, mode_pivot, 
                               speechiness_pivot, acousticness_pivot, instrumentalness_pivot, 
                               liveness_pivot, valence_pivot, tempo_pivot, time_signature_pivot], axis=1).reset_index()

In [138]:
# loudness_pivot.to_csv('./Interpretability_PCA/loudness_PCA_interpret.csv')
# speechiness_pivot.to_csv('./Interpretability_PCA/speechiness_PCA_interpret.csv')

In [60]:
# Store and load

# Store in a format favouring more rows and less columns
final_audio_pivot.T.to_csv('audio_pca_ready.csv', index=False)

# Load from here 
final_audio_pivot = pd.read_csv('audio_pca_ready.csv', low_memory=False).T
final_audio_pivot.rename(columns={0:'artist_name'}, inplace=True)

In [136]:
final_audio_pivot

Unnamed: 0,artist_name,1,2,3,4,5,6,7,8,9,...,101334,101335,101336,101337,101338,101339,101340,101341,101342,101343
0,#90s Update,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17 Memphis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3JS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,99 Percent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A Boogie Wit Da Hoodie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,birthday,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
478,dvsn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
479,flor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
480,gnash,0.0,0.492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


#### Forgot to save order of songs - need them back

In [61]:
### Intermediate steps for next PCA

# Loading csv
originals_lyrics = pd.read_csv('song_lyrics_func1.csv')
remixes_lyrics = pd.read_csv('song_lyrics_func2.csv')
lyrics_fixed = pd.read_csv('lyrics_sharaf_fix2.txt')

# Making necessary changes
originals_lyrics.drop('Unnamed: 0', axis=1, inplace=True) # Forgot to remove index before saving as csv
remixes_lyrics.drop('Unnamed: 0', axis=1, inplace=True) # Forgot to remove index before saving as csv

In [62]:
### Intermediate steps for next PCA

final_lyrics = pd.concat([lyrics_fixed.copy(), 
                          originals_lyrics.copy(), 
                          remixes_lyrics.copy()]).drop_duplicates(subset=['track_name', 'artist_name'])

# Drop the ones with fail
final_lyrics = final_lyrics[final_lyrics.lyrics != 'Fail'][final_lyrics.columns[:2]]

In [63]:
# load data for lyrics

embedded_lyrics_df1 = pd.read_csv('3639_lyrics_embeddings.csv')
embedded_lyrics_df1.drop('Unnamed: 0', axis=1, inplace=True)

In [64]:
# Merge embeddings with data2

# Getting df ready
merge_with_embeddings_df = data2[['playlist_id', 'playlist_name', 'track_id', 'track_name', 'artist_name']].drop_duplicates().copy()

# Concatenating with columns to join upon
embedded_lyrics_df2 = pd.concat([final_lyrics.copy().reset_index(drop=True), 
                                 embedded_lyrics_df1.copy().reset_index(drop=True)], axis=1)

# Merging
merged_embeddings = merge_with_embeddings_df.merge(embedded_lyrics_df2, how='left', on=['track_name', 'artist_name'])

Lyrics Embeddings

In [65]:
# Aggregate by each vector by their playlists

pivots_embeddings = []

for column in merged_embeddings.columns[5:]:
    embeddings_playlist_pivot = pd.pivot_table(merged_embeddings, 
                                           index='artist_name', 
                                           columns='playlist_id', 
                                           aggfunc = np.mean,
                                           values=column)
    pivots_embeddings.append(embeddings_playlist_pivot)

pca_ready_embeddings = pd.concat(pivots_embeddings, axis=1).reset_index()

In [66]:
pca_ready_embeddings.shape

(447, 4687361)

### Join to Train and Run PCA

In [67]:
# Defining PCA

pca_region = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=5))])
pca_streams = Pipeline(steps=[('Standardize', StandardScaler()), ('pca',PCA(n_components=10))])
pca_audio_features = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=9))])
pca_lyrics_embeddings = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=9))])

#### Region

In [68]:
# Reorganising

train_region_pca = train['artist_name'].to_frame().merge(region_pca_df, how='left', on='artist_name').fillna(0)
train_region_pca.drop('artist_name', axis=1, inplace=True)

In [69]:
# doing pca on train - region

pca_region.fit(train_region_pca)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=5))])

In [70]:
# Save pkl

pk.dump(pca_region, open("./PCA Models/pca_region.pkl","wb"))

In [71]:
# Load PCA

pca_region = pk.load(open("./PCA Models/pca_region.pkl",'rb'))

In [72]:
# Transform Train set, validation set and test set

# Train
train_pca_region_complete = pd.DataFrame(pca_region.transform(train_region_pca))
train_pca_region_complete.columns = ['region_PCA1', 'region_PCA2', 'region_PCA3', 'region_PCA4', 'region_PCA5']

# Validation
val_region_pca = val['artist_name'].to_frame().merge(region_pca_df, how='left', on='artist_name').fillna(0)
val_region_pca.drop('artist_name', axis=1, inplace=True)
val_pca_region_complete = pd.DataFrame(pca_region.transform(val_region_pca))
val_pca_region_complete.columns = ['region_PCA1', 'region_PCA2', 'region_PCA3', 'region_PCA4', 'region_PCA5']

In [73]:
val_pca_region_complete

Unnamed: 0,region_PCA1,region_PCA2,region_PCA3,region_PCA4,region_PCA5
0,-2.354526,-0.289196,-0.253909,-0.225465,-0.108248
1,-2.216886,-0.343484,-0.129161,-0.227332,-0.157884
2,-2.295512,0.180626,1.793094,-0.634150,-0.569554
3,47.428510,4.333547,-6.031091,4.132601,-1.476903
4,-2.376563,-0.294215,-0.248373,-0.223470,-0.107050
...,...,...,...,...,...
185,0.087565,-0.027254,-0.577212,-0.608812,-0.146235
186,-0.906757,0.760617,0.398614,-0.386933,-0.412069
187,-2.377355,-0.294355,-0.248246,-0.223309,-0.107041
188,-2.376587,-0.294270,-0.248427,-0.223462,-0.107072


#### Playlist Streams

In [74]:
# Reorganising

train_playlist_pca = train['artist_name'].to_frame().merge(playlist_pca_df, how='left', on='artist_name').fillna(0)
train_playlist_pca.drop('artist_name', axis=1, inplace=True)

In [75]:
# Fit on Train ONLY

pca_streams.fit(train_playlist_pca)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=10))])

In [76]:
# Save pkl

pk.dump(pca_streams, open("./PCA Models/pca_streams.pkl","wb"))

In [77]:
# Load PCA

pca_streams = pk.load(open("./PCA Models/pca_streams.pkl",'rb'))

In [78]:
# Transform Train set, validation set and test set

# Train
train_pca_playlist_complete = pd.DataFrame(pca_streams.transform(train_playlist_pca))
train_pca_playlist_complete.columns = ['p_PCA1', 'p_PCA2', 'p_PCA3', 'p_PCA4', 'p_PCA5', 'p_PCA6', 'p_PCA7', 'p_PCA8', 'p_PCA9', 'p_PCA10']

# Validation
val_playlist_pca = val['artist_name'].to_frame().merge(playlist_pca_df, how='left', on='artist_name').fillna(0)
val_playlist_pca.drop('artist_name', axis=1, inplace=True)
val_pca_playlist_complete = pd.DataFrame(pca_streams.transform(val_playlist_pca))
val_pca_playlist_complete.columns = ['p_PCA1', 'p_PCA2', 'p_PCA3', 'p_PCA4', 'p_PCA5', 'p_PCA6', 'p_PCA7', 'p_PCA8', 'p_PCA9', 'p_PCA10']

In [79]:
train_pca_playlist_complete

Unnamed: 0,p_PCA1,p_PCA2,p_PCA3,p_PCA4,p_PCA5,p_PCA6,p_PCA7,p_PCA8,p_PCA9,p_PCA10
0,-2.764349,-1.702704,-0.940343,-1.152846,-1.172591,-0.475917,-0.876800,-0.824872,-0.358337,-1.015720
1,-2.651884,-1.447655,-0.931870,-1.148070,-1.136775,-0.476731,-0.831714,-0.798999,-0.349437,-0.960477
2,-2.748548,-1.402253,-0.932054,-1.159239,-1.034363,0.037001,-0.862613,-0.837638,-0.273763,-1.042812
3,-2.754508,-1.681383,-0.923447,-1.131089,-1.127555,-0.455384,-0.832622,-0.781764,-0.339136,-0.960760
4,-2.756470,-1.685604,-0.926777,-1.135374,-1.136284,-0.459347,-0.841097,-0.790021,-0.342809,-0.971271
...,...,...,...,...,...,...,...,...,...,...
280,-2.754508,-1.681383,-0.923447,-1.131089,-1.127555,-0.455384,-0.832622,-0.781764,-0.339136,-0.960760
281,-2.754508,-1.681383,-0.923447,-1.131089,-1.127555,-0.455384,-0.832622,-0.781764,-0.339136,-0.960760
282,-2.754508,-1.681383,-0.923447,-1.131089,-1.127555,-0.455384,-0.832622,-0.781764,-0.339136,-0.960760
283,-2.743191,-1.282551,-0.932069,-1.169817,-1.166664,-0.451605,-0.854666,-0.764651,-0.448990,-0.903360


#### Audio Features

In [80]:
# Reorganising

train_audio_pca = train['artist_name'].to_frame().merge(final_audio_pivot, how='left', on='artist_name').fillna(0)
train_audio_pca.drop('artist_name', axis=1, inplace=True)

In [81]:
# Fit on Train ONLY

pca_audio_features.fit(train_audio_pca)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=9))])

In [82]:
# Save pkl

pk.dump(pca_audio_features, open("./PCA Models/pca_audio_features.pkl","wb"))

In [83]:
# Load PCA

pca_audio_features = pk.load(open("./PCA Models/pca_audio_features.pkl",'rb'))

In [84]:
# Transform Train set, validation set and test set

# Train
train_pca_audio_complete = pd.DataFrame(pca_audio_features.transform(train_audio_pca))
train_pca_audio_complete.columns = ['audio_PCA1', 'audio_PCA2', 'audio_PCA3', 'audio_PCA4', 'audio_PCA5', 
                                    'audio_PCA6', 'audio_PCA7', 'audio_PCA8', 'audio_PCA9']

# Validation
val_audio_pca = val['artist_name'].to_frame().merge(final_audio_pivot, how='left', on='artist_name').fillna(0)
val_audio_pca.drop('artist_name', axis=1, inplace=True)
val_pca_audio_complete = pd.DataFrame(pca_audio_features.transform(val_audio_pca))
val_pca_audio_complete.columns = ['audio_PCA1', 'audio_PCA2', 'audio_PCA3', 'audio_PCA4', 'audio_PCA5', 
                                    'audio_PCA6', 'audio_PCA7', 'audio_PCA8', 'audio_PCA9']

In [85]:
train_pca_audio_complete

Unnamed: 0,audio_PCA1,audio_PCA2,audio_PCA3,audio_PCA4,audio_PCA5,audio_PCA6,audio_PCA7,audio_PCA8,audio_PCA9
0,-8.775788,-5.968723,-3.148232,-3.556986,-3.585516,-3.215315,-2.015804,-1.867768,-3.832670
1,-7.990299,-5.250854,-3.204149,-3.325179,-2.908342,-3.095504,-1.454699,-1.785189,-2.553329
2,-8.727091,-5.179364,-3.137997,-3.615787,-3.554577,-2.224791,-2.590184,-1.673297,-3.737562
3,-8.741306,-5.892705,-3.094670,-3.489016,-3.467995,-3.077913,-1.924089,-1.776534,-3.641231
4,-8.748046,-5.907458,-3.105028,-3.502138,-3.490427,-3.103921,-1.941406,-1.793710,-3.677248
...,...,...,...,...,...,...,...,...,...
280,-8.741306,-5.892705,-3.094670,-3.489016,-3.467995,-3.077913,-1.924089,-1.776534,-3.641231
281,-8.741306,-5.892705,-3.094670,-3.489016,-3.467995,-3.077913,-1.924089,-1.776534,-3.641231
282,-8.741306,-5.892705,-3.094670,-3.489016,-3.467995,-3.077913,-1.924089,-1.776534,-3.641231
283,-8.708476,-4.823340,-3.150036,-3.645411,-3.393158,-2.452917,-0.733618,-2.378785,-3.874453


#### Lyrics

In [86]:
# Reorganising

train_lyrics_pca = train['artist_name'].to_frame().merge(pca_ready_embeddings, how='left', on='artist_name').fillna(0)
train_lyrics_pca.drop('artist_name', axis=1, inplace=True)

In [87]:
# Fit on Train ONLY

pca_lyrics_embeddings.fit(train_lyrics_pca)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=9))])

In [88]:
# Save pkl

pk.dump(pca_lyrics_embeddings, open("./PCA Models/pca_lyrics_embeddings.pkl","wb"))

In [89]:
# Load PCA

pca_lyrics_embeddings = pk.load(open("./PCA Models/pca_lyrics_embeddings.pkl",'rb'))

In [90]:
# Transform Train set, validation set and test set

# Train
train_pca_lyrics_complete = pd.DataFrame(pca_lyrics_embeddings.transform(train_lyrics_pca))
train_pca_lyrics_complete.columns = ['lyrics_PCA1', 'lyrics_PCA2', 'lyrics_PCA3', 'lyrics_PCA4', 'lyrics_PCA5', 
                                    'lyrics_PCA6', 'lyrics_PCA7', 'lyrics_PCA8', 'lyrics_PCA9']

# Validation
val_lyrics_pca = val['artist_name'].to_frame().merge(pca_ready_embeddings, how='left', on='artist_name').fillna(0)
val_lyrics_pca.drop('artist_name', axis=1, inplace=True)
val_pca_lyrics_complete = pd.DataFrame(pca_lyrics_embeddings.transform(val_lyrics_pca))
val_pca_lyrics_complete.columns = ['lyrics_PCA1', 'lyrics_PCA2', 'lyrics_PCA3', 'lyrics_PCA4', 'lyrics_PCA5', 
                                    'lyrics_PCA6', 'lyrics_PCA7', 'lyrics_PCA8', 'lyrics_PCA9']

In [91]:
train_pca_lyrics_complete

Unnamed: 0,lyrics_PCA1,lyrics_PCA2,lyrics_PCA3,lyrics_PCA4,lyrics_PCA5,lyrics_PCA6,lyrics_PCA7,lyrics_PCA8,lyrics_PCA9
0,-54.034640,-31.637188,-20.828723,-24.154824,-20.020538,-17.586215,-18.241849,-4.935419,-18.738697
1,-51.963156,-29.804337,-20.777676,-23.522865,-17.632991,-17.206006,-16.501635,-5.230594,-16.303966
2,-53.908359,-30.083179,-20.630138,-23.936108,-19.638522,-14.468588,-18.289218,-4.604854,-18.134865
3,-53.829674,-31.239296,-20.447139,-23.687225,-19.321799,-16.804180,-17.366623,-4.691054,-17.810122
4,-53.870543,-31.318072,-20.522333,-23.779291,-19.457618,-16.954967,-17.534878,-4.737962,-17.988400
...,...,...,...,...,...,...,...,...,...
280,-53.829674,-31.239296,-20.447139,-23.687225,-19.321799,-16.804180,-17.366623,-4.691054,-17.810122
281,-53.829674,-31.239296,-20.447139,-23.687225,-19.321799,-16.804180,-17.366623,-4.691054,-17.810122
282,-53.829674,-31.239296,-20.447139,-23.687225,-19.321799,-16.804180,-17.366623,-4.691054,-17.810122
283,-53.884425,-29.119326,-20.627203,-23.929586,-19.450971,-16.337684,-13.315314,-6.366103,-18.609833


## Save All the PCAs by their Formats - ORDER IS IMPORTANT

In [92]:
# All the trains - DO NOT TOUCH

train_pca_region_complete.to_csv('./Train_Validation_Test/train_pca_region_complete.csv', index=False)
train_pca_playlist_complete.to_csv('./Train_Validation_Test/train_pca_playlist_complete.csv', index=False)
train_pca_audio_complete.to_csv('./Train_Validation_Test/train_pca_audio_complete.csv', index=False)
train_pca_lyrics_complete.to_csv('./Train_Validation_Test/train_pca_lyrics_complete.csv', index=False)

In [93]:
# All the val - DO NOT TOUCH

val_pca_region_complete.to_csv('./Train_Validation_Test/val_pca_region_complete.csv', index=False)
val_pca_playlist_complete.to_csv('./Train_Validation_Test/val_pca_playlist_complete.csv', index=False)
val_pca_audio_complete.to_csv('./Train_Validation_Test/val_pca_audio_complete.csv', index=False)
val_pca_lyrics_complete.to_csv('./Train_Validation_Test/val_pca_lyrics_complete.csv', index=False)

## Load All the Completed PCA Sets

In [94]:
# Loading

# Train
train = pd.read_csv('./Train_Validation_Test/train.csv')
train_pca_region_complete = pd.read_csv('./Train_Validation_Test/train_pca_region_complete.csv')
train_pca_playlist_complete = pd.read_csv('./Train_Validation_Test/train_pca_playlist_complete.csv')
train_pca_audio_complete = pd.read_csv('./Train_Validation_Test/train_pca_audio_complete.csv')
train_pca_lyrics_complete = pd.read_csv('./Train_Validation_Test/train_pca_lyrics_complete.csv')

# Val
val = pd.read_csv('./Train_Validation_Test/val.csv')
val_pca_region_complete = pd.read_csv('./Train_Validation_Test/val_pca_region_complete.csv')
val_pca_playlist_complete = pd.read_csv('./Train_Validation_Test/val_pca_playlist_complete.csv')
val_pca_audio_complete = pd.read_csv('./Train_Validation_Test/val_pca_audio_complete.csv')
val_pca_lyrics_complete = pd.read_csv('./Train_Validation_Test/val_pca_lyrics_complete.csv')

In [95]:
# Assembling the train and validation sets

train_final = pd.concat([train, train_pca_region_complete, train_pca_playlist_complete, 
                         train_pca_audio_complete, train_pca_lyrics_complete], axis=1)
val_final = pd.concat([val, val_pca_region_complete, val_pca_playlist_complete, 
                       val_pca_audio_complete, val_pca_lyrics_complete], axis=1)

In [96]:
train_final.head()

Unnamed: 0,artist_name,percentage of males,% of children,% of teenager,% of adult,% of senior,unique_listener,listened_count,artist_passion_score,Playlist Passion Score,...,audio_PCA9,lyrics_PCA1,lyrics_PCA2,lyrics_PCA3,lyrics_PCA4,lyrics_PCA5,lyrics_PCA6,lyrics_PCA7,lyrics_PCA8,lyrics_PCA9
0,Full,0.431373,0.0,0.029412,0.794118,0.176471,38,51,1.342105,0.8,...,-3.83267,-54.03464,-31.637188,-20.828723,-24.154824,-20.020538,-17.586215,-18.241849,-4.935419,-18.738697
1,Darline,0.501326,0.0,0.151613,0.629032,0.219355,368,380,1.032609,0.039605,...,-2.553329,-51.963156,-29.804337,-20.777676,-23.522865,-17.632991,-17.206006,-16.501635,-5.230594,-16.303966
2,Sweem,0.410256,0.0,0.029412,0.911765,0.058824,38,39,1.026316,0.185621,...,-3.737562,-53.908359,-30.083179,-20.630138,-23.936108,-19.638522,-14.468588,-18.289218,-4.604854,-18.134865
3,Hovey Benjamin,0.916667,0.0,0.125,0.875,0.0,11,12,1.090909,0.0,...,-3.641231,-53.829674,-31.239296,-20.447139,-23.687225,-19.321799,-16.80418,-17.366623,-4.691054,-17.810122
4,Thomas,1.0,0.0,0.0,1.0,0.0,1,1,1.0,0.5,...,-3.677248,-53.870543,-31.318072,-20.522333,-23.779291,-19.457618,-16.954967,-17.534878,-4.737962,-17.9884


In [97]:
train_final.to_csv('./Train_Validation_Test/train_final.csv', index=False)
val_final.to_csv('./Train_Validation_Test/val_final.csv', index=False)

# True Train and Test

We cannot simply concatenate the current train and validation set and train on that before creating and evaluating on the test set. The necessary steps are: 
1. The train and validation set prior to all PCAs must be concatenated first to form the true train set
2. All the Region, Playlist Streams, Audio Features, and Lyrics embeddings will be joined along the artist name for the true train test
3. All the Region, Playlist Streams, Audio Features, and Lyrics embeddings will also be joined along the artist name for the test set
3. PCAs will be fitted on the necessary columns that belong to the true train test. 
4. Transformations will be done to the tables, and properly joined to the true train set
5. Transformations will be applied to the tables belonging to the test set, and full joins will be made
6. We now have the final train and test set (no cross validation)

In [98]:
# Assembling Skeleton of True Train (Without Components from PCAs)

train = pd.read_csv('./Train_Validation_Test/train.csv')
val = pd.read_csv('./Train_Validation_Test/val.csv')
true_train = pd.concat([train, val], axis=0)

In [99]:
# Further transformations in index

true_train.reset_index(drop=True, inplace=True)

In [100]:
# Redefining PCA models

pca_region2 = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=5))])
pca_streams2 = Pipeline(steps=[('Standardize', StandardScaler()), ('pca',PCA(n_components=10))])
pca_audio_features2 = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=9))])
pca_lyrics_embeddings2 = Pipeline(steps=[('Standardize', StandardScaler()), ('pca', PCA(n_components=9))])

Region

In [101]:
# Reorganising

train_region_pca2 = true_train['artist_name'].to_frame().merge(region_pca_df, how='left', on='artist_name').fillna(0)
train_region_pca2.drop('artist_name', axis=1, inplace=True)

In [102]:
# doing pca on train - region

pca_region2.fit(train_region_pca)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=5))])

In [103]:
# Save pkl

pk.dump(pca_region2, open("./PCA Models/pca_region2.pkl","wb"))

In [104]:
# Load PCA

pca_region2 = pk.load(open("./PCA Models/pca_region2.pkl",'rb'))

In [105]:
# Transform Train set, test set and test set

# Train
train_pca_region_complete2 = pd.DataFrame(pca_region2.transform(train_region_pca2))
train_pca_region_complete2.columns = ['region_PCA1', 'region_PCA2', 'region_PCA3', 'region_PCA4', 'region_PCA5']

# Test
test_region_pca = test['artist_name'].to_frame().merge(region_pca_df, how='left', on='artist_name').fillna(0)
test_region_pca.drop('artist_name', axis=1, inplace=True)
test_pca_region_complete = pd.DataFrame(pca_region2.transform(test_region_pca))
test_pca_region_complete.columns = ['region_PCA1', 'region_PCA2', 'region_PCA3', 'region_PCA4', 'region_PCA5']

Playlist Streams

In [106]:
# Reorganising

train_playlist_pca2 = true_train['artist_name'].to_frame().merge(playlist_pca_df, how='left', on='artist_name').fillna(0)
train_playlist_pca2.drop('artist_name', axis=1, inplace=True)

In [107]:
# Fit on Train ONLY

pca_streams2.fit(train_playlist_pca2)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=10))])

In [108]:
# Save pkl

pk.dump(pca_streams2, open("./PCA Models/pca_streams2.pkl","wb"))

In [109]:
# Load PCA

pca_streams2 = pk.load(open("./PCA Models/pca_streams2.pkl",'rb'))

In [110]:
# Transform Train set, test set and test set

# Train
train_pca_playlist_complete2 = pd.DataFrame(pca_streams2.transform(train_playlist_pca2))
train_pca_playlist_complete2.columns = ['p_PCA1', 'p_PCA2', 'p_PCA3', 'p_PCA4', 'p_PCA5', 'p_PCA6', 'p_PCA7', 'p_PCA8', 'p_PCA9', 'p_PCA10']

# test
test_playlist_pca = test['artist_name'].to_frame().merge(playlist_pca_df, how='left', on='artist_name').fillna(0)
test_playlist_pca.drop('artist_name', axis=1, inplace=True)
test_pca_playlist_complete = pd.DataFrame(pca_streams2.transform(test_playlist_pca))
test_pca_playlist_complete.columns = ['p_PCA1', 'p_PCA2', 'p_PCA3', 'p_PCA4', 'p_PCA5', 'p_PCA6', 'p_PCA7', 'p_PCA8', 'p_PCA9', 'p_PCA10']

Audio Features

In [111]:
# Reorganising

train_audio_pca2 = true_train['artist_name'].to_frame().merge(final_audio_pivot, how='left', on='artist_name').fillna(0)
train_audio_pca2.drop('artist_name', axis=1, inplace=True)

In [112]:
# Fit on Train ONLY

pca_audio_features2.fit(train_audio_pca2)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=9))])

In [113]:
# Save pkl

pk.dump(pca_audio_features2, open("./PCA Models/pca_audio_features2.pkl","wb"))

In [114]:
# Load PCA

pca_audio_features2 = pk.load(open("./PCA Models/pca_audio_features2.pkl",'rb'))

In [115]:
# Transform Train set, test set and test set

# Train
train_pca_audio_complete2 = pd.DataFrame(pca_audio_features2.transform(train_audio_pca2))
train_pca_audio_complete2.columns = ['audio_PCA1', 'audio_PCA2', 'audio_PCA3', 'audio_PCA4', 'audio_PCA5', 
                                    'audio_PCA6', 'audio_PCA7', 'audio_PCA8', 'audio_PCA9']

# Test
test_audio_pca = test['artist_name'].to_frame().merge(final_audio_pivot, how='left', on='artist_name').fillna(0)
test_audio_pca.drop('artist_name', axis=1, inplace=True)
test_pca_audio_complete = pd.DataFrame(pca_audio_features2.transform(test_audio_pca))
test_pca_audio_complete.columns = ['audio_PCA1', 'audio_PCA2', 'audio_PCA3', 'audio_PCA4', 'audio_PCA5', 
                                    'audio_PCA6', 'audio_PCA7', 'audio_PCA8', 'audio_PCA9']

Lyrics Embedding

In [116]:
# Reorganising

train_lyrics_pca2 = true_train['artist_name'].to_frame().merge(pca_ready_embeddings, how='left', on='artist_name').fillna(0)
train_lyrics_pca2.drop('artist_name', axis=1, inplace=True)

In [117]:
# Fit on Train ONLY

pca_lyrics_embeddings2.fit(train_lyrics_pca2)

Pipeline(steps=[('Standardize', StandardScaler()),
                ('pca', PCA(n_components=9))])

In [118]:
# Save pkl

pk.dump(pca_lyrics_embeddings2, open("./PCA Models/pca_lyrics_embeddings2.pkl","wb"))

In [119]:
# Load PCA

pca_lyrics_embeddings2 = pk.load(open("./PCA Models/pca_lyrics_embeddings2.pkl",'rb'))

In [120]:
# Transform Train set, validation set and test set

# Train
train_pca_lyrics_complete2 = pd.DataFrame(pca_lyrics_embeddings2.transform(train_lyrics_pca2))
train_pca_lyrics_complete2.columns = ['lyrics_PCA1', 'lyrics_PCA2', 'lyrics_PCA3', 'lyrics_PCA4', 'lyrics_PCA5', 
                                    'lyrics_PCA6', 'lyrics_PCA7', 'lyrics_PCA8', 'lyrics_PCA9']

# Test
test_lyrics_pca = test['artist_name'].to_frame().merge(pca_ready_embeddings, how='left', on='artist_name').fillna(0)
test_lyrics_pca.drop('artist_name', axis=1, inplace=True)
test_pca_lyrics_complete = pd.DataFrame(pca_lyrics_embeddings2.transform(test_lyrics_pca))
test_pca_lyrics_complete.columns = ['lyrics_PCA1', 'lyrics_PCA2', 'lyrics_PCA3', 'lyrics_PCA4', 'lyrics_PCA5', 
                                    'lyrics_PCA6', 'lyrics_PCA7', 'lyrics_PCA8', 'lyrics_PCA9']

# Creating True Train and Test Data

In [121]:
train_pca_playlist_complete2

Unnamed: 0,p_PCA1,p_PCA2,p_PCA3,p_PCA4,p_PCA5,p_PCA6,p_PCA7,p_PCA8,p_PCA9,p_PCA10
0,-2.709830,-2.012200,-1.414642,-1.092832,-1.071148,-0.384580,-1.066352,-0.937447,-0.895038,-0.429780
1,-2.610275,-1.973701,-1.405903,-1.002904,-1.071742,-0.375163,-0.830326,-0.927847,-0.887692,-0.460321
2,-2.642672,-1.607104,-1.408371,-1.090040,-0.800634,-0.271134,-0.888239,-0.919787,-0.936733,-0.490529
3,-2.698142,-1.995501,-1.400847,-1.080454,-1.057096,-0.379275,-1.046734,-0.917431,-0.874108,-0.418511
4,-2.700472,-1.998819,-1.403584,-1.082907,-1.059877,-0.380325,-1.050600,-0.921366,-0.878216,-0.420717
...,...,...,...,...,...,...,...,...,...,...
470,-2.421169,-0.567211,-1.422775,-1.165980,-0.979876,-0.310073,0.869945,-0.989756,-0.990124,-0.120358
471,-2.196445,-1.587857,-1.434525,0.327538,-1.512119,-0.347311,-0.665674,-0.989966,-0.977720,-0.592130
472,-2.700472,-1.998819,-1.403584,-1.082907,-1.059877,-0.380325,-1.050600,-0.921366,-0.878216,-0.420717
473,-2.698142,-1.995501,-1.400847,-1.080454,-1.057096,-0.379275,-1.046734,-0.917431,-0.874108,-0.418511


In [122]:
# Assembling the train and test sets

true_train_final = pd.concat([true_train, train_pca_region_complete2, train_pca_playlist_complete2, 
                         train_pca_audio_complete2, train_pca_lyrics_complete2], axis=1)
test_final = pd.concat([test, test_pca_region_complete, test_pca_playlist_complete, 
                       test_pca_audio_complete, test_pca_lyrics_complete], axis=1)

In [123]:
# Save final Train and Test

true_train_final.to_csv('./Train_Validation_Test/TRUE_TRAIN.csv', index=False)
test_final.to_csv('./Train_Validation_Test/TRUE_TEST.csv', index=False)

## Load the Final Result

In [124]:
# Loading the final datasets

true_train = pd.read_csv('./Train_Validation_Test/TRUE_TRAIN.csv')
true_test = pd.read_csv('./Train_Validation_Test/TRUE_TEST.csv')