# Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as prp
from pandas.api.types import is_numeric_dtype

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Widen the notebook container to 90% so wide tables are easier to read.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Data Pre-processing


In [None]:
# Load the Spotify tracks CSV from a hard-coded path under /content/drive/MyDrive
# (presumably a Google Drive mount in Colab - adjust the path when running elsewhere).
dataset = pd.read_csv('/content/drive/MyDrive/Music_Recommender/Kaggle/Spotify_Tracks_Dataset/dataset.csv')

In [None]:
# Preview the first rows of the raw data.
dataset.head()

In [None]:
# List the column names available in the raw data.
dataset.columns.values

In [None]:
# Print column dtypes and non-null counts for the raw data.
dataset.info()


In [None]:
# Summary statistics of the numeric columns.
dataset.describe()

 **Useful Features**


---


*   track_id
*   artists
*   popularity
*   danceability
*   energy
*   speechiness
*   acousticness
*   liveness
*   valence
*   track_genre
*   tempo
*   key
*   mode









In [None]:
# Keep only the columns used downstream. The explicit .copy() makes the result an
# independent frame, so later mutations of `dataset` (row drops, column
# reassignment) do not trigger pandas' SettingWithCopyWarning.
dataset = dataset[[
    'track_id', 'artists', 'track_genre', 'mode', 'key', 'popularity',
    'danceability', 'energy', 'speechiness', 'acousticness', 'liveness',
    'valence', 'tempo',
]].copy()
dataset.head()

In [None]:

# print(len(pd.unique(dataset['track_id'])))


# Compute the distinct genres once and reuse the result for both printouts
# (number of genres, then the genre labels themselves).
genre_labels = dataset['track_genre'].unique()
print(len(genre_labels))
print(genre_labels)


In [None]:
# Keep a single row (the first occurrence) for every track_id.
dataset = dataset.drop_duplicates(subset=['track_id'])


In [None]:
# Re-check dtypes and non-null counts after removing duplicate tracks.
dataset.info()

In [None]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna()

In [None]:
# Renumber the rows 0..n-1 after the drops above; the old index is not kept.
dataset = dataset.reset_index(drop=True)


## Generate 'artists' list

In [None]:
def generate_list(col_name):
    """Return the cleaned token lists of one column of the global ``dataset``.

    Every cell of ``dataset[col_name]`` is iterated item by item, and each item
    has its hyphens, spaces and periods removed.

    Parameters:
        col_name (str): Name of the ``dataset`` column to clean

    Returns:
        list: One list of cleaned strings per row, in row order
    """
    # Translation table that deletes "-", " " and "." in a single pass
    unwanted = str.maketrans("", "", "- .")
    cleaned_rows = []
    for tokens in dataset[col_name]:
        cleaned_rows.append([token.translate(unwanted) for token in tokens])
    return cleaned_rows

In [None]:
# Lower-case each artists string and split it on ';' into a list of artist names.
dataset['artists'] = dataset['artists'].map(lambda names: names.lower().split(';'))
  

In [None]:
# Strip hyphens, spaces and periods from every artist name in each row's list.
dataset['artists'] = generate_list('artists')

## Generate 'track_genre' list

In [None]:
# Split each genre string on whitespace into a list of tokens.
dataset['track_genre'] = dataset['track_genre'].map(lambda genre: genre.split())

In [None]:
# Strip hyphens, spaces and periods from every genre token in each row's list.
dataset['track_genre'] = generate_list('track_genre')

# One Hot Encoding 


In [None]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name):
    """
    One-hot encode a single column and label the resulting indicator columns.

    Parameters:
        df (pandas dataframe): Spotify Dataframe
        column (str): Name of the column to encode
        new_name (str): Prefix for the generated columns, which are named
            "<new_name>|<value>"

    Returns:
        pandas dataframe: One indicator column per distinct value of
        df[column], on a fresh 0..n-1 index
    """
    # Discard the source index in favour of a fresh 0..n-1 range
    indicators = pd.get_dummies(df[column]).reset_index(drop=True)
    indicators.columns = ["|".join((new_name, str(value))) for value in indicators.columns]
    return indicators

# Feature generation

In [None]:

def create_feature_set(df, float_cols):
    '''
    Process spotify df to create the final set of weighted features that will be
    used to generate recommendations.

    All features are derived from the ``df`` argument (not from a notebook-level
    global), so the function works for any frame with the expected columns.
    ---
    Input:
    df (pandas dataframe): Spotify Dataframe. Must contain the columns
        'track_genre' (an iterable of genre tokens per row), 'key', 'mode',
        'popularity', 'track_id' and every column named in float_cols.
    float_cols (list(str)): List of float columns that will be scaled

    Output:
    final (pandas dataframe): TF-IDF genre columns ('genre|<token>'), the
        min-max scaled float and popularity columns, the one-hot encoded
        'key|<value>' and 'mode|<value>' columns, and a trailing 'track_id'
        column. Rows follow the order of df on a fresh 0..n-1 index.
    '''
    # Relative weights of the feature groups; the genre TF-IDF block keeps weight 1
    ohe_weight = 0.5
    scaled_weight = 0.2

    # Tfidf genre lists: join each row's tokens into one space-separated document
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['track_genre'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names_out()]

    # One-hot Encoding
    key_ohe = ohe_prep(df, 'key', 'key') * ohe_weight
    mode_ohe = ohe_prep(df, 'mode', 'mode') * ohe_weight

    # Normalization
    # Scale popularity column to [0, 1], then down-weight it
    pop = df[['popularity']].reset_index(drop=True)
    pop_scaled = pd.DataFrame(MinMaxScaler().fit_transform(pop), columns=pop.columns) * scaled_weight

    # Scale audio columns to [0, 1], then down-weight them
    floats = df[float_cols].reset_index(drop=True)
    floats_scaled = pd.DataFrame(MinMaxScaler().fit_transform(floats), columns=floats.columns) * scaled_weight

    # Concatenate all features column-wise; every part carries a 0..n-1 index
    final = pd.concat([genre_df, floats_scaled, pop_scaled, key_ohe, mode_ohe], axis=1)

    # Add song id (assigned positionally, so the index of df does not matter)
    final['track_id'] = df['track_id'].values

    return final

In [None]:
# Names of all float64 columns; these are the audio features that get min-max scaled.
float_cols = dataset.select_dtypes(include='float64').columns.values


In [None]:
# Save the data
# dataset.to_csv("/content/drive/MyDrive/Music_Recommender/Kaggle/Spotify_Tracks_Dataset/useful_feature.csv", index = False)

In [None]:
# Build the scaled and encoded feature frame for every track.
# Nothing is written to disk here; the frame is only displayed.
complete_feature_set = create_feature_set(dataset, float_cols=float_cols)
complete_feature_set

In [None]:
# Optional export of the feature frame to Drive, currently disabled:
# complete_feature_set.to_csv("/content/drive/MyDrive/Music_Recommender/Kaggle/Spotify_Tracks_Dataset/complete_feature.csv", index = False)
# Preview the first rows of the feature frame.
complete_feature_set.head()