# Preprocessing - Part II

### Steps included:
* reading song & artist combined data
* cleaning data
* generating cosine similarity matrix
* generating top 10 recommendations for each song
* saving recommendations as a matrix for fast loading

#### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Ask the inline matplotlib backend to render figures in the 'retina' format.
# NOTE(review): newer IPython releases appear to deprecate this helper in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats — confirm against the
# installed IPython version before changing the import.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

#### Loading songs artist combined file

In [None]:
# Read the combined song/artist CSV into a DataFrame and preview its first rows.
songs_csv = "song_artist.csv"
df = pd.read_csv(songs_csv)
df.head()

#### Separating numerical attributes & meta

In [None]:
# Identifier/meta columns kept alongside the numeric features.
# .copy() makes this an independent frame: a "search" column is assigned to it
# later, and assigning to a selection of `df` triggers pandas'
# SettingWithCopyWarning.
song_id = df[["id", "name", "artists", "spotify_id"]].copy()

In [None]:
# Remove the identifier columns (plus acousticness and danceability) so that
# only the columns used for correlation and similarity remain, then preview.
excluded_columns = ["id", "name", "artists", "acousticness", "danceability", "spotify_id"]
data = df.drop(columns=excluded_columns).copy()
data.head()

#### Correlation between numerical attributes

In [None]:
# Pairwise correlation of the remaining attributes, drawn as an annotated
# heatmap on a 16x10 inch figure.
feature_corr = data.corr()
plt.figure(figsize=(16, 10))
sns.heatmap(feature_corr, annot=True)

#### Limiting dataset to 40,000 songs because of memory constraints

In [None]:
# Keep only the first 40,000 rows of the feature matrix; the full pairwise
# similarity matrix grows quadratically with the number of songs.
row_limit = 40000
meta = data.values[:row_limit]
meta.shape

#### Generating cosine similarity matrix
The following three cells will take significant time to execute

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of `meta`
# against every other row of `meta`, giving an (n_songs, n_songs) matrix.
cosine_sim = cosine_similarity(meta)

In [None]:
# Persist the similarity matrix so it does not have to be recomputed.
np.save(file="sim.npy", arr=cosine_sim)

In [None]:
# Build a "<name> <artists>" text column for each song, then write the
# id/meta table (without the index) to id2.csv.
search_text = song_id["name"] + ' ' + song_id["artists"]
song_id["search"] = search_text
song_id.to_csv("id2.csv", index=False)

#### Finding top 10 similar songs for a given song and saving them in a matrix

In [None]:
def get_recommendations(idx, cosine_sim):
    """Return the row indices of the 10 songs most similar to song ``idx``.

    Args:
        idx: Row position of the query song in ``cosine_sim``.
        cosine_sim: Pairwise similarity matrix; ``cosine_sim[idx]`` must yield
            the similarity of song ``idx`` to every song, in row order.

    Returns:
        A list of plain ``int`` row indices (at most 10), sorted in ascending
        index order (not by similarity). The query song itself is never
        included.
    """
    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Resolve a negative idx to its positive position so it can be compared
    # against the positions produced by argsort.
    self_idx = idx if idx >= 0 else idx + scores.size

    # Stable sort on the negated scores: highest similarity first, ties kept
    # in ascending index order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query song explicitly. Skipping "the first entry" is not
    # enough: an identical song with a lower index ties at the top score and
    # sorts ahead of the query song, which would then be returned as its own
    # recommendation.
    top = ranked[ranked != self_idx][:10]

    # Callers receive the indices in ascending order.
    return sorted(int(i) for i in top)


# Build the recommendation table: row i holds the 10 row indices returned by
# get_recommendations(i, sim).
#
# The matrix is memory-mapped read-only so that loading it does not pull a
# second full copy of the similarity matrix into RAM.
sim = np.load("sim.npy", mmap_mode="r")

# Take the song count from the matrix itself rather than a hard-coded number,
# so the loop cannot run past the end when fewer songs were processed.
n_songs = sim.shape[0]

# Pre-allocate the integer result table. Appending row by row to a float
# array recopies the whole table on every iteration and silently stores the
# indices as floats.
all_res = np.empty((n_songs, 10), dtype=int)

for i in range(n_songs):
    all_res[i] = get_recommendations(i, sim)
    # Progress indicator only; throttled to keep the cell output small.
    if i % 1000 == 0:
        print("saved for:" + str(i))

np.save("light.npy", all_res)

#### light.npy is a matrix whose i<sup>th</sup> row has 10 columns, each holding the row index (position in the similarity matrix, not the `id` / `spotify_id` column) of a song recommended for song i