<a href="https://colab.research.google.com/github/prishanmu/Music_Recommender/blob/master/AZ_Lyrics_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Artist Recommendation System

Inspired by a [Netflix recommendation system](https://www.kaggle.com/niharika41298/netflix-visualizations-recommendation-eda/notebook#Recommendation-System-(Content-Based)) I decided to create a similar recommendation system for music. 

In [0]:
import pandas as pd
import numpy as np
import random

## Data 

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import glob

path = "/content/drive/My Drive/Music Recommender/azlyrics-scraper/"

filenames = glob.glob(path + "/*.csv")

dfs = []
for filename in filenames:
  x = pd.read_csv(filename, usecols = ['ARTIST_NAME', 'SONG_NAME', 'LYRICS'])
  dfs.append(x)

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)

In [15]:
big_frame.head()

Unnamed: 0,ARTIST_NAME,SONG_NAME,LYRICS
0,03 greedo,sweet lady,"its only one, 03, i'm from grape street, where..."
1,03 greedo,mafia business,"you gonna make me put you in a suit and tie, s..."
2,03 greedo,paranoid,"we could kill it, yeah, we could, we could, we..."
3,03 greedo,never bend,"yeah, you lil bitch ass niggas steady speaking..."
4,03 greedo,prayer for my lost,"x loaded up bro, never see me in the church, p..."


In [0]:
df = big_frame.sample(500) #using a sample for easier dev/testing

In [18]:
df.head()

Unnamed: 0,ARTIST_NAME,SONG_NAME,LYRICS
89733,"jacka, the",storm,", killa on the road nigga, nigga watch out nig..."
145138,wincent weiss,kein lied,"ey, meinst du, es war lacherlich, wenn ich jet..."
130377,tony bennett,the shining sea,"we loved the shining sea, she gathered sea she..."
114491,"sesto, camilo",algo de mi,"un adios sin razones, unos anos sin valor. me ..."
94514,joonil jung,plastic,", bakkereul jom nagaboryeogo haenneunde, o ire..."


## TF-IDF

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# to-do: parameter tune later

cv = CountVectorizer(strip_accents='ascii',
                     lowercase=True,
                     tokenizer=None,
                     stop_words='english', #enlgish stop words dictionary
                     ngram_range=(1, 1), #broken down into one word units, you can change to 1+2 word units, etc
                     max_df=1.0,
                     min_df=0,
                     max_features=None,
                     vocabulary=None)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['LYRICS'] = df['LYRICS'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['LYRICS'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(444, 10289)

## Cosine Similarity

In [0]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [0]:
indices = pd.Series(df.index, index=df['SONG_NAME']).drop_duplicates()

## Recommendation Functions

*   Song Recommender based on favorite song
*   Artist Recommender based on favorite artist




In [0]:
def get_song_rec(song, cosine_sim=cosine_sim, df=df):
    idx = indices[song]

    # Get the pairwsie similarity scores of all songs with that song
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the songs based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar songs
    sim_scores = sim_scores[1:11]

    # Get the song indices
    song_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar songs
    return df['SONG_NAME'].iloc[song_indices]

In [0]:
def get_artist_rec(artist, cosine_sim=cosine_sim, df=df): 
  # groupby tf-idf with artist & then do similarity matrix

  # idx = indices[artist]

  # get pairwise similarity 

  # sort artists based on similarity scores

  # get the scores of the 10 most similar artists

  # get the artist indices

  #return the top 10 most similar songs
  return None

## Getting Recommendatons (Testing)

In [0]:
song_list = df.SONG_NAME.tolist()

song = random.choice(song_list)
#print(get_song_rec(song))

artist_list = df.ARTIST_NAME.tolist()

artist = random.choice(artist_list)
#print(get_artist_rec(artist))