# Data processing

In [13]:
import requests
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
import selenium
from tqdm import tqdm
from scipy import sparse

In [15]:
# Load the scraped user/song/count records and drop exact duplicate rows.
# NOTE(review): unpickling can execute arbitrary code -- only read files you trust.
load_df = pd.read_pickle("../data/raw/scraped/usersong").drop_duplicates()
# Number of rows remaining after de-duplication
len(load_df)

466070

In [16]:
# Preview the first 10 rows of the loaded dataframe
load_df.head(10)

Unnamed: 0,user,count,song
0,raedrexlre,9,Rex Orange County-Corduroy Dreams
1,raedrexlre,8,Keshi-blue
2,raedrexlre,8,Zeph-Lucky
3,raedrexlre,7,Pink Sweat$-17
4,raedrexlre,6,Childish Gambino-Feels Like Summer
5,raedrexlre,6,Daniel Caesar-Get You (feat. Kali Uchis)
6,raedrexlre,6,Daniel Caesar-Japanese Denim
7,raedrexlre,6,Smino-Wild Irish Roses
8,raedrexlre,6,Troye Sivan-BLUE
9,raedrexlre,5,Alisson Shore-Urong Sulong


In [17]:
# Coerce the "count" column to a numeric dtype before pivoting
load_df["count"] = pd.to_numeric(load_df["count"])

In [18]:
# Reshape the long (user, song, count) records into a dense user-by-song matrix,
# the layout expected by the collaborative filtering step.
# pivot_table aggregates with its default (mean) when a (user, song) pair occurs
# more than once; pairs with no record become 0.
usersong_matrix_df = (
    load_df
    .pivot_table(index='user', columns='song', values='count')
    .fillna(0)
)
# Preview the first 10 users
usersong_matrix_df.head(10)

song,"""Dave""-Hi, I'm Dave - From ""DAVE""","""VIKINGS""-Most Epic Viking & Nordic Folk Music","""Weird Al"" Yankovic-A Complicated Song","""Weird Al"" Yankovic-Good Old Days","""Weird Al"" Yankovic-One More Minute","""hitman"" bang-Commonness",#ЗАЦВ-Девочка с района,$IGA A-Jang Won Geup Jae,$NOT & Flo Milli-Mean,"$NOT-""Life""",...,효린-Closer,효린-너 밖에 몰라 (One Way Love),효린-사랑 하지 마 (Don't Love Me),휘성-놈들이 온다,２８１４-恢复,２８１４-遠くの愛好家,𝐒𝐤𝐢𝐩™️-𝐌𝐞 𝐚𝐧𝐝 𝐌𝐲 𝐇𝐮𝐬𝐛𝐚𝐧𝐝,𝖘𝖑𝖔𝖜𝖊𝖉 𝖘𝖔𝖓𝖌𝖘 ★-worldstar money - joji (𝖘𝖑𝖔𝖜𝖊𝖉 + 𝖗𝖊𝖛𝖊𝖗𝖇),𝗺𝗶𝗹𝗸𝘆’𝘀 𝗮 𝗳𝘅𝗰𝗸𝗶𝗻𝗴 𝘄𝘅𝗻𝗻𝗮𝗯𝗲-Dollhouse - Melanie Martinez [𝔻𝕒𝕪𝕔𝕠𝕣𝕖/𝕊𝕝𝕠𝕨𝕖𝕕 𝔻𝕠𝕨𝕟],🎶 Music for ya'll :D-Therefore You And Me
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00Eraser00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1Kosmo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1NCRADY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACRACRAK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AEsir-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGcry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AIDANP17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALANALIONALIKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AMFaerie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Persist the dense matrix dataframe to disk (4.32 GB per the original note), then read it
# straight back so that the following cells work from the saved copy.
# NOTE(review): unpickling can execute arbitrary code -- only read files you trust.
usersong_matrix_df.to_pickle("../data/processed/usersong_matrix_df")
usersong_matrix_load_df = pd.read_pickle("../data/processed/usersong_matrix_df")

In [21]:
# Convert the dense user-by-song values into a SciPy CSR sparse matrix
# (despite the "_df" suffix this is a scipy.sparse matrix, not a DataFrame).
usersong_matrix_sparse_df = sparse.csr_matrix(usersong_matrix_load_df.to_numpy())

In [22]:
# Summarise the reloaded matrix dataframe: index/column counts, dtypes and memory usage
usersong_matrix_load_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7147 entries, 00Eraser00 to zzzavy
Columns: 71730 entries, "Dave"-Hi, I'm Dave - From "DAVE" to 🎶 Music for ya'll :D-Therefore You And Me
dtypes: float64(71730)
memory usage: 3.8+ GB


In [23]:
# Number of users (rows) in the user-by-song matrix
len(usersong_matrix_df)

7147

# Fit the matrix with the collaborative filtering package Surprise

In [11]:
# Citation for the Surprise library (BibTeX):
# @article{Hug2020,
#   doi = {10.21105/joss.02174},
#   url = {https://doi.org/10.21105/joss.02174},
#   year = {2020},
#   publisher = {The Open Journal},
#   volume = {5},
#   number = {52},
#   pages = {2174},
#   author = {Nicolas Hug},
#   title = {Surprise: A Python library for recommender systems},
#   journal = {Journal of Open Source Software}
# }

In [15]:
#!pip install scikit-surprise

In [16]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import BaselineOnly
from surprise.model_selection import cross_validate

# Play around with the dataset

In [None]:
class Recommender:
    """Nearest-neighbour recommender that fits a ``NearestNeighbors`` model on construction.

    NOTE(review): ``NearestNeighbors`` is not imported anywhere in the visible
    notebook; presumably it is ``sklearn.neighbors.NearestNeighbors`` -- confirm
    and add the import to the imports cell.
    """

    def __init__(self, metric, algorithm, k, data, decode_id_song):
        """Store the configuration and fit the neighbour model on ``data``.

        Args:
            metric: Forwarded as ``NearestNeighbors(metric=...)``.
            algorithm: Forwarded as ``NearestNeighbors(algorithm=...)``.
            k: Forwarded as ``NearestNeighbors(n_neighbors=...)``.
            data: Object the model is fitted on; also kept as ``self.data``.
            decode_id_song: Mapping kept as ``self.decode_id_song``
                (presumably song title -> song id; confirm against the caller).
        """
        # These must be assigned before _recommender() is called, because it
        # reads self.metric / self.algorithm / self.k.
        self.metric = metric
        self.algorithm = algorithm
        self.k = k
        # Kept on the instance because the helper functions later in the
        # notebook read self.data and self.decode_id_song.
        self.data = data
        self.decode_id_song = decode_id_song
        self.model = self._recommender().fit(data)

    def _recommender(self):
        """Build an unfitted ``NearestNeighbors`` from the stored settings."""
        return NearestNeighbors(metric=self.metric, algorithm=self.algorithm,
                                n_neighbors=self.k, n_jobs=-1)

# Instantiate and fit the model.
# NOTE(review): `mat_songs_features` and `decode_id_song` are not defined in the
# visible cells of this notebook -- define or load them before running this cell.
model = Recommender(
    metric='cosine',
    algorithm='brute',
    k=20,
    data=mat_songs_features,
    decode_id_song=decode_id_song,
)

In [None]:
def _get_recommendations(self, new_song, n_recommendations):
    recom_song_id = self._fuzzy_matching(song=new_song)
    # Return the n neighbors for the song id
    distances, indices = self.model.kneighbors(self.data[recom_song_id], 
                                               n_neighbors=n_recommendations+1)
    return sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), 
                  key=lambda x: x[1])[:0:-1]

def _map_indeces_to_song_title(self, recommendation_ids):
    # get reverse mapper
    return {song_id: song_title for song_title, song_id in self.decode_id_song.items()}
