In [1]:
import ast
import os
from collections import Counter

import numpy as np
import pandas as pd
import sqlalchemy

In [2]:
# Database login in "user:password" form. It is read from the SQL_CREDENTIALS
# environment variable so that real secrets do not have to be stored in the
# notebook; the fallback is the original local-development login, kept so the
# notebook still runs unchanged on an existing setup.
SQL_CREDENTIALS = os.environ.get("SQL_CREDENTIALS", "root:1234")
# SQLAlchemy engine for the local MySQL database (pymysql driver, port 3306).
engine = sqlalchemy.create_engine('mysql+pymysql://' + SQL_CREDENTIALS + '@localhost:3306/music_recommender_db')

In [3]:
# Load the semicolon-separated listening events and preview the first rows.
events_df = pd.read_csv(
    "feature_engineering/data/lowms_events_nondominating.csv",
    sep=";",
)
events_df.head(n=5)

Unnamed: 0,user_id,track_id,artist_id,timestamp
0,31435741,53,21,1370977938
1,31435741,53,21,1370977728
2,31435741,53,21,1370977518
3,31435741,53,21,1370977308
4,31435741,53,21,1370977098


In [4]:
# One row per distinct (user_id, track_id, artist_id) combination. The first
# occurrence of each combination is kept, so the original row index survives.
listened_tracks_df = (
    events_df[["user_id", "track_id", "artist_id"]]
    .drop_duplicates(keep="first")
)
listened_tracks_df.head()

Unnamed: 0,user_id,track_id,artist_id
0,31435741,53,21
54,31435741,86,32
138,31435741,127,44
141,31435741,182,57
186,31435741,219,65


In [5]:
# Load the header-less, semicolon-separated track -> genres file, drop tracks
# whose genre field is the literal string "[]", index by track_id, and parse the
# remaining genre strings into Python objects with ast.literal_eval.
# Built as a single chain that returns a new frame, so no column is assigned
# onto a boolean-filtered slice (avoids pandas' SettingWithCopyWarning) and no
# inplace= mutation is needed.
track_genres_df = (
    pd.read_csv(
        "feature_engineering/data/track_genres.csv",
        sep=";",
        header=None,
        names=["track_id", "genres"],
    )
    .loc[lambda frame: frame["genres"] != "[]"]
    .set_index("track_id")
    # literal_eval only accepts Python literals, so it is safe on file content.
    .assign(genres=lambda frame: frame["genres"].apply(ast.literal_eval))
)
track_genres_df.head()

Unnamed: 0_level_0,genres
track_id,Unnamed: 1_level_1
4868,"[soul, pop, singersongwriter, blues, jazz, ind..."
2900,"[electronic, indiepop, shoegaze, dreampop, pos..."
572665,"[soul, pop, singersongwriter, blues, drama]"
2897,"[indierock, electronic, indiepop, postpunk, ro..."
15100,"[folk, indiefolk, banjo, folkrock, bluegrass, ..."


In [6]:
# First column of the header-less relevant-tracks file, as a NumPy array.
rel_tracks = pd.read_csv(
    "feature_engineering/data/relevant_tracks_lowms.txt",
    header=None,
)[0].values
rel_tracks

array([  4868,   2900, 572665, ..., 901203, 701827, 720698], dtype=int64)

In [7]:
# Number of entries in the relevant-track array.
rel_tracks.shape[0]

147156

In [8]:
# Number of distinct artists among listened tracks that are in rel_tracks.
listened_tracks_df.loc[
    listened_tracks_df["track_id"].isin(rel_tracks),
    "artist_id",
].nunique()

14243

In [9]:
# Summary statistics of the per-user number of events on tracks in rel_tracks.
(
    events_df.loc[events_df["track_id"].isin(rel_tracks)]
    .groupby("user_id")
    .size()
    .describe()
)

count     2073.000000
mean      2258.630487
std       1457.309885
min          1.000000
25%       1198.000000
50%       1945.000000
75%       3078.000000
max      10536.000000
dtype: float64

In [10]:
# Total number of events whose track is in rel_tracks (count of True in the mask).
int(events_df["track_id"].isin(rel_tracks).sum())

4682141

In [11]:
# Load the track -> cluster assignment, drop rows labelled -1, index by track_id
# and shift the remaining cluster labels up by one.
# NOTE(review): -1 is presumably the clustering's "noise / unassigned" label — confirm.
# Built as a single chain that returns a new frame, so the label shift is not
# assigned onto a boolean-filtered slice (avoids pandas' SettingWithCopyWarning)
# and no inplace= mutation is needed.
track_to_cluster_df = (
    pd.read_csv("clustering/track_to_cluster.csv", sep=";")
    .loc[lambda frame: frame["cluster"] != -1]
    .set_index("track_id")
    .assign(cluster=lambda frame: frame["cluster"] + 1)
)
track_to_cluster_df.head()

Unnamed: 0_level_0,cluster
track_id,Unnamed: 1_level_1
53,2
127,2
182,2
219,2
289,2


In [12]:
# Inner-join genres and cluster labels on their track_id indexes, keeping only
# tracks present in both frames.
df = pd.merge(
    track_genres_df,
    track_to_cluster_df,
    how="inner",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,genres,cluster
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4868,"[soul, pop, singersongwriter, blues, jazz, ind...",2
572665,"[soul, pop, singersongwriter, blues, drama]",2
2897,"[indierock, electronic, indiepop, postpunk, ro...",2
15100,"[folk, indiefolk, banjo, folkrock, bluegrass, ...",2
7112,"[rock, indierock, alternativerock, pop, poproc...",2
