In [1]:
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy
from scipy.stats import gaussian_kde

# Database credentials in "user:password" form. They are taken from the
# SQL_CREDENTIALS environment variable so that no password is stored in the
# notebook; when the variable is unset (or empty) the user is prompted instead.
SQL_CREDENTIALS = os.environ.get("SQL_CREDENTIALS") or getpass("MySQL credentials (user:password): ")

# SQLAlchemy engine for the local music_recommender_db MySQL database (pymysql driver).
engine = sqlalchemy.create_engine('mysql+pymysql://' + SQL_CREDENTIALS + '@localhost:3306/music_recommender_db')

# IPython magic: switch Matplotlib to the Qt GUI backend for this kernel.
%matplotlib qt

In [2]:
# Load the semicolon-separated LowMs play-count table and preview the first rows.
lowms_df = pd.read_csv(
    "data/playcounts_lowms.csv",
    delimiter=";",
)
lowms_df.head(n=5)

Unnamed: 0,user_id,item_id,rating
0,31435741,53,54
1,31435741,86,84
2,31435741,127,3
3,31435741,182,45
4,31435741,219,17


In [3]:
# Load the semicolon-separated NormMs play-count table and preview the first rows.
normms_df = pd.read_csv(
    "data/playcounts_normms.csv",
    delimiter=";",
)
normms_df.head(n=5)

Unnamed: 0,user_id,item_id,rating
0,2673250,202202,2
1,2673250,17524,2
2,2673250,17523,3
3,2673250,202209,1
4,2673250,202257,2


In [4]:
# Sparsity = 1 - (observed rows) / (distinct users * distinct items),
# computed separately for each of the two play-count tables.
lowms_matrix_cells = lowms_df["user_id"].nunique() * lowms_df["item_id"].nunique()
lowms_sparsity = 1 - (lowms_df.shape[0] / lowms_matrix_cells)

normms_matrix_cells = normms_df["user_id"].nunique() * normms_df["item_id"].nunique()
normms_sparsity = 1 - (normms_df.shape[0] / normms_matrix_cells)

In [5]:
# Report both sparsity values with four decimal places.
for report_line, sparsity_value in (
    ("LowMs Sparsity: %.04f", lowms_sparsity),
    ("NormMs Sparsity: %.04f", normms_sparsity),
):
    print(report_line % sparsity_value)

LowMs Sparsity: 0.9986
NormMs Sparsity: 0.9978


In [6]:
# Fetch the global mainstreaminess score (M_global_R_APC) of every LowMs user.
#
# The IN list is rendered from plain Python integers. str(tuple(ndarray)) embeds
# the repr of each NumPy scalar (e.g. "np.int64(123)" on NumPy >= 2) and renders
# a single id as "(123,)"; both forms are invalid SQL.
lowms_user_ids = ", ".join(str(int(user_id)) for user_id in lowms_df["user_id"].unique())
stmt = (
    "SELECT user_id, M_global_R_APC from user_mainstreaminess "
    "WHERE user_id IN (" + lowms_user_ids + ")"
)
# Index by user and drop users for which no score is stored.
lowms_mainstreaminess_df = pd.read_sql(con=engine, sql=stmt).set_index("user_id").dropna()
lowms_mainstreaminess_df.head()

  result = self._query(query)


Unnamed: 0_level_0,M_global_R_APC
user_id,Unnamed: 1_level_1
1049656,0.09606
1055118,0.0883
1056935,0.076351
1070023,0.000605
1072752,0.052467


In [7]:
# Number of LowMs users for which a mainstreaminess score was found.
lowms_mainstreaminess_df.shape[0]

2074

In [94]:
# Fetch all users that are NOT in the LowMs group and whose global
# mainstreaminess exceeds the cutoff below.
# NOTE(review): 0.097732 is presumably the upper mainstreaminess bound of the
# LowMs group - confirm and consider naming it as a constant.
#
# The NOT IN list is rendered from plain Python integers. str(tuple(ndarray))
# embeds the repr of each NumPy scalar (e.g. "np.int64(123)" on NumPy >= 2) and
# renders a single id as "(123,)"; both forms are invalid SQL.
excluded_user_ids = ", ".join(str(int(user_id)) for user_id in lowms_df["user_id"].unique())
stmt = (
    "SELECT user_id, M_global_R_APC, country from user_mainstreaminess "
    "WHERE M_global_R_APC > 0.097732 AND user_id NOT IN (" + excluded_user_ids + ")"
)
# Rows with any missing value (score or country) are discarded.
norm_users_df = pd.read_sql(con=engine, sql=stmt).dropna()

  result = self._query(query)


In [95]:
# Summary statistics of the mainstreaminess scores of the candidate users.
norm_users_df.loc[:, "M_global_R_APC"].describe()

count    42599.000000
mean         0.206140
std          0.067815
min          0.097737
25%          0.154184
50%          0.198491
75%          0.247744
max          0.737865
Name: M_global_R_APC, dtype: float64

In [96]:
# Fetch the global mainstreaminess score (M_global_R_APC) of every NormMs user.
#
# The IN list is rendered from plain Python integers. str(tuple(ndarray)) embeds
# the repr of each NumPy scalar (e.g. "np.int64(123)" on NumPy >= 2) and renders
# a single id as "(123,)"; both forms are invalid SQL.
normms_user_ids = ", ".join(str(int(user_id)) for user_id in normms_df["user_id"].unique())
stmt = (
    "SELECT user_id, M_global_R_APC from user_mainstreaminess "
    "WHERE user_id IN (" + normms_user_ids + ")"
)
# Index by user and drop users for which no score is stored.
normms_mainstreaminess_df = pd.read_sql(con=engine, sql=stmt).set_index("user_id").dropna()
normms_mainstreaminess_df.head()

Unnamed: 0_level_0,M_global_R_APC
user_id,Unnamed: 1_level_1
1003045,0.263064
1009940,0.143923
1025350,0.225473
1042860,0.219473
1073993,0.20312


In [162]:
def _plot_mainstreaminess_kde(ax, values, title, bandwidth=0.25, n_points=200, trim=10):
    """Draw a Gaussian KDE of ``values`` on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : 1-D sequence of scores whose density is estimated.
    title : text used both as the panel title and as the line label.
    bandwidth : scalar passed to ``gaussian_kde`` as ``bw_method`` (used
        directly as the KDE factor).
    n_points : number of evenly spaced evaluation points between the minimum
        and maximum of ``values``.
    trim : number of evaluation points dropped at each end of the curve
        before plotting.
    """
    # Public API for a fixed bandwidth factor; replaces overriding
    # covariance_factor and calling the private _compute_covariance().
    density = gaussian_kde(values, bw_method=bandwidth)
    xs = np.linspace(np.min(values), np.max(values), n_points)
    ys = density(xs)  # evaluate once, then slice
    stop = len(xs) - trim  # explicit end index so that trim=0 keeps every point
    ax.plot(xs[trim:stop], ys[trim:stop], label=title)
    ax.set_yticklabels([])
    ax.set_title(title)


# The "seaborn" style sheet was renamed to "seaborn-v0_8" in Matplotlib 3.6 and
# the old name was removed later; use whichever name this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
fig, axes = plt.subplots(ncols=2)
# Frameless axes spanning the whole figure, added on top of the two panels;
# it is the current axes for the pyplot calls at the end of the cell.
fig.add_subplot(111, frameon=False)

_plot_mainstreaminess_kde(axes[0], lowms_mainstreaminess_df["M_global_R_APC"], "LowMs")
_plot_mainstreaminess_kde(axes[1], normms_mainstreaminess_df["M_global_R_APC"], "NormMs")

# Remove the ticks of the current (frameless) axes.
plt.yticks([], "")
plt.xticks([], "")
#plt.xlabel("Mainstreaminess")

([], <a list of 0 Text xticklabel objects>)

In [6]:
# Load the semicolon-separated user -> cluster assignment table and preview it.
classification_df = pd.read_csv(
    "data/classification_clean.csv",
    delimiter=";",
)
classification_df.head(n=5)

Unnamed: 0,user_id,cluster
0,10883488,1
1,35212267,3
2,38189090,3
3,22113634,3
4,3704198,3


In [7]:
# Attach each user's cluster label to the LowMs play counts (inner join on user_id).
df = pd.merge(lowms_df, classification_df, on="user_id")
df.head(n=5)

Unnamed: 0,user_id,item_id,rating,cluster
0,31435741,53,54,2
1,31435741,86,84,2
2,31435741,127,3,2
3,31435741,182,45,2
4,31435741,219,17,2


In [10]:
# Distinct users and distinct items present in the merged table.
n_users, n_items = (df[column].nunique() for column in ("user_id", "item_id"))
(n_users, n_items)

(2073, 799658)

In [12]:
# Rows per cluster divided by the size of the full user x item matrix.
# The denominator is derived from the data instead of the previously
# hard-coded 2073 * 799658, so it cannot go stale when the input changes.
df.groupby("cluster").size() / (df["user_id"].nunique() * df["item_id"].nunique())

cluster
1    0.000287
2    0.000540
3    0.000073
4    0.000536
dtype: float64

In [11]:
# Reload both tables and join them on user_id, keeping only the columns needed
# to relate users, items and clusters.
classification_df = pd.read_csv("data/classification_clean.csv", delimiter=";")
playcounts_lowms_df = pd.read_csv("data/playcounts_lowms.csv", delimiter=";")
usergroups_df = pd.merge(
    classification_df,
    playcounts_lowms_df,
    on="user_id",
).loc[:, ["user_id", "item_id", "cluster"]]
usergroups_df.head(n=5)

Unnamed: 0,user_id,item_id,cluster
0,10883488,797752,1
1,10883488,6659905,1
2,10883488,797744,1
3,10883488,797745,1
4,10883488,797748,1


In [12]:
# Distinct item ids listened to by the users of each cluster (1 to 4),
# each collected into a plain Python list.
(
    unique_tracks_u1,
    unique_tracks_u2,
    unique_tracks_u3,
    unique_tracks_u4,
) = (
    usergroups_df.loc[usergroups_df["cluster"] == cluster_id, "item_id"].unique().tolist()
    for cluster_id in (1, 2, 3, 4)
)

In [21]:
# Print, for every ordered pair of clusters, how many distinct tracks they share
# (the diagonal is simply each cluster's own track count).
all_sets = [set(unique_tracks_u1), set(unique_tracks_u2), set(unique_tracks_u3), set(unique_tracks_u4)]
for i, tracks_i in enumerate(all_sets, start=1):
    for j, tracks_j in enumerate(all_sets, start=1):
        # The union of the two sets was previously built here but never used;
        # only the intersection size is reported. The count is an integer, but
        # the "%f" format is kept so the printed output is unchanged.
        print("%d, %d: %f" % (i, j, len(tracks_i & tracks_j)))

1, 1: 275979.000000
1, 2: 98986.000000
1, 3: 35529.000000
1, 4: 89208.000000
2, 1: 98986.000000
2, 2: 377507.000000
2, 3: 30998.000000
2, 4: 142100.000000
3, 1: 35529.000000
3, 2: 30998.000000
3, 3: 92823.000000
3, 4: 43130.000000
4, 1: 89208.000000
4, 2: 142100.000000
4, 3: 43130.000000
4, 4: 387459.000000
