In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib qt
plt.style.use("seaborn")
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 21

plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)  # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)  # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [2]:
import sklearn

In [3]:
# Display the version string of the installed scikit-learn package.
sklearn.__version__

'0.21.2'

In [4]:
# Read the comma-separated user table and use the "user_id" column as index.
users_df = pd.read_csv(
    "feature_engineering/data/low_main_users.txt",
    sep=",",
    index_col="user_id",
)
users_df.head()

Unnamed: 0_level_0,cnt_listeningevents,cnt_distinct_artists,novelty_artist_avg_year,cnt_distinct_tracks,cnt_listeningevents_per_week,M_global_R_APC,country,age,gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1049656,11698.0,792.0,0.427083154519399,3256.0,164.983,0.09606,FI,35,m
1055118,5365.0,440.0,0.524092843135198,1404.0,15.0147,0.0883,US,34,m
1056935,8365.0,136.0,0.162145761506898,2713.0,43.1365,0.076351,UK,30,m
1070023,14118.0,711.0,0.5249859260188209,3089.0,59.963,0.000605,US,32,m
1072752,12749.0,484.0,0.5596268346103338,2899.0,39.7548,0.052467,DK,48,m


In [5]:
# One minus the share of users whose age equals -1 while their gender is not "n".
# NOTE(review): -1 and "n" look like missing-value markers — confirm. If the
# intent is "share of users with both age and gender known", the mask should
# instead require age != -1 AND gender != "n"; logic left unchanged here.
1 - (
    users_df["age"].eq(-1)
    & users_df["gender"].ne("n")
).mean()

0.8066538090646095

In [6]:
# Total number of rows (users) in the user table.
N = users_df.shape[0]

In [7]:
# Number of distinct country values among the users (missing values are not counted).
n_countries = users_df["country"].nunique(dropna=True)

In [8]:
# Count the users in each country; the result is a Series indexed by country.
n_per_country = users_df.groupby("country").size()
n_per_country.head()

country
AR     8
AT    14
AU    51
BE    21
BG     8
dtype: int64

In [9]:
# IDF-style score of every country: log10(total users / users in that country).
# Computed in one vectorised call instead of appending inside a Python loop;
# the result is converted back to a plain list that keeps the country order of
# n_per_country, so it can be paired with n_per_country.index.
idfs = np.log10(N / n_per_country.values).tolist()

In [10]:
# One row per country holding its score; the index is named "country" and
# follows the order of n_per_country.
idfs_df = pd.DataFrame(
    {"score": idfs},
    index=n_per_country.index.rename("country"),
)
idfs_df.head()

Unnamed: 0_level_0,score
country,Unnamed: 1_level_1
AR,2.413719
AT,2.170681
AU,1.609239
BE,1.994589
BG,2.413719


In [11]:
# Order the countries from the lowest to the highest score.
idfs_df = idfs_df.sort_values(by="score", ascending=True)

# Read the semicolon-separated cluster assignments, indexed by "user_id".
classification_df = pd.read_csv(
    "clustering/classification_clean.csv",
    sep=";",
    index_col="user_id",
)

# Inner-join users and cluster labels on their indexes and keep only the
# country and cluster columns.
df = pd.merge(
    users_df, classification_df, left_index=True, right_index=True
).loc[:, ["country", "cluster"]]
df.head()

Unnamed: 0_level_0,country,cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1049656,FI,3
1055118,US,2
1056935,UK,2
1070023,US,2
1072752,DK,1


In [12]:
# The six countries with the lowest score, in ascending order.
idfs_df.sort_values("score").iloc[:6]

Unnamed: 0_level_0,score
country,Unnamed: 1_level_1
US,0.684351
RU,0.976365
DE,0.982355
UK,1.083813
BR,1.173794
PL,1.192957


In [21]:
# Sort inside this cell so the curve does not depend on idfs_df having been
# sorted by an earlier cell, and plot the 1-D score values rather than the
# whole DataFrame.
sorted_scores = idfs_df["score"].sort_values()

# A fresh figure per run keeps re-executions from drawing over an old plot.
fig, ax = plt.subplots()
ax.plot(range(len(sorted_scores)), sorted_scores.values, "-o")
# Dashed horizontal reference line at a score of 1.5.
ax.axhline(y=1.5, linestyle="--", color="black", label="Lower bound (1.5)")
ax.set_ylabel("Country IDF-score")
ax.set_xlabel("Countries")
ax.grid(False)
ax.legend()

<matplotlib.legend.Legend at 0x1d4674ecc50>

In [14]:
# Keep only the users whose country is not one of the six listed codes,
# then show how many users remain.
cleaned_users_df = users_df.loc[
    ~users_df["country"].isin(["US", "RU", "DE", "UK", "BR", "PL"])
]
cleaned_users_df.shape[0]

767

In [15]:
# Read the semicolon-separated cluster assignments and index them by "user_id".
classification_df = pd.read_csv(
    "clustering/classification_clean.csv",
    sep=";",
    index_col="user_id",
)
classification_df.head()

Unnamed: 0_level_0,cluster
user_id,Unnamed: 1_level_1
10883488,1
35212267,3
38189090,3
22113634,3
3704198,3


In [18]:
# Number of rows in the classification table for each cluster label.
classification_df.groupby(by="cluster").size()

cluster
1    396
2    900
3    102
4    675
dtype: int64

In [16]:
# Cluster sizes after the country filter: inner-join the remaining users with
# their cluster labels on the index, then count the rows per cluster.
(
    pd.merge(cleaned_users_df, classification_df, left_index=True, right_index=True)
    .groupby("cluster")
    .size()
)

cluster
1    180
2    306
3     35
4    245
dtype: int64

In [31]:
# Write the filtered user table to a semicolon-separated CSV file.
cleaned_users_df.to_csv(path_or_buf="users_without_top6.csv", sep=";")

In [23]:
# Inner-join the filtered users with their cluster labels on the index and
# keep only the cluster and country columns (in that order).
df = pd.merge(
    cleaned_users_df, classification_df, left_index=True, right_index=True
).loc[:, ["cluster", "country"]]
df.head()

Unnamed: 0_level_0,cluster,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1049656,3,FI
1072752,1,DK
2052756,2,CA
2095434,1,SE
2246867,1,EE


In [24]:
# Count the users per (cluster, country) pair, then keep the first five
# entries of each cluster from the value_counts ordering.
(
    df.groupby("cluster")["country"]
    .value_counts()
    .groupby("cluster")
    .head(5)
)

cluster  country
1        ES         19
         NL         12
         FR         11
         SE         10
         IT          9
2        AU         30
         FI         23
         ES         18
         FR         18
         NL         18
3        JP          4
         ID          3
         NL          3
         TR          3
         BE          2
4        UA         23
         FI         21
         CA         18
         IT         14
         AU         13
Name: country, dtype: int64

In [25]:
# Number of rows of df in each cluster.
df.groupby(by="cluster").size()

cluster
1    180
2    306
3     35
4    245
dtype: int64