In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlalchemy
import pycountry
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
%matplotlib qt
plt.style.use("default")
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 21

"""plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)  # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)  # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
"""


"plt.rc('font', size=SMALL_SIZE)  # controls default text sizes\nplt.rc('axes', titlesize=MEDIUM_SIZE)  # fontsize of the axes title\nplt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels\nplt.rc('xtick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels\nplt.rc('ytick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels\nplt.rc('legend', fontsize=MEDIUM_SIZE)  # legend fontsize\nplt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title\n"

In [3]:
SQL_CREDENTIALS = "root:1234"
engine = sqlalchemy.create_engine('mysql+pymysql://' + SQL_CREDENTIALS + '@localhost:3306/music_recommender_db')


# preparations

In [4]:
"""
use all 2074 users
"""
users_df = pd.read_csv("feature_engineering/data/low_main_users.txt", sep=",", index_col="user_id", usecols=["country", "user_id"])
users_df.head()

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
1049656,FI
1055118,US
1056935,UK
1070023,US
1072752,DK


In [5]:
"""
exclude top 6 countries (US, RU, DE, UK BR, PL) and use only 767 users 
"""
users_df = pd.read_csv("users_without_top6.csv", sep=";", index_col="user_id", usecols=["country", "user_id"])
users_df.head()

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
1049656,FI
1072752,DK
2052756,CA
2095434,SE
2246867,EE


In [6]:
len(users_df)

767

In [7]:
stmt = "SELECT * FROM hofstede"
hofstede_df = pd.read_sql(con=engine, sql=stmt).set_index("country")
hofstede_df.drop("ctr", axis=1, inplace=True)

hofstede_df.replace("\\N", np.nan, inplace=True)
hofstede_df["power_distance"] = hofstede_df["power_distance"].astype(float)
hofstede_df["individualism"] = hofstede_df["individualism"].astype(float)
hofstede_df["masculinity"] = hofstede_df["masculinity"].astype(float)
hofstede_df["uncertainty_avoidance"] = hofstede_df["uncertainty_avoidance"].astype(float)
hofstede_df["indulgence"] = hofstede_df["indulgence"].astype(float)
hofstede_df["long_term_orientation"] = hofstede_df["long_term_orientation"].astype(float)

hofstede_df = pd.DataFrame(MinMaxScaler().fit_transform(hofstede_df), index=hofstede_df.index, columns=hofstede_df.columns)

hofstede_df.head()

  result = self._query(query)


Unnamed: 0_level_0,power_distance,individualism,masculinity,uncertainty_avoidance,long_term_orientation,indulgence
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Argentina,0.408602,0.43038,0.485714,0.707865,0.093333,0.563218
Austria,0.0,0.544304,0.704762,0.52809,0.626667,0.574713
Australia,0.290323,0.987342,0.533333,0.314607,0.106667,0.666667
Belgium,0.580645,0.797468,0.466667,0.797753,0.92,0.505747
Bulgeria,0.634409,0.227848,0.333333,0.696629,0.746667,0.034483


In [8]:
stmt = "SELECT *  FROM world_happiness"
world_happiness_df = pd.read_sql(con=engine, sql=stmt)
world_happiness_df = world_happiness_df.sort_values(by="year", ascending=False).groupby(by="country").head(1).set_index("country")
world_happiness_df.sort_index(inplace=True)
#print(world_happiness_df.isna().sum() / len(world_happiness_df))
world_happiness_df.drop(["GINI index (World Bank estimate)", "Democratic Quality", "Delivery Quality"], axis=1, inplace=True)
world_happiness_df = pd.DataFrame(MinMaxScaler().fit_transform(world_happiness_df), index=world_happiness_df.index, columns=world_happiness_df.columns)
world_happiness_df.head()

  return self.partial_fit(X, y)


Unnamed: 0_level_0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Standard deviation of ladder by country-year,Standard deviation/Mean of ladder by country-year,"GINI index (World Bank estimate), average 2000-15","gini of household income reported in Gallup, by wp5-year"
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Afghanistan,1.0,0.0,0.164726,0.263522,0.247373,0.206834,0.205734,1.0,0.228615,0.486698,0.170214,0.065327,0.464856,,0.067604
Albania,1.0,0.385803,0.54232,0.489394,0.767193,0.665255,0.28267,0.90166,0.540064,0.415847,0.392901,0.657086,0.504877,0.187369,0.271894
Algeria,1.0,0.504667,0.575179,0.749477,0.662917,0.22056,0.110288,0.680044,0.490956,0.330366,,0.347564,0.266499,0.118758,0.464935
Angola,0.727273,0.22103,0.417658,0.669265,0.251142,0.132274,0.150039,0.848808,0.376634,0.480147,0.522744,0.423191,0.505849,0.498951,0.32171
Argentina,1.0,0.658849,0.635023,0.903238,0.720134,0.782283,0.119332,0.857576,0.792589,0.336056,0.220348,0.525645,0.279498,0.622493,0.244995


In [9]:
def cc_to_name(cc):
    country = pycountry.countries.get(alpha_2=cc)
    if country:
        return country.name
    else:
        return None

In [10]:
users_df["country"] = users_df["country"].apply(lambda cc: cc_to_name(cc))
users_df.head()

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
1049656,Finland
1072752,Denmark
2052756,Canada
2095434,Sweden
2246867,Estonia


In [11]:
classification_df = pd.read_csv("clustering/classification_clean.csv", sep=";", index_col="user_id")
classification_df.columns = ["prediction"]
classification_df.head()

Unnamed: 0_level_0,prediction
user_id,Unnamed: 1_level_1
10883488,1
35212267,3
38189090,3
22113634,3
3704198,3


In [12]:
users_df = users_df.merge(classification_df, left_index=True, right_index=True)
users_df.head()

Unnamed: 0_level_0,country,prediction
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1049656,Finland,3
1072752,Denmark,1
2052756,Canada,2
2095434,Sweden,1
2246867,Estonia,1


# hofstede

In [13]:
df_h = users_df.merge(hofstede_df, left_on="country", right_index=True)
df_h.columns= ["country", "prediction", "Power distance", "Individualism", "Masculinity", "Uncertainty avoidance", "Long-term orientation", "Indulgence"]
df_h.head()

Unnamed: 0_level_0,country,prediction,Power distance,Individualism,Masculinity,Uncertainty avoidance,Long-term orientation,Indulgence
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1049656,Finland,3,0.236559,0.64557,0.2,0.404494,0.333333,0.505747
2547668,Finland,4,0.236559,0.64557,0.2,0.404494,0.333333,0.505747
3822486,Finland,4,0.236559,0.64557,0.2,0.404494,0.333333,0.505747
3847834,Finland,2,0.236559,0.64557,0.2,0.404494,0.333333,0.505747
4768471,Finland,2,0.236559,0.64557,0.2,0.404494,0.333333,0.505747


In [66]:
%matplotlib qt
for dim in ["power_distance", "individualism", "masculinity", "uncertainty_avoidance", "long_term_orientation", "indulgence"]:
    df_h[["prediction", dim]].boxplot(by="prediction")

In [18]:
%matplotlib qt
df = df_h[["Power distance", "Individualism", "Masculinity", "Uncertainty avoidance", "Long-term orientation", "Indulgence"]].stack().reset_index()
df.columns = ["user_id", "acoustic_features", "value"]
df = df.merge(classification_df, left_on="user_id", right_index=True)
g = sns.boxplot(x="value", y="acoustic_features", hue="prediction", data=df, showfliers=False)
#g.legend_.remove()
g.legend_.set_title("")
# replace labels
plt.legend(loc="upper right")
#new_labels = [r"$\tilde{U}_{C_1}$", r"$\tilde{U}_{C_2}$", r"$\tilde{U}_{C_3}$", r"$\tilde{U}_{C_4}$"]
new_labels = [r"$U_{C_1}$", r"$U_{C_2}$", r"${U}_{C_3}$", r"${U}_{C_4}$"]
for t, l in zip(g.legend_.texts, new_labels):
    t.set_text(l)
plt.ylabel("")
plt.xlabel("")
plt.grid(False)

# world happiness

In [19]:
df_wh = users_df.merge(world_happiness_df, left_on="country", right_index=True)
df_wh = df_wh[["country", "prediction", "Life Ladder", "Log GDP per capita", "Social support", "Healthy life expectancy at birth", "Freedom to make life choices", "Generosity", "Perceptions of corruption"]]
df_wh.columns = ["country", "prediction", "Happiness", "GDP", "Social support", "Healthy life expectancy", "Freedom", "Generosity", "Perception of corruption"]
df_wh.head()

Unnamed: 0_level_0,country,prediction,Happiness,GDP,Social support,Healthy life expectancy,Freedom,Generosity,Perception of corruption
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1049656,Finland,3,1.0,0.786729,0.991125,0.849474,0.967347,0.307487,0.042488
2547668,Finland,4,1.0,0.786729,0.991125,0.849474,0.967347,0.307487,0.042488
3822486,Finland,4,1.0,0.786729,0.991125,0.849474,0.967347,0.307487,0.042488
3847834,Finland,2,1.0,0.786729,0.991125,0.849474,0.967347,0.307487,0.042488
4768471,Finland,2,1.0,0.786729,0.991125,0.849474,0.967347,0.307487,0.042488


In [69]:
for dim in ["Life Ladder", "Log GDP per capita", "Social support", "Healthy life expectancy at birth", "Freedom to make life choices", "Generosity", "Perceptions of corruption"]:
    df_wh[["prediction", dim]].boxplot(by="prediction")

In [25]:
#df = df_wh[["Life Ladder", "Log GDP per capita", "Social support", "Healthy life expectancy at birth", "Freedom to make life choices", "Generosity", "Perceptions of corruption"]].stack().reset_index()
df = df_wh[["Happiness", "GDP", "Social support", "Healthy life expectancy", "Freedom", "Generosity", "Perception of corruption"]].stack().reset_index()
df.columns = ["user_id", "acoustic_features", "value"]
df = df.merge(classification_df, left_on="user_id", right_index=True)
g = sns.boxplot(x="value", y="acoustic_features", hue="prediction", data=df, showfliers=False)
#g.legend_.remove()
g.legend_.set_title("")
# replace labels
plt.legend(loc="upper left")
new_labels = [r"${U}_{C_1}$", r"${U}_{C_2}$", r"${U}_{C_3}$", r"${U}_{C_4}$"]
for t, l in zip(g.legend_.texts, new_labels):
    t.set_text(l)
plt.grid(False)
plt.ylabel("")
plt.xlabel("")

Text(0.5, 0, '')

In [66]:
df_wh[["Life Ladder", "Log GDP per capita", "Social support", "Healthy life expectancy at birth", "Freedom to make life choices", "Generosity", "Perceptions of corruption"]].min()

Life Ladder                          4.046111
Log GDP per capita                   8.768456
Social support                       0.606767
Healthy life expectancy at birth    59.502628
Freedom to make life choices         0.438300
Generosity                          -0.296735
Perceptions of corruption            0.181148
dtype: float64

In [113]:
fractions = df_h.groupby("prediction")["long_term_orientation"].value_counts() / df_h.groupby("prediction")["long_term_orientation"].size()
fractions

prediction  long_term_orientation
1           0.466667                 0.115152
            0.293333                 0.084848
            0.720000                 0.072727
            0.666667                 0.066667
            0.533333                 0.060606
                                       ...   
4           0.426667                 0.008889
            0.093333                 0.004444
            0.200000                 0.004444
            0.266667                 0.004444
            0.746667                 0.004444
Name: long_term_orientation, Length: 99, dtype: float64

In [114]:
fractions.groupby("prediction").head()

prediction  long_term_orientation
1           0.466667                 0.115152
            0.293333                 0.084848
            0.720000                 0.072727
            0.666667                 0.066667
            0.533333                 0.060606
2           0.106667                 0.106007
            0.333333                 0.091873
            0.466667                 0.063604
            0.666667                 0.063604
            0.720000                 0.063604
3           1.000000                 0.121212
            0.440000                 0.090909
            0.653333                 0.090909
            0.720000                 0.090909
            0.266667                 0.060606
4           0.973333                 0.102222
            0.333333                 0.093333
            0.306667                 0.080000
            0.640000                 0.062222
            0.106667                 0.057778
Name: long_term_orientation, dtype: float64