# cluster correlation on weights

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the long-format weight table (one row per user/cluster pair); only the
# three columns used below are read from the semicolon-separated file.
weight_columns = ["user_id", "cluster", "weight"]
weights_df = pd.read_csv("weights.csv", sep=";", usecols=weight_columns)
weights_df.head(n=5)

Unnamed: 0,user_id,cluster,weight
0,1049656,0,0.119843
1,1049656,1,0.337917
2,1049656,2,0.281925
3,1049656,3,0.260314
4,1055118,0,0.036036


In [3]:
# Wide layout: one row per user_id, one column per cluster. User/cluster
# combinations absent from the long table come out of the pivot as NaN and are
# replaced by a weight of 0.
df = weights_df.pivot(index="user_id", columns="cluster", values="weight").fillna(0)

In [4]:
# Preview the first rows of the wide (user x cluster) weight table.
df.head(n=5)

cluster,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1049656,0.119843,0.337917,0.281925,0.260314
1055118,0.036036,0.936937,0.004505,0.022523
1056935,0.121143,0.693714,0.041143,0.144
1070023,0.054945,0.717033,0.035714,0.192308
1072752,0.185864,0.494764,0.102094,0.217277


In [5]:
# Back to long layout: one row per (user_id, cluster) pair with its weight.
# NOTE: this rebinds `df`, so the cell is not idempotent -- re-run from the
# pivot cell rather than executing it twice.
df = df.stack().rename("weight").reset_index()
# Shift the cluster ids up by one (0..3 in the input becomes 1..4).
df["cluster"] = df["cluster"] + 1
df.head(n=5)

Unnamed: 0,user_id,cluster,weight
0,1049656,1,0.119843
1,1049656,2,0.337917
2,1049656,3,0.281925
3,1049656,4,0.260314
4,1055118,1,0.036036


In [6]:
# Load the per-user class predictions. The file's own header row is discarded
# (header=0) and the two columns are named user_id / prediction instead.
classification_df = pd.read_csv(
    "classification_clean.csv",
    sep=";",
    header=0,
    names=["user_id", "prediction"],
)
classification_df.head(n=5)

Unnamed: 0,user_id,prediction
0,10883488,1
1,35212267,3
2,38189090,3
3,22113634,3
4,3704198,3


In [7]:
# Number of users assigned to each predicted class.
classification_df.groupby(by="prediction").size()

prediction
1    396
2    900
3    102
4    675
dtype: int64

In [8]:
# Attach each user's predicted class to every one of its weight rows. This is
# an inner join, so users missing from either frame are dropped.
df = df.merge(classification_df, on="user_id", how="inner")
df.head(n=5)

Unnamed: 0,user_id,cluster,weight,prediction
0,1049656,1,0.119843,3
1,1049656,2,0.337917,3
2,1049656,3,0.281925,3
3,1049656,4,0.260314,3
4,1055118,1,0.036036,2


In [9]:
%matplotlib qt 
for c, group in df.groupby(by="prediction"):
    plt.figure()
    correlation_mat = group.pivot(index="user_id", columns="cluster", values="weight").corr()
    ax = sns.heatmap(correlation_mat, annot=True, vmax=1, vmin=-1)
    plt.title("weights U" + str(c))
    ax.set_xticklabels([1, 2, 3, 4])
    ax.set_yticklabels([1, 2, 3, 4])
    ax.set_ylim(4, 0)

In [10]:
# Hard-coded IDF score per cluster id (1..4), turned into a one-column frame
# indexed by cluster so it can be joined onto the weight table.
cluster_idfs = {
    1: 1.053797369666461,
    2: 0.18646370555351036,
    3: 1.3061778461357676,
    4: 0.675086198153002,
}
idf_df = pd.DataFrame({"score": pd.Series(cluster_idfs)})
idf_df.index.name = "cluster"
idf_df

Unnamed: 0_level_0,score
cluster,Unnamed: 1_level_1
1,1.053797
2,0.186464
3,1.306178
4,0.675086


In [11]:
# Look up each row's cluster IDF score (idf_df is indexed by cluster), scale
# the weight by it, and keep only the columns needed for plotting.
weights_with_idf_df = (
    df.merge(idf_df, left_on="cluster", right_index=True)
    .assign(idf_weight=lambda merged: merged["score"] * merged["weight"])
    .loc[:, ["user_id", "cluster", "idf_weight", "prediction"]]
)
weights_with_idf_df.head(n=5)

Unnamed: 0,user_id,cluster,idf_weight,prediction
0,1049656,1,0.12629,3
4,1055118,1,0.037975,2
8,1056935,1,0.12766,2
12,1070023,1,0.057901,2
16,1072752,1,0.195863,1


In [14]:
%matplotlib qt 
for c, group in weights_with_idf_df.groupby(by="prediction"):
    group[["cluster", "idf_weight"]].boxplot(by="cluster")
    plt.title("U" + str(c))
    ax.set_ylim(4, 0)

In [15]:
%matplotlib qt 
for c, group in df.groupby(by="prediction"):
    group[["cluster", "weight"]].boxplot(by="cluster")
    plt.title("U" + str(c))
    ax.set_ylim(4, 0)

In [12]:
%matplotlib qt
sns.set(font_scale=1.3)
correlation_mat = df.pivot(index="user_id", columns="cluster", values="weight").corr()
ax = sns.heatmap(correlation_mat, annot=True, vmax=1, vmin=-1)
plt.xlabel("")
plt.ylabel("")
#plt.title("weights")
ax.set_xticklabels([r"$U_{C_1}$", r"$U_{C_2}$", r"$U_{C_3}$", r"$U_{C_4}$"])
ax.set_yticklabels([r"$U_{C_1}$", r"$U_{C_2}$", r"$U_{C_3}$", r"$U_{C_4}$"])
ax.set_ylim(4, 0)


(4, 0)

In [15]:
# Numeric version of the overall heatmap: pairwise correlation of the cluster
# weights across all users.
weights_by_cluster = df.pivot(index="user_id", columns="cluster", values="weight")
weights_by_cluster.corr()

cluster,1,2,3,4
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,-0.270844,0.237172,-0.33952
2,-0.270844,1.0,-0.718734,-0.7712
3,0.237172,-0.718734,1.0,0.332007
4,-0.33952,-0.7712,0.332007,1.0
