In [5]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform



### Discovering groups of correlated metrics

For large numbers of metrics, it's not always possible to spot sensible groupings from the correlation matrix alone. 

This notebook uses hierarchical clustering. The algorithm works like this: 

1) Starting with the correlation matrix for the individual metrics, find the single highest correlation between any two metrics.

2) Create a loading matrix that converts the original dataset into a new one where the two most correlated metrics are grouped, but all others are separate. 

3) Use this new loading matrix to create a new version of the dataset.

4) Calculate the new correlation matrix for the data after the first two metrics are grouped.

5) Start a new iteration of 1-4 (looking for the next highest correlation).

The algorithm stops when enough metrics have been grouped so that nothing is left that is moderately or highly correlated (a parameter controls the level of grouping).

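The code below uses SciPy's `linkage` (single linkage on the dissimilarity `1 - correlation`) to build the hierarchy and `fcluster` to cut it into groups at the chosen threshold.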

In [7]:
def find_correlation_clusters(corr, corr_thresh):
    """Group metrics whose pairwise correlation reaches a threshold.

    Single-linkage hierarchical clustering is run on the dissimilarity
    ``1 - corr`` and the hierarchy is cut at the distance ``1 - corr_thresh``.

    Parameters
    ----------
    corr : square array-like
        Symmetric matrix of pairwise correlations, e.g. the DataFrame
        returned by ``DataFrame.corr()``. It is not modified.
    corr_thresh : float
        Correlation level at which the hierarchy is cut; a higher value
        produces fewer and smaller groups.

    Returns
    -------
    numpy.ndarray
        One integer cluster label per metric, in the row order of ``corr``.

    Raises
    ------
    ValueError
        If ``corr`` is not a square 2-D matrix, contains NaN/inf values, or
        is not symmetric (beyond floating-point noise).
    """
    # Clustering uses dissimilarity rather than correlation.
    # np.array(...) makes a float copy, so the caller's matrix is untouched.
    dissimilarity = 1.0 - np.array(corr, dtype=float)

    if dissimilarity.ndim != 2 or dissimilarity.shape[0] != dissimilarity.shape[1]:
        raise ValueError('corr must be a square 2-D matrix, got shape {}'
                         .format(dissimilarity.shape))

    # A single metric has nothing to be clustered with: it is its own group
    if dissimilarity.shape[0] == 1:
        return np.ones(1, dtype=np.int32)

    if not np.isfinite(dissimilarity).all():
        raise ValueError('corr contains NaN or infinite values '
                         '(for example from a constant metric)')
    if not np.allclose(dissimilarity, dissimilarity.T):
        raise ValueError('corr must be symmetric')

    # squareform() insists on exact symmetry and an exactly-zero diagonal,
    # but correlations are computed in floating point, so the diagonal may be
    # 0.9999999999999999 instead of 1.0. Remove that rounding noise first.
    dissimilarity = (dissimilarity + dissimilarity.T) / 2.0
    np.fill_diagonal(dissimilarity, 0.0)
    diss_thresh = 1.0 - corr_thresh

    # Calculate the order of relative distances between metrics
    hierarchy = linkage(squareform(dissimilarity), method='single')

    # Determine the groups given the hierarchy and threshold
    labels = fcluster(hierarchy, diss_thresh, criterion='distance')

    return labels

In [8]:
def relabel_clusters(labels, metric_columns):
    """Renumber cluster labels so that larger clusters get smaller numbers.

    The cluster with the most members becomes group 0, the next one group 1,
    and so on; clusters of equal size keep the order in which
    ``Counter.most_common`` reports them.

    Parameters
    ----------
    labels : iterable
        Original cluster label of each metric.
    metric_columns : list
        Metric names, aligned element by element with ``labels``.

    Returns
    -------
    tuple
        ``(labeled_column_df, relabeled_count)``: a DataFrame with the columns
        ``'group'`` and ``'column'`` sorted by group and then by metric name,
        and a ``Counter`` mapping each new group number to its member count.
    """
    # Original labels ranked from the most to the fewest members
    ranked_labels = [label for label, _size in Counter(labels).most_common()]
    new_label_of = dict(zip(ranked_labels, range(len(ranked_labels))))

    # Translate every original label, keeping the input order
    relabeled = list(map(new_label_of.__getitem__, labels))

    # One row per metric with its new group, ordered for readability
    group_frame = pd.DataFrame({'group': relabeled, 'column': metric_columns})
    group_frame = group_frame.sort_values(by=['group', 'column'], ascending=True)

    return group_frame, Counter(relabeled)

In [9]:
def make_load_matrix(labeled_column_df, metric_columns, relabled_count, corr):
    """Build the loading matrix that maps original metrics onto groups.

    The result has one row per original metric and one column per group.
    A metric in a group of ``N > 1`` members gets the weight
    ``1 / (sqrt(corr) * N)`` in its group's column; a metric that is alone in
    its group gets the weight 1.0. All other entries are zero.

    Parameters
    ----------
    labeled_column_df : pandas.DataFrame
        Must contain a ``'group'`` column (integer group number, usable as a
        column position in the matrix) and a ``'column'`` column (metric
        name). The columns are looked up by name, so their order is free.
    metric_columns : list
        All metric names; defines the row positions of the matrix.
    relabled_count : mapping
        Number of members of each group, keyed by group number.
    corr : float
        Value whose square root scales the weights of grouped metrics.
        It should be positive: zero or negative values produce inf/NaN
        weights.

    Returns
    -------
    pandas.DataFrame
        Loading matrix indexed by metric name. Group columns are named
        ``'metric_group_<n>'``; a column for an ungrouped metric carries the
        metric's own name. Rows are sorted by descending weight column by
        column, with ties broken by metric name.

    Raises
    ------
    ValueError
        If ``labeled_column_df`` lists a metric that is not in
        ``metric_columns``.
    """
    # Creates an empty matrix to hold the averaging weights
    load_mat = np.zeros((len(metric_columns), len(relabled_count)))

    # Row position of every metric; the first occurrence wins, which matches
    # list.index() but avoids a linear search for every metric
    row_of = {}
    for position, name in enumerate(metric_columns):
        row_of.setdefault(name, position)

    # Enter the weight for each metric in the loading matrix. The two fields
    # are read by column name rather than by position inside the row.
    for group, column in zip(labeled_column_df['group'], labeled_column_df['column']):
        if column not in row_of:
            raise ValueError('{!r} is not in metric_columns'.format(column))
        orig_col = row_of[column]
        members = relabled_count[group]

        # Selects columns in the loading matrix that are groups
        if members > 1:
            load_mat[orig_col, group] = 1.0 / (np.sqrt(corr) * float(members))

        # For non-grouped metrics, the weight is 1.0
        else:
            load_mat[orig_col, group] = 1.0

    # Make a Boolean series showing which columns are groups
    is_group = load_mat.astype(bool).sum(axis=0) > 1

    # Make the column names 'metric_group_n' for the groups, otherwise just the metric name
    column_names = ['metric_group_{}'.format(d + 1) if is_group[d]
                    else labeled_column_df.loc[labeled_column_df['group'] == d, 'column'].iloc[0]
                    for d in range(0, load_mat.shape[1])]

    # Make a DataFrame from the weighted matrix
    loadmat_df = pd.DataFrame(load_mat, index=metric_columns, columns=column_names)

    # Temporary column holding the metric name, used as the final tie-breaker.
    # An ungrouped metric may itself be called 'name', so pick a key that
    # cannot overwrite one of the weight columns.
    sort_key = 'name'
    while sort_key in loadmat_df.columns:
        sort_key = '_' + sort_key
    sortable = loadmat_df.copy()
    sortable[sort_key] = sortable.index

    # Sort for interpretability: weights descending, then name ascending
    sort_cols = list(sortable.columns)
    sort_order = [False] * len(column_names) + [True]
    sortable = sortable.sort_values(sort_cols, ascending=sort_order)

    # Drop the name column since this was just used for sorting
    return sortable.drop(sort_key, axis=1)

In [10]:
# Correlation threshold used both to cut the cluster hierarchy and to scale
# the weights of grouped metrics
group_corr_thresh = 0.5
scores_path = "../metric-scores/socialnet_dataset_scores.csv"

# Load the metric scores (the first two CSV columns form the index) and drop
# the 'is_churn' column so that it is not clustered with the metrics
score_data = pd.read_csv(scores_path, index_col=[0, 1]).drop(columns='is_churn')
metric_columns = score_data.columns.tolist()

# Cluster the correlation matrix, renumber the clusters by size, then build
# the loading matrix from the renumbered groups
labels = find_correlation_clusters(score_data.corr(), group_corr_thresh)
labeled_column_df, relabeled_count = relabel_clusters(labels, metric_columns)
loadmat_df = make_load_matrix(
    labeled_column_df,
    metric_columns,
    relabeled_count,
    group_corr_thresh,
)

In [11]:
# Display the loading matrix: one row per original metric, one column per
# metric group or ungrouped metric
loadmat_df

Unnamed: 0,metric_group_1,metric_group_2,account_tenure,dislike_per_month,unfriend_per_month
adview_per_month,0.353553,0.0,0.0,0.0,0.0
like_per_month,0.353553,0.0,0.0,0.0,0.0
newfriend_per_month,0.353553,0.0,0.0,0.0,0.0
post_per_month,0.353553,0.0,0.0,0.0,0.0
message_per_month,0.0,0.707107,0.0,0.0,0.0
reply_per_month,0.0,0.707107,0.0,0.0,0.0
account_tenure,0.0,0.0,1.0,0.0,0.0
dislike_per_month,0.0,0.0,0.0,1.0,0.0
unfriend_per_month,0.0,0.0,0.0,0.0,1.0


With a threshold of 0.5, the algorithm has clustered `adview_per_month`, `like_per_month`, `newfriend_per_month` and `post_per_month` into one group, and `message_per_month` and `reply_per_month` into a second group.

In [12]:
# Save the loading matrix as a CSV file; the path is relative, so the file is
# written to the current working directory:
loadmat_df.to_csv('socialnet_dataset_scores_load_mat.csv')

In [13]:
# Save the group lists:
# Build one '|'-separated string of member metrics per group. The group
# numbers are visited in ascending order so that each string lines up with the
# loading-matrix column used as its index label below; iterating a bare set()
# would not guarantee any particular order.
group_lists = ['|'.join(labeled_column_df.loc[labeled_column_df['group'] == g, 'column'])
               for g in sorted(set(labeled_column_df['group']))]

(
    pd.DataFrame(group_lists,
                 index=loadmat_df.columns.values,
                 columns=['metrics'])
    .to_csv('socialnet_dataset_scores_groupmets.csv')
)