## Description

The challenge description mentions that the dataset may contain **potential redundancy & strong feature correlation**, so it is interesting to analyse the relationships among the different features in the dataset by clustering them with different approaches.

First, I will cluster the features in the train set based on how they are correlated with each other. Then I will use the features' metadata to cluster them based on their tags.

Since the first approach works with numerical data and the second with categorical data, I will use the following techniques:

* kmeans
* kmodes
* PCA
* MCA

In [None]:
# Required installs
!pip install prince

# Required imports
import numpy as np
import pandas as pd
# Show every column and row and never truncate cell contents.
# None (not -1) is the supported "no limit" value for display.max_colwidth.
pd.set_option('display.max_columns', None, 'display.max_rows', None, 'display.max_colwidth', None)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
from kmodes import kmodes

from sklearn.decomposition import PCA
import prince

import warnings
warnings.filterwarnings("ignore")

## 1- Clustering Features based on their Correlation

<u>Steps followed</u>:

* 1.1- Calculating the Correlation across the different features.
* 1.2- Determining the number of clusters
* 1.3- Clustering the Correlation matrix using Kmeans
* 1.4- Visualizing features once clustered with a correlation matrix heatmap
* 1.5- Visualizing features with a 2D representation after applying PCA on the kmeans results

In [None]:
# Load the training set.
train_data = pd.read_csv('../input/jane-street-market-prediction/train.csv')

# Positional slice: keep the columns from position 8 up to (not including) the
# last one. fillna() returns a new frame, so the missing values are zero-filled
# without writing through a slice of train_data.
feat_df = train_data.iloc[:, 8:-1].fillna(0)
feat_names = feat_df.columns
feat_df.head()

### 1.1- Calculating the Correlation across the different features

I dropped ```feature_0``` since it seems to be a sign flag. Below there is a heat map of the correlation matrix of the other features.

In [None]:
# Calculating feature correlation
# corr_feat_df keeps the labelled correlation table; corr_feat_mtx is the same
# data as a plain numpy array (the form that is clustered later on).
corr_feat_df = feat_df.corr()
corr_feat_mtx = corr_feat_df.to_numpy()
# Heat map of the correlation matrix in its original feature order
plt.figure()
plt.imshow(corr_feat_mtx, interpolation='nearest')
plt.colorbar()
plt.title('Feature correlation')

### 1.2- Determining the number of clusters

By plotting how the clustering cost (WCSS) evolves as the number of clusters increases, we can see that the best **number of clusters is 5**.

In [None]:
# Elbow method for KMeans: fit one model per candidate cluster count and keep
# the within-cluster sum of squares (inertia) of each fit
max_num_clusters = 15
candidate_counts = range(1, max_num_clusters)
kmeans_params = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)

wcss = []
for i in candidate_counts:
    kmeans = KMeans(n_clusters=i, **kmeans_params).fit(corr_feat_mtx)
    wcss.append(kmeans.inertia_)

# Cost curve: the "elbow" marks the point where extra clusters stop paying off
plt.plot(candidate_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

### 1.3- Clustering the Correlation matrix using Kmeans

The correlation matrix is clustered using 5 clusters in Kmeans. Below you can see which cluster each feature has been assigned to and the distance of each feature to its cluster centroid.

In [None]:
# Fit the final KMeans model on the correlation matrix and label every feature
n_clusters_kmeans = 5
kmeans = KMeans(n_clusters=n_clusters_kmeans, init='k-means++', max_iter=300, n_init=10, random_state=0)
corr_feat_labels = kmeans.fit_predict(corr_feat_mtx)

# Summary table with one row per cluster: the comma-separated list of member
# features and the number of features in the cluster
corr_feat_clust_df = pd.DataFrame(np.c_[feat_names, corr_feat_labels], columns=["feature", "cluster"])
corr_feat_clust_df['feat_list'] = (
    corr_feat_clust_df
    .groupby(["cluster"])["feature"]
    .transform(lambda names: ', '.join(names))
)
corr_feat_clust_df = (
    corr_feat_clust_df
    .groupby(["cluster", "feat_list"])
    .size()
    .reset_index(name='feat_count')
)
corr_feat_clust_df


In [None]:
# Distance from every feature to every centroid; the row-wise minimum is the
# distance to the closest centroid
corr_node_dist = kmeans.transform(corr_feat_df)
closest_dist = corr_node_dist.min(axis=1)

# Table columns: feature name, rounded distance, distance scaled by the largest
# one, and the cluster label
corr_clust_dist = np.c_[
    feat_names,
    np.round(closest_dist, 3),
    np.round(closest_dist / np.max(closest_dist), 3),
    corr_feat_labels,
]
corr_clust_dist_df = pd.DataFrame(
    corr_clust_dist,
    columns=['feature', 'dist_corr', 'dist_corr_norm', 'cluster_corr'],
)
corr_clust_dist_df

### 1.4- Visualizing features once clustered with a correlation matrix heatmap

Then the correlation matrix was reordered so that features belonging to the same cluster stay together in the heatmap.

In [None]:
# Rearrange a correlation matrix so its rows and columns follow a given feature order
def clustering_corr_matrix(corrMatrix, clustered_features):
    """Return the correlation values reordered according to ``clustered_features``.

    Parameters
    ----------
    corrMatrix : pandas.DataFrame
        Square correlation matrix.
    clustered_features : sequence of int
        Feature numbers in the desired output order. Number ``k`` addresses
        row/column ``k - 1`` of ``corrMatrix``.

    Returns
    -------
    numpy.ndarray
        Float matrix with the same side length as ``corrMatrix`` where entry
        ``[n, m]`` holds the correlation between the n-th and m-th listed
        features. Positions not reached by the listing stay at zero.
    """
    values = corrMatrix.to_numpy()
    side = len(values)
    reordered = np.zeros(shape=(side, side))
    for row, src_row in enumerate(clustered_features):
        for col, src_col in enumerate(clustered_features):
            # TODO: remove the -1 if including again feature 0
            reordered[row, col] = values[src_row - 1, src_col - 1]
    return reordered

# Sort the features by cluster label and rebuild the correlation matrix in that order
def processing_clusterd_corr_matrix(feat_labels, corrMatrix):
    """Reorder ``corrMatrix`` so that features sharing a label sit next to each other.

    Parameters
    ----------
    feat_labels : iterable
        Cluster label of each column of ``corrMatrix``, in column order.
    corrMatrix : pandas.DataFrame
        Correlation matrix whose column names look like ``feature_<number>``.

    Returns
    -------
    numpy.ndarray
        The matrix produced by ``clustering_corr_matrix`` for the label-sorted
        feature numbers. The number of sorted features is printed as a side effect.
    """
    labels = list(feat_labels)
    columns = corrMatrix.columns

    # Feature number (as text, "feature_" prefix removed) -> cluster label
    label_by_feature = {
        columns[pos].replace("feature_", ""): label
        for pos, label in enumerate(labels)
    }

    # sorted() is stable, so features keep their original order inside a cluster
    ordered_pairs = sorted(label_by_feature.items(), key=lambda pair: pair[1])
    clustered_features = [int(number) for number, _ in ordered_pairs]

    print(len(clustered_features))
    return clustering_corr_matrix(corrMatrix, clustered_features)

# Function to plot the clustered correlation matrix
def plot_clustered_matrix(clust_mtx, feat_clust_list):
    """Draw the reordered correlation matrix and outline each cluster block.

    Parameters
    ----------
    clust_mtx : 2D array-like
        Correlation matrix whose rows/columns are already grouped by cluster.
    feat_clust_list : iterable of numbers
        Number of features in each cluster, listed in the order in which the
        clusters appear along the diagonal of ``clust_mtx``.
    """
    # plt.subplots creates the figure itself; a separate plt.figure() call
    # beforehand would only leave an empty figure behind.
    fig, ax = plt.subplots(1)

    im = ax.imshow(clust_mtx, interpolation='nearest')

    # imshow centres each cell on integer coordinates, so the outer edge of the
    # first cell is at -0.5; starting there makes the outlines follow cell borders.
    corner = -0.5
    for s in feat_clust_list:
        side = float(s)
        rect = patches.Rectangle((corner, corner), side, side, angle=0.0, linewidth=2, edgecolor='r', facecolor='none')
        # Each rectangle is added exactly once
        ax.add_patch(rect)
        corner += side

    fig.colorbar(im)

    plt.title('Clusterd Feature by Correlation')
    plt.show()
    

In [None]:
# Plotting Clustered Correlation Matrix Heat Map
# Reorder the correlation matrix by cluster label, then draw it with one
# outline per cluster (sizes come from the per-cluster feature counts)
cluster_sizes = corr_feat_clust_df['feat_count'].to_numpy()
clust_mtx = processing_clusterd_corr_matrix(corr_feat_labels, corr_feat_df)
plot_clustered_matrix(clust_mtx, cluster_sizes)

### 1.5- Visualizing features with a 2D representation after applying PCA on the kmeans results
PCA was applied to the feature correlation matrix to reduce the number of dimensions to two. The same was applied to the centroids of the clusters. Finally, both features and centroids were plotted together to see the distribution of the different clusters.

In [None]:
# Scatter plot of the different centroids along with observations once clustered
def plotting_scatter(n_clusters, centroids, labels_mtx, title):
    """Plot clustered observations and their centroids in two dimensions.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; cluster ids ``0 .. n_clusters - 1`` are plotted.
    centroids : 2D array
        Row ``i`` holds the (x, y) coordinates of the centroid of cluster ``i``.
    labels_mtx : 2D array
        One row per observation: columns 0 and 1 are the (x, y) coordinates and
        column 2 is the cluster id.
    title : str
        Title of the plot.
    """
    # Line width / opacity of the observations, marker size / opacity of the centroids
    obsv_lw, obsv_alp = 2, .9
    cntr_lw, cntr_apl = 55, .55

    # Generating cluster names for the legend and one colour per cluster
    target_names = ['k' + str(i) for i in range(n_clusters)]
    cluster_colors = cm.rainbow(np.linspace(0, 1, n_clusters))

    # Open the figure before drawing, so the plot gets its own canvas and no
    # empty figure is left behind once the function returns.
    plt.figure()

    # Centroids first (large translucent discs) so the observations are drawn on top
    for i, color in enumerate(cluster_colors):
        plt.scatter(centroids[i, 0], centroids[i, 1], color=color, alpha=cntr_apl, s=cntr_lw**2)

    # Observations, one scatter call per cluster so each gets a legend entry
    for i, (color, target_name) in enumerate(zip(cluster_colors, target_names)):
        cur_label = labels_mtx[np.where(labels_mtx[:, 2] == i)]
        plt.scatter(cur_label[:, 0], cur_label[:, 1], color=color, alpha=obsv_alp, lw=obsv_lw, label=target_name)

    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(title)

In [None]:
# Visualizing the dispersion of each cluster in "2D"
# The PCA model is fitted once, on the feature correlation matrix, and that same
# fitted model projects both the features and the KMeans centroids. Re-fitting
# it on the centroids alone would place them on unrelated axes, so they could
# not be compared with the feature points on one plot.
pca_2 = PCA(n_components=2)
pca_2.fit(corr_feat_mtx)
corr_pca = pca_2.transform(corr_feat_mtx)
corr_centr_pca = pca_2.transform(kmeans.cluster_centers_)

# Concatenating the pca values with their labels
corr_labels_mtx = np.c_[corr_pca, corr_feat_labels]
plotting_scatter(n_clusters_kmeans, corr_centr_pca, corr_labels_mtx, 'PCA of Clustered Features')

## 2- Clustering Features based on their Tags

<u>Steps followed</u>:

* 2.1- Determining the number of clusters
* 2.2- Clustering the Tag matrix using Kmodes
* 2.3- Visualizing features with a 2D representation after applying MCA on the Kmodes results

In [None]:
# Feature metadata table; skiprows=[1] leaves out the first data row of the file
tags_feat_df = pd.read_csv('../input/jane-street-market-prediction/features.csv', skiprows=[1])

# Replace the boolean values by the strings "True"/"False"
tags_feat_df = tags_feat_df.replace({False: "False", True: "True"})

# Shorten the identifiers by removing the 'eature_' part of each name
tags_feat_df['feature'] = tags_feat_df['feature'].map(lambda name: name.replace('eature_', ''))
tags_feat_df.head()

### 2.1- Determining the number of clusters
Using Kmodes with the Cao initialization method, the elbow plot below indicates that 4 is the best number of clusters. The first 6 tags are not taken into account since they show some sort of cycle in most of the features.

In [None]:
# Looking for best number of clusters: record the KModes cost of each candidate count
# NOTE: the loop bound is max_num_clusters (set in the KMeans elbow cell);
# max_clust is assigned here but not read by this loop.
max_clust = 15
tags_for_clustering = tags_feat_df.iloc[:, 7:]

cost = []
for num_clusters in range(1, max_num_clusters):
    kmode = kmodes.KModes(n_clusters=num_clusters, init="Cao", n_init=5, verbose=1)
    kmode.fit_predict(tags_for_clustering)
    cost.append(kmode.cost_)

In [None]:
# Elbow plot for KModes: clustering cost against the number of clusters.
# The x range must match the one used to fill `cost` in the loop above.
plt.plot(range(1, max_num_clusters), cost)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.show()

### 2.2- Clustering the Tags Matrix using Kmodes

The Tags Matrix is clustered using 4 clusters in Kmodes. Below you can see which cluster each feature has been assigned to and the distance of each feature to its cluster centroid.

In [None]:
# Fit the final KModes model on the tag columns and label every feature
n_clusters_kmodes = 4
kmode_cao = kmodes.KModes(n_clusters=n_clusters_kmodes, init="Cao", n_init=10, verbose=1)
tags_feat_labels = kmode_cao.fit_predict(tags_feat_df.iloc[:, 7:])

# Summary table with one row per cluster: the comma-separated list of member
# features and the number of features in the cluster
tags_feat_clust_df = pd.DataFrame(np.c_[feat_names, tags_feat_labels], columns=["feature", "cluster"])
tags_feat_clust_df['feat_list'] = (
    tags_feat_clust_df
    .groupby(["cluster"])["feature"]
    .transform(lambda names: ', '.join(names))
)
tags_feat_clust_df = (
    tags_feat_clust_df
    .groupby(["cluster", "feat_list"])
    .size()
    .reset_index(name='feat_count')
)
tags_feat_clust_df

In [None]:
def nodes_distances(nodes_clustered, centroids):
    """Count, for every node, how many attributes differ from its cluster centroid.

    Parameters
    ----------
    nodes_clustered : iterable of sequences
        Each node is laid out as ``[identifier, attr_1, ..., attr_k, cluster_label]``.
    centroids : indexable
        ``centroids[label]`` is the centroid (``k`` attribute values) of that cluster.

    Returns
    -------
    numpy.ndarray
        One row per node: ``[identifier, cluster_label, mismatch_count]``.
    """
    rows = [
        # node[-1] selects the centroid of the node's own cluster; the
        # element-wise != marks the attributes that differ from it
        [node[0], node[-1], np.sum(centroids[node[-1]] != node[1:-1])]
        for node in nodes_clustered
    ]
    return np.array(rows)


# One row per feature: [feature name, tag values used for clustering, cluster label].
# The tag columns must be the same slice (iloc[:, 7:]) that KModes was fitted on.
# Taking the whole frame would give each row more tag values than a centroid has
# entries, so nodes_distances could not compare them element by element.
tags_nodes_clustered = np.c_[
    tags_feat_df['feature'].to_numpy(),
    tags_feat_df.iloc[:, 7:].to_numpy(),
    tags_feat_labels,
]
tags_nodes_dist = nodes_distances(tags_nodes_clustered, kmode_cao.cluster_centroids_)

# Column 2 of tags_nodes_dist is the number of tags that differ from the centroid
tag_clust_dist = np.c_[feat_names, tags_nodes_dist[:, 2], tags_feat_labels]
tag_clust_dist_df = pd.DataFrame(tag_clust_dist)
tag_clust_dist_df.columns = ['feature', 'dist_tags', 'cluster_tags']
tag_clust_dist_df

### 2.3- Visualizing features with a 2D representation after applying MCA on the Kmodes results

MCA was applied to the clustered tag matrix to reduce the number of dimensions to two. The same was applied to the centroids of the clusters. Finally, both features and centroids were plotted together to see the distribution of the different clusters.


In [None]:
# Tag columns that KModes was fitted on, and the centroids under the same column names
tags_only_df = tags_feat_df.iloc[:, 7:]
centr_kmodes_df = pd.DataFrame(kmode_cao.cluster_centroids_, columns=tags_only_df.columns)

# Fit MCA once, on the feature tags only. Re-fitting it on the centroids would
# project them onto unrelated axes, so they could not share a plot with the features.
mca = prince.MCA(n_components=2)
mca = mca.fit(tags_only_df)

# Features and centroids are projected in a single transform call, so every
# category seen during fit is present when the rows are encoded.
# NOTE(review): assumes MCA.transform projects each row independently - confirm
# against the installed prince version.
n_features = len(tags_only_df)
combined_df = pd.concat([tags_only_df, centr_kmodes_df], ignore_index=True)
combined_mca = mca.transform(combined_df)
tags_mca = combined_mca.iloc[:n_features]
tags_mca_centr = combined_mca.iloc[n_features:]

tags_labels_mtx = np.c_[tags_mca, tags_feat_labels]
plotting_scatter(n_clusters_kmodes, tags_mca_centr.to_numpy(), tags_labels_mtx, 'MCA for Clustered Features')