Reference (feature selection for clustering, Stack Exchange): https://datascience.stackexchange.com/questions/67040/how-to-do-feature-selection-for-clustering-and-implement-it-in-python

In [3]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Build a tiny example frame: one row per animal, one column per feature.
animals = ['falcon', 'dog', 'spider', 'fish']
df = pd.DataFrame(
    {
        'num_legs': [2, 4, 8, 0],
        'num_wings': [2, 0, 0, 0],
        'num_specimen_seen': [10, 2, 1, 8],
    },
    index=animals,
)
print(df)
class PFA(object):
    """Principal Feature Analysis (PFA): keep one representative column per cluster.

    ``fit`` standardizes the columns of ``X``, fits a PCA, and clusters the
    rows of the transposed component matrix (one row per input column) with
    k-means. For each non-empty cluster, the column whose row lies closest to
    the cluster centroid is selected.

    Parameters
    ----------
    n_features : int
        Number of k-means clusters, i.e. the number of columns to keep
        (at most one column is kept per cluster).
    q : int, optional
        Number of PCA components. A falsy value (``None`` or ``0``) means
        "use as many components as ``X`` has columns".
    random_state : optional
        Forwarded to ``KMeans`` so that the selection can be made
        reproducible. The default (``None``) keeps k-means unseeded.

    Attributes (set by ``fit``)
    ---------------------------
    q_ : int
        Number of PCA components actually used for the last fit.
    indices_ : list of int
        Column positions of the selected features, in order of first
        appearance of their cluster.
    features_ : ndarray
        The *standardized* data restricted to the selected columns
        (not the raw input values).
    """

    def __init__(self, n_features, q=None, random_state=None):
        self.q = q
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X):
        """Select the representative columns of ``X`` and return ``self``.

        Raises
        ------
        ValueError
            If ``n_features`` exceeds the number of columns in ``X``.
        """
        n_columns = X.shape[1]
        if self.n_features > n_columns:
            raise ValueError(
                "n_features=%r exceeds the number of columns in X (%d)"
                % (self.n_features, n_columns)
            )

        # Resolve the component count locally: the hyper-parameter ``self.q``
        # must stay untouched so that a later fit on data with a different
        # number of columns does not reuse a stale value.
        self.q_ = self.q or n_columns

        X_scaled = StandardScaler().fit_transform(X)

        # No explicit covariance matrix is built here; PCA performs the
        # decomposition internally.
        pca = PCA(n_components=self.q_).fit(X_scaled)
        loadings = pca.components_.T  # one row per input column

        kmeans = KMeans(
            n_clusters=self.n_features, random_state=self.random_state
        ).fit(loadings)
        labels = kmeans.predict(loadings)
        centers = kmeans.cluster_centers_

        # Group (column index, distance to own centroid) pairs by cluster.
        members = defaultdict(list)
        for idx, label in enumerate(labels):
            dist = euclidean_distances([loadings[idx, :]], [centers[label, :]])[0][0]
            members[label].append((idx, dist))

        # ``min`` returns the first minimal pair, matching a stable sort's head.
        self.indices_ = [
            min(group, key=lambda pair: pair[1])[0] for group in members.values()
        ]
        self.features_ = X_scaled[:, self.indices_]
        return self

        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
dog            4          0                  2
spider         8          0                  1
fish           0          0                  8


In [7]:
# Usage: ask PFA for the 2 most representative columns of the dummy frame.
pfa = PFA(n_features=2)
pfa.fit(df)
# Column positions of the features that were kept
column_indices = pfa.indices_
# Transformed (standardized) matrix restricted to the kept columns
x = pfa.features_
print(x)

[[-0.50709255  1.73205081]
 [ 0.16903085 -0.57735027]
 [ 1.52127766 -0.57735027]
 [-1.18321596 -0.57735027]]


In [8]:
# Positions of the columns PFA kept (they index the columns of the fitted data).
column_indices

[0, 1]

---

---

In [91]:
# Step-by-step walkthrough of PFA.fit: start from a copy so df stays untouched.
X = df.copy()
n_features = 2   # number of k-means clusters, i.e. features to keep
q = X.shape[1]   # number of PCA components: one per column

In [92]:
# Dimensions of the working copy as (rows, columns).
X.shape

(4, 3)

In [93]:
# Standardize the columns, then fit a PCA that keeps q components.
sc = StandardScaler()
X = sc.fit_transform(X)
pca = PCA(n_components=q)
pca.fit(X)  # no explicit covariance matrix needed; PCA decomposes internally
# Transpose so each row holds one variable's coordinates in component space.
A_q = pca.components_.T

In [94]:
# Transposed PCA components: one row per original variable.
A_q

array([[-0.55529346, -0.64470406, -0.5253626 ],
       [ 0.50769017, -0.76312539,  0.39986289],
       [ 0.65871076,  0.04468018, -0.75106845]])

In [95]:
# Cluster the variables (rows of A_q). k-means starts from random centroids,
# so the seed is fixed to make the selected features reproducible on re-run.
kmeans = KMeans(n_clusters=n_features, random_state=0).fit(A_q)
clusters = kmeans.predict(A_q)  # cluster label of each variable
cluster_centers = kmeans.cluster_centers_

In [96]:
# Cluster label assigned to each variable (row of A_q).
clusters

array([0, 1, 0], dtype=int32)

In [97]:
# Distance between each variable (row of A_q) and the centroid of its cluster.
dists = defaultdict(list)
for feature_idx, (loading, label) in enumerate(zip(A_q, clusters)):
    centroid = cluster_centers[label]
    dist = euclidean_distances([loading], [centroid])[0][0]
    print(feature_idx, dist)
    dists[label].append((feature_idx, dist))

0 0.7071067811865475
1 0.0
2 0.7071067811865475


In [98]:
# One list per cluster, holding (variable index, distance to centroid) pairs.
dists.values()

dict_values([[(0, 0.7071067811865475), (2, 0.7071067811865475)], [(1, 0.0)]])

In [99]:
# For every cluster, keep the variable that lies closest to the centroid.
indices_ = []
for members in dists.values():
    ranked = sorted(members, key=lambda pair: pair[1])
    indices_.append(ranked[0][0])
# Restrict the standardized matrix to the selected columns.
features_ = X[:, indices_]

In [100]:
# Show, cluster by cluster, which variable index gets selected.
for members in dists.values():
    closest = sorted(members, key=lambda pair: pair[1])[0]
    print(closest[0])
    print("--")

0
--
1
--


In [90]:
# NOTE(review): this cell's execution count (In [90]) predates the cells above,
# and its saved output lists three indices although n_features is 2 -- the
# output looks stale; restart the kernel and run all cells to refresh it.
indices_

[0, 1, 2]