In [1]:
"""
Authors: Jesse Simpson, Rafail Islam, Amber Gillenwaters
"""
import pandas as pd
import numpy as np

from sklearn.cluster import SpectralClustering

In [2]:
df = pd.read_csv("nostop.lemmas.csv")

In [3]:
# Drop any NaN rows
df = df.dropna()

In [4]:
# Test on subset of features
df_features= df.iloc[:1000,0:3]
df_label = df.iloc[:1000,-1]

In [5]:
# encode categorical
df_features_transformed = pd.get_dummies(df_features)

In [6]:
number_k = df_label.nunique()

In [7]:
model = SpectralClustering()

In [8]:
model.fit(df_features_transformed)

SpectralClustering()

In [9]:
from sklearn import metrics
# A clustering result satisfies homogeneity if all of its clusters 
#contain only data points which are members of a single class.
homogeneity = metrics.homogeneity_score(df_label.to_numpy(), model.labels_)
# A clustering result satisfies completeness if all the data points 
# that are members of a given class are elements of the same cluster.
completeness = metrics.completeness_score(df_label.to_numpy(), model.labels_)
homogeneity, completeness

(0.2845707437008366, 0.856812169373565)

In [10]:
# Spectral Clustering Features
'''
0 n_clusters : int > 2
1 eigen_solver: {None, 'arpack', lobpcg', 'amg'} # no amg since library failed
2 random_state: int (constant)
3 n_init: int > 2
4 gamma : float (0-2)
5 affinity: 'rbf'
6 n_neighbors: int (2-15)
7 eigen_tol: float - default 0.0
8 assign_labels : {'kmeans', 'discretize'}
9 degree : float default 3
10 coef0: float default 1
'''

def create_spectral_cluster_parameters():
    num_clusters = np.random.randint(low=2, high=16)
    eigen_solver = np.random.choice([None, 'arpack', 'lobpcg'])
    random_state = 99
    n_init = np.random.randint(low=2, high=15)
    gamma = np.random.uniform(low=0.0, high=2.0)
    affinity = 'rbf'
    n_neighbors = np.random.randint(low=2, high=15)
    eigen_tol = 0.0
    assign_labels = np.random.choice(['kmeans', 'discretize'])
    degree = 3.0
    coef0 = 1
    
    return [num_clusters, eigen_solver, random_state, n_init, gamma, affinity,
                             n_neighbors, eigen_tol, assign_labels, degree, coef0]

test1 = create_spectral_cluster_parameters()
test2 = create_spectral_cluster_parameters()
print(test1)
print(test2)

[12, 'lobpcg', 99, 9, 1.9099760625681137, 'rbf', 9, 0.0, 'discretize', 3.0, 1]
[12, 'lobpcg', 99, 6, 0.7725454599456996, 'rbf', 7, 0.0, 'kmeans', 3.0, 1]


In [11]:
# Random parent selection
def parent_selection(population):
    p1 = np.random.randint(low=0, high=len(population)-1)
    p2 = np.random.randint(low=0, high=len(population)-1)
    return population[p1], population[p2]

In [12]:
def cross_over(p1, p2, length):
    point = np.random.randint(low=1, high= length-2)
    new_parameters = p1[:point]
    new_parameters += p2[point:]
    
    return np.array(new_parameters).reshape(1, length)

print(test1)
print(test2)
cross_over(test1, test2, 11)

[12, 'lobpcg', 99, 9, 1.9099760625681137, 'rbf', 9, 0.0, 'discretize', 3.0, 1]
[12, 'lobpcg', 99, 6, 0.7725454599456996, 'rbf', 7, 0.0, 'kmeans', 3.0, 1]


array([['12', 'lobpcg', '99', '9', '0.7725454599456996', 'rbf', '7',
        '0.0', 'kmeans', '3.0', '1']], dtype='<U21')

In [13]:
'''
0 n_clusters : int > 2
1 eigen_solver: {None, 'arpack', lobpcg', 'amg'}
2 random_state: int (constant)
3 n_init: int > 2
4 gamma : float (0-2)
5 affinity: 'rbf'
6 n_neighbors: int (2-15)
7 eigen_tol: float - default 0.0
8 assign_labels : {'kmeans', 'discretize'}
9 degree : float default 3
10 coef0: float default 1
'''

def mutation(sample):
    # Randomly mutate 1 parameter
    index = np.random.randint(low=0, high=11)
    
    # handle constraints
    if index == 0:
        new_cluster_size = np.random.randint(2, 16)
        sample[0] = new_cluster_size
    elif index == 1:
        new_solver = np.random.choice([None, 'arpack', 'lobpcg'])
        sample[1] = new_solver
    elif index == 2:
        pass
    elif index == 3:
        new_init = np.random.randint(low=2, high=15)
        sample[3] = new_init
    elif index == 4:
        new_gamma = np.random.uniform(low=0.0, high=2.0)
        sample[4] = new_gamma
    elif index == 5:
        pass
    elif index == 6:
        new_neighbors = np.random.randint(low=2, high=15)
        sample[6] = new_neighbors
    elif index == 7:
        pass
    elif index == 8:
        new_assign = np.random.choice(['kmeans', 'discretize'])
        sample[8] = new_assign
    elif index == 9:
        pass
    elif index == 10:
        pass
    
    return sample
        
print(test1)
mutation(test1)

[12, 'lobpcg', 99, 9, 1.9099760625681137, 'rbf', 9, 0.0, 'discretize', 3.0, 1]


[12, 'lobpcg', 99, 9, 1.9099760625681137, 'rbf', 9, 0.0, 'discretize', 3.0, 1]

In [14]:
def get_fitness(model_labels, label_true):
    homogeneity = metrics.homogeneity_score(label_true, model.labels_)
    completeness = metrics.completeness_score(label_true, model.labels_)
    return homogeneity + completeness

In [None]:
import itertools

# Hyper parameters
population_size = 20
number_gen = 50
length = 11 # Changes depending on clustering algorithm parameter size
num_rep = 10

best_of_pop = []
# Genetic algorithm
# Parent Selection
# Crossover
# Mutation
# Survivor
for rep in range(num_rep):
    population = [create_spectral_cluster_parameters() for _ in range(population_size)]
    for gen in range(number_gen):
        child_pop = []
        for _ in range(population_size):
            # Parent Selection
            parent1, parent2 = parent_selection(population)

            # Crossover
            child = cross_over(parent1, parent2, length).tolist()
            child = list(itertools.chain(*child))

            # Mutation
            child = mutation(child)

            child_pop.append(child)

        # Survivor Selection - Calculate Fitness, replace worst in population with best child
        # Create Models for each parameter list, evaluate fitness
        # Have to try/except all parameters since there is some linear algebra limitations
        # that can be broken from random mutation/crossover
        model_pop = []
        child_model_pop = []
        for i, sample in enumerate(population):
            try:
                model = SpectralClustering(n_clusters=sample[0], eigen_solver=sample[1], random_state=sample[2],
                                    n_init=sample[3], gamma=sample[4], affinity=sample[5], 
                                     eigen_tol=sample[7], assign_labels=sample[8], degree=sample[9], coef0=sample[10])

                fitness =  get_fitness(model.fit(df_features_transformed).labels_, df_label.to_numpy())
            except:
                flag = True
                while flag:
                    new_sample = create_spectral_cluster_parameters()
                    try:
                        model = SpectralClustering(n_clusters=new_sample[0], eigen_solver=new_sample[1], random_state=new_sample[2],
                                            n_init=new_sample[3], gamma=new_sample[4], affinity=new_sample[5], 
                                             eigen_tol=new_sample[7], assign_labels=new_sample[8], 
                                                   degree=new_sample[9], coef0=new_sample[10])

                        fitness =  get_fitness(model.fit(df_features_transformed).labels_, df_label.to_numpy())
                        flag = False
                    except:
                        pass


            # Append sample and fitness 
            model_pop.append([sample, fitness])

        for i, sample in enumerate(child_pop):
            try:
                model = SpectralClustering(n_clusters=sample[0], eigen_solver=sample[1], random_state=sample[2],
                                    n_init=sample[3], gamma=sample[4], affinity=sample[5], 
                                     eigen_tol=sample[7], assign_labels=sample[8], degree=sample[9], coef0=sample[10])

                fitness =  get_fitness(model.fit(df_features_transformed).labels_, df_label.to_numpy())
            except:
                flag = True
                while flag:
                    new_sample = create_spectral_cluster_parameters()
                    try:
                        model = SpectralClustering(n_clusters=new_sample[0], eigen_solver=new_sample[1], random_state=new_sample[2],
                                            n_init=new_sample[3], gamma=new_sample[4], affinity=new_sample[5], 
                                             eigen_tol=new_sample[7], assign_labels=new_sample[8], 
                                                   degree=new_sample[9], coef0=new_sample[10])

                        fitness =  get_fitness(model.fit(df_features_transformed).labels_, df_label.to_numpy())
                        flag = False
                    except:
                        pass


            # Append sample and fitness 
            child_model_pop.append([sample, fitness])

        sorted_pop = sorted(model_pop, key=lambda fit: fit[1])
        sorted_child_pop = sorted(child_model_pop, key=lambda fit: fit[1])
        sorted_pop[-1] = sorted_child_pop[0]
        population = [sample[0] for sample in sorted_pop]
        print("Epoch: ", gen)
    best_of_pop.append(population[0])


Epoch:  0
Epoch:  1
Epoch:  2
Epoch:  3
Epoch:  4
Epoch:  5
Epoch:  6
Epoch:  7
Epoch:  8
Epoch:  9
Epoch:  10
Epoch:  11
Epoch:  12
Epoch:  13
Epoch:  14
Epoch:  15
Epoch:  16
Epoch:  17


In [None]:
# PCA

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

pca = PCA(n_components= 2)
data, label = df_features_transformed, df_label.to_numpy()

foldername = None
filename = None

X1, X2 = np.transpose(pca.fit_transform(data))

fig = plt.figure(figsize=(5,4))

# Markers
# matplotlib markers: https://matplotlib.org/3.1.1/api/markers_api.html
# matplotlib colors: https://matplotlib.org/3.1.1/users/dflt_style_changes.html
markers = ['s', 'h', '^', 'D', 'o']
unique = np.unique(label)
for j, i in enumerate(unique):
    X_r1 = [X1[j] for j in range(len(X1)) if label[j] == i]
    X_r2 = [X2[j] for j in range(len(X2)) if label[j] == i]
    l = [label[j] for j in range(len(label)) if label[j] == i]
    plt.scatter(X_r1, X_r2, label = l)
    

#plt.savefig(foldername+filename, dpi=fig.get_dpi())

In [None]:
# ISO
from sklearn.manifold import Isomap

array = df_features_transformed.copy()
iso = Isomap(n_components=2)
iso.fit(array)
manifold_2Da = iso.transform(array)
manifold_2D = pd.DataFrame(manifold_2Da, columns=['Component 1', 'Component 2'])

fig = plt.figure()
fig = plt.figure(figsize=(5,4))
ax = fig.add_subplot(111)

X1 = manifold_2D['Component 1']
X2 = manifold_2D['Component 2']
unique = np.unique(label)

for j, i in enumerate(unique):
    X_r1 = [X1[j] for j in range(len(X1)) if label[j] == i]
    X_r2 = [X2[j] for j in range(len(X2)) if label[j] == i]
    l = [label[j] for j in range(len(label)) if label[j] == i]
    plt.scatter(X_r1, X_r2, label = l)

plt.show()
#plt.savefig(foldername+filename, dpi=fig.get_dpi())
plt.clf()

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(df_features_transformed)

X1 = tsne_results[:,0]
X2 = tsne_results[:,1]
fig = plt.figure(figsize=(5,4))

unique = np.unique(label)
for j, i in enumerate(unique):
    X_r1 = [X1[j] for j in range(len(X1)) if label[j] == i]
    X_r2 = [X2[j] for j in range(len(X2)) if label[j] == i]
    l = [label[j] for j in range(len(label)) if label[j] == i]
    plt.scatter(X_r1, X_r2, label = l)
    
plt.show()
#plt.savefig(foldername+filename, dpi=fig.get_dpi())
plt.clf()