### Machine Specific

In [None]:
# prevent memory leak for Windows
# Caps OpenMP at one thread for this process by setting OMP_NUM_THREADS.
# NOTE(review): presumably this targets the KMeans memory-leak warning that
# scikit-learn emits on Windows with MKL, and presumably it has to run before
# scikit-learn is imported to take effect - confirm on the target machine.
import os
os.environ["OMP_NUM_THREADS"] = "1"

### Import Dependencies

In [None]:
# for machine learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# for data handling
import pandas as pd

# for visualization
import matplotlib.pyplot as plt

### Define Functions

In [None]:
# functions
def get_clusters(k, data):
    """Cluster ``data`` with k-means and record each row's cluster label.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : pandas.DataFrame
        Feature table, one row per sample. It is modified in place: a
        ``"class"`` column holding the cluster label is added, or overwritten
        if it already exists.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying the ``"class"`` column.
    """
    # A "class" column left over from an earlier call is a label, not a
    # feature; leave it out so calling this again on the same frame fits the
    # model on the same inputs as the first call did.
    features = data.drop(columns = "class", errors = "ignore")

    # random_state is fixed so repeated runs produce the same labels
    model = KMeans(n_clusters = k, random_state = 1)
    model.fit(features)

    # labels_ holds the assignment for the rows the model was fit on, so no
    # separate predict() pass over the same rows is needed
    data["class"] = model.labels_
    return data

def show_clusters(df, x, y):
    """Scatter-plot two columns of ``df`` colored by cluster, save and show it.

    Parameters
    ----------
    df : pandas.DataFrame (or any table indexable by column name)
        Must provide the columns named by ``x`` and ``y`` plus a ``"class"``
        column, which supplies the point colors.
    x : str
        Column plotted on the horizontal axis; also used as its label.
    y : str
        Column plotted on the vertical axis; also used as its label.

    Side effects
    ------------
    Writes ``Output/output.png`` relative to the current working directory
    (creating ``Output`` if it does not exist) and then displays the figure.
    """
    plt.scatter(df[x], df[y], c = df["class"])
    plt.xlabel(x)
    plt.ylabel(y)

    # savefig does not create missing folders, so make sure the target
    # directory exists before writing the image
    output_dir = "Output"
    os.makedirs(output_dir, exist_ok = True)
    plt.savefig(os.path.join(output_dir, "output.png"))
    plt.show()

### Import and Prepare Data

In [None]:
# define parameters
# handed to PCA(n_components = ...) further down
# NOTE(review): a float between 0 and 1 is presumably read by scikit-learn as
# the fraction of variance to retain rather than a component count - confirm
pca_variance = 0.9
# handed to TSNE(learning_rate = ...) further down
tsne_lr = 35

In [None]:
# the dataset lives in the Resources folder, relative to the working directory
csv_path = os.path.join("Resources", "myopia.csv")

# load it into a DataFrame
myopia_df = pd.read_csv(csv_path)

# show the first rows of the raw data
myopia_df.head()

In [None]:
# remove the known outcome column so it is not used as a clustering feature
# errors = "ignore" keeps this cell re-runnable: myopia_df is overwritten
# here, so on a second run the column is already gone
myopia_df = myopia_df.drop(columns = "MYOPIC", errors = "ignore")
myopia_df.head()

In [None]:
# put every feature on a common scale before PCA
scaler = StandardScaler()
myopia_scaled = scaler.fit_transform(myopia_df)

# project the scaled data onto its principal components,
# with the component count governed by pca_variance
pca = PCA(n_components = pca_variance)
myopia_pca = pca.fit_transform(myopia_scaled)

# wrap the components in a DataFrame with columns pc0, pc1, ...
n_components = myopia_pca.shape[1]
pc_names = [f"pc{x}" for x in range(n_components)]
myopia_pca_df = pd.DataFrame(myopia_pca, columns = pc_names)
myopia_pca_df.shape

In [None]:
# further reduce the dataset's features with t-SNE
# t-SNE is stochastic: seed it (same seed as the KMeans calls) so the
# embedding, and every plot built from it, is reproducible across runs
tsne = TSNE(learning_rate = tsne_lr, random_state = 1)

# fit on the PCA array rather than myopia_pca_df: that frame gains extra
# columns ("x", "y", "class") in later cells, so re-running this cell against
# it would embed different data than the first run did
tsne_features = tsne.fit_transform(myopia_pca)
tsne_features.shape

In [None]:
# store the two t-SNE coordinates on the PCA frame as columns "x" and "y"
tsne_columns = (tsne_features[:, 0], tsne_features[:, 1])
for axis_name, axis_values in zip(("x", "y"), tsne_columns):
    myopia_pca_df[axis_name] = axis_values

# quick look at the embedding before any clustering
plt.scatter(x = myopia_pca_df["x"], y = myopia_pca_df["y"])
plt.show()

In [None]:
# build the k means elbow plot
# get_clusters writes a "class" column onto myopia_pca_df; drop it here so
# re-running this cell after clustering does not treat labels as a feature
# NOTE(review): at this point the frame also holds the t-SNE "x"/"y" columns,
# so they are clustered together with the principal components - confirm
# that this is intended
elbow_features = myopia_pca_df.drop(columns = "class", errors = "ignore")

# candidate cluster counts 1..10 and the inertia measured for each
inertia = []
k = list(range(1, 11))

for i in k:
    km = KMeans(n_clusters = i, random_state = 1)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

elbow_df = pd.DataFrame({ "k": k, "inertia": inertia })
plt.plot(elbow_df["k"], elbow_df["inertia"])
plt.xticks(k)
plt.title("Elbow Curve")
plt.xlabel("Cluster #")
plt.ylabel("Inertia")
plt.show()

In [None]:
# the elbow plot points to k = 3, so cluster with three groups
clusters = get_clusters(k = 3, data = myopia_pca_df)

# plot the t-SNE coordinates colored by assigned cluster
show_clusters(df = clusters, x = "x", y = "y")

### Assessment
Given this output, the dataset appears to contain 3 clusters. The value of 3 is first suggested by the elbow curve, which bends most sharply at k = 3 (inertia drops steeply up to that point and only gradually afterwards), and is then confirmed by the colored clusters in the scatter plot. There is a clear distinction between the three groups.