In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re
%matplotlib inline

In [None]:
# Load the abbreviation file, one decoded string per line (line endings are kept).
# Both data files are encoded in latin-1 rather than Python's default utf-8, so they are
# read as bytes and decoded explicitly. The `with` blocks close the file handles as soon
# as the contents have been read.
with open('../data/stateabbr.data', 'rb') as abbr_file:
    raw_abbrs = [line.decode('latin-1') for line in abbr_file]

# Load the samples: each line is stripped and split on commas into a list of fields.
with open('../data/plants.data', 'rb') as samples_file:
    raw_samples = [sample.decode('latin-1').strip().split(',') for sample in samples_file]


In [None]:
# Parse the abbreviation file into a dict mapping each location abbreviation to a
# zero-filled indicator vector with one slot per sample.
# Not every line is a location and not every abbreviation is 2 letters; however, every
# abbreviation is lower case, so a line counts as a location only when it starts with
# two or more lower-case letters followed by whitespace.
samples_dict = {}
sample_count = len(raw_samples)
# Raw string so that `\s` reaches the regex engine instead of being an invalid string escape.
abbr_pattern = re.compile(r'^([a-z]{2,})\s')
for line in raw_abbrs:
    m = abbr_pattern.search(line)
    if m:
        # group(1) holds only the letters; group(0) would also carry the trailing
        # whitespace character (which may be a tab or newline rather than a space).
        abbr = m.group(1)
        # pre-populate with zeros
        samples_dict[abbr] = np.zeros(sample_count, dtype=np.int8)
print("Num Locations: {}".format(len(samples_dict)))

In [None]:
# 70 locations were expected, but "Prince Edward Island" has no abbreviation in the
# abbreviation file. The samples also use the codes "pe" and "gl", which that file does
# not list. Assume pe == "Prince Edward Island" and gl == "GreenLand" and register both
# with the same zero-filled vector as every other location.
for extra_abbr in ('pe', 'gl'):
    samples_dict[extra_abbr] = np.zeros(sample_count, dtype=np.int8)

all_locations = set(samples_dict)

# Set the indicator to 1 for every location a sample lists. The first field of each
# sample is skipped (presumably the plant name -- confirm against the data file).
for row in range(sample_count):
    for location in set(raw_samples[row][1:]):
        samples_dict[location][row] = 1

# One column per location, one row per sample.
plants_df = pd.DataFrame(data=samples_dict)

In [None]:
from sklearn.decomposition import PCA

# Project the location-indicator matrix onto its leading principal components so the
# samples can be drawn in a flat plot.
number_of_dimensions = 2
# random_state is only consulted when scikit-learn uses a randomized/arpack SVD solver;
# pinning it keeps the projection identical across kernel restarts in that case.
pca = PCA(n_components=number_of_dimensions, random_state=0)

# Fit the components and project the same data in one step.
plants_2d = pca.fit_transform(plants_df)

In [None]:
from sklearn.cluster import KMeans

# Cluster the samples with k-means for several values of k and draw every result on the
# 2-D PCA projection (plants_2d), one panel per k.
number_of_clusters = [2,3,6,14]

# One row of panels; squeeze=False keeps `axes` two-dimensional even for a single k.
fig, axes = plt.subplots(1, len(number_of_clusters), figsize=(20, 4), squeeze=False)

for i, clusters in enumerate(number_of_clusters):
    ax = axes[0, i]
    # Fixed seed so the cluster assignments are the same on every run.
    kmeans = KMeans(n_clusters=clusters, random_state=0)
    # Clustering runs on the full indicator matrix, not on the 2-D projection.
    kmeans.fit(plants_df)
    labels = ['cluster ' + str(label+1) for label in kmeans.labels_]
    # Explicit order so legend entries and colours follow the cluster number rather than
    # the order in which labels happen to appear in the data.
    hue_order = ['cluster ' + str(k+1) for k in range(clusters)]
    # Scatter plot of the two PCA coordinates: both axes are continuous, whereas a swarm
    # plot would treat x as a categorical variable and scales poorly with many points.
    sns.scatterplot(x=plants_2d[:,0], y=plants_2d[:,1], hue=labels, hue_order=hue_order, ax=ax)
    ax.set_title('{} clusters'.format(clusters))
    ax.set(xticklabels=[], yticklabels=[])
    ax.legend(loc='upper right')
    # The legend of the last panel is dropped (presumably because its many entries would
    # cover the plot).
    if i == len(number_of_clusters) - 1:
        ax.legend_.remove()