In [1]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): no project-local import is visible in this notebook —
# confirm this path tweak is still needed.
sys.path.append("../")

In [2]:
from sklearn import mixture
from sklearn import metrics
from sklearn.cluster import KMeans

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Apply seaborn's default plot theme; color_codes=True remaps matplotlib's
# single-letter colour shorthands ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

In [5]:
def load_data_file(filename):
    """Read a delimited, comma-separated list of numbers from a text file.

    The file is expected to contain a single list such as ``[1.0, 2.5, 3]``.
    Leading/trailing whitespace (for example a final newline) is ignored, the
    first and last remaining characters (the enclosing delimiters) are
    dropped, and every non-empty comma-separated item is converted with
    ``float``.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    list of float
        The parsed values in file order; an empty list when the file holds
        no items (e.g. ``[]``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-empty item cannot be converted to ``float``.
    """
    with open(filename) as file:
        # Strip first so a trailing newline is not mistaken for the closing
        # delimiter (which would leave "]" glued to the last number).
        context = file.read().strip()

    # Drop the enclosing delimiters, then split the payload into items.
    items = context[1:-1].split(',')

    # Skip blank items so "[]" and a trailing comma do not reach float('').
    return [float(item) for item in items if item.strip()]

In [6]:
# Input data files consumed by load_data_file.
# NOTE(review): these are hard-coded absolute paths under one user's home
# directory; consider a configurable data directory.
wga_mean_file = "/home/a/ag568/wga_windows_mean_0_MANHATAN_5_MEAN_RATIO.txt"
no_wga_mean_file = "/home/a/ag568/no_wga_windows_mean_0_MANHATAN_5_MEAN_RATIO.txt"
# FIXME(review): gc_file is the exact same path as no_wga_mean_file, so the
# "GC" series loaded from it duplicates the NO-WGA values — confirm the
# intended GC-content file.
gc_file = "/home/a/ag568/no_wga_windows_mean_0_MANHATAN_5_MEAN_RATIO.txt"

In [None]:
# Parse each input file into a list of floats.
wga_mu = load_data_file(filename=wga_mean_file)
no_wga_mu = load_data_file(filename=no_wga_mean_file)
gc = load_data_file(filename=gc_file)

In [None]:
# The three series are combined element-wise later on, so they must all
# have the same number of entries.
assert len(wga_mu) == len(no_wga_mu) == len(gc)

In [None]:
# Summary statistics of the WGA series.
# The original passed an undefined name (INFO) as an unused first format
# argument, which raised NameError; the printed text is unchanged.
print("Size of WGA sample {0}".format(len(wga_mu)))
print("WGA max: {0}".format(max(wga_mu)))
print("WGA min: {0}".format(min(wga_mu)))
print("WGA mean: {0}".format(np.mean(wga_mu)))
print("WGA var: {0}".format(np.var(wga_mu)))

In [None]:
# Histogram (35 bins, no KDE curve) with a rug of the individual WGA values.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# histplot + rugplot when upgrading.
sns.distplot(wga_mu, bins=35, kde=False, rug=True).set_xlabel("WGA means")
plt.show()

In [None]:
# Summary statistics of the NO-WGA series.
# The original passed an undefined name (INFO) as an unused first format
# argument, which raised NameError; the printed text is unchanged.
print("NO-WGA size {0}".format(len(no_wga_mu)))
print("NO-WGA max: {0}".format(max(no_wga_mu)))
print("NO-WGA min: {0}".format(min(no_wga_mu)))
print("NO-WGA mean: {0}".format(np.mean(no_wga_mu)))
print("NO-WGA var: {0}".format(np.var(no_wga_mu)))

In [None]:
# Histogram (35 bins, no KDE curve) with a rug of the individual NO-WGA values.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# histplot + rugplot when upgrading.
sns.distplot(no_wga_mu, bins=35, kde=False, rug=True).set_xlabel("NO-WGA means")
plt.show()

In [None]:
# Summary statistics of the GC series.
# The original passed an undefined name (INFO) as an unused first format
# argument, which raised NameError; the printed text is unchanged.
print("GC size {0}".format(len(gc)))
print("GC max: {0}".format(max(gc)))
print("GC min: {0}".format(min(gc)))
print("GC mean: {0}".format(np.mean(gc)))
print("GC var: {0}".format(np.var(gc)))

In [None]:
# Histogram (35 bins, no KDE curve) with a rug of the individual GC values.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# histplot + rugplot when upgrading.
sns.distplot(gc, bins=35, kde=False, rug=True).set_xlabel("Ref GC content")
plt.show()

In [None]:
def make_data_array(wga_mu, no_wga_mu, gc, use_ratio, use_gc):
    """Build the per-sample feature rows used for clustering.

    Every row starts with ``[wga, no_wga]``. Depending on the flags, the
    ratio ``(wga + 1) / (no_wga + 1)`` and/or the GC value are appended, in
    that order:

    * neither flag:          ``[wga, no_wga]``
    * ``use_ratio``:         ``[wga, no_wga, ratio]``
    * ``use_gc``:            ``[wga, no_wga, gc]``
    * both flags:            ``[wga, no_wga, ratio, gc]``

    Parameters
    ----------
    wga_mu, no_wga_mu : sequence of float
        The two series that are paired element-wise.
    gc : sequence of float or None
        Only read when ``use_gc`` is true; may be ``None`` otherwise.
    use_ratio : bool
        Append the ratio column.
    use_gc : bool
        Append the GC column.

    Returns
    -------
    list of list of float
        One row per paired element; inputs are zipped, so the result is as
        long as the shortest series that is actually used.
    """
    if use_gc:
        triples = zip(wga_mu, no_wga_mu, gc)
    else:
        # gc is not used (callers pass None); pad with a placeholder so a
        # single loop can serve every flag combination.
        triples = ((wga_val, no_wga_val, None)
                   for wga_val, no_wga_val in zip(wga_mu, no_wga_mu))

    data = []
    for wga_val, no_wga_val, gc_val in triples:
        row = [wga_val, no_wga_val]
        if use_ratio:
            # The +1 on both terms presumably guards against division by
            # zero for zero-valued NO-WGA entries — TODO confirm.
            row.append((wga_val + 1) / (no_wga_val + 1))
        if use_gc:
            row.append(gc_val)
        data.append(row)

    return data

In [None]:
def kmeans_clustering(clusters, data, random_state=None):
    """Fit KMeans for each cluster count, print quality scores and plot labels.

    For every value in ``clusters`` a ``KMeans`` model is fitted on ``data``,
    the Calinski-Harabasz and Davies-Bouldin scores are printed, and a
    scatter plot coloured by cluster label is shown.

    Parameters
    ----------
    clusters : iterable of int
        Cluster counts to try, one fit per value.
    data : array-like of shape (n_samples, n_features)
        Feature matrix. Column 0 is plotted on the y-axis ("WGA") and
        column 1 on the x-axis ("NO-WGA"), which is the column order
        produced by ``make_data_array``. The plotted values are taken from
        ``data`` itself rather than from notebook-level variables.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    None
    """
    data = np.asarray(data)

    palette = ['#377eb8', '#ff7f00', '#4daf4a',
               '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']

    for nclusters in clusters:

        print("Number of clusters ", nclusters)
        kmeans = KMeans(n_clusters=nclusters, random_state=random_state)
        kmeans.fit(X=data)
        labels = kmeans.labels_

        calinski_harabasz_score = metrics.calinski_harabasz_score(data, labels)
        print("Calinski score: ", calinski_harabasz_score)
        davies_bouldin_score = metrics.davies_bouldin_score(data, labels)
        print("Davies score: ", davies_bouldin_score)

        # One colour per label, cycling through the palette when there are
        # more labels than colours (built without itertools, which this
        # notebook never imports).
        n_labels = int(max(labels)) + 1
        colors = [palette[i % len(palette)] for i in range(n_labels)]

        # add black color for outliers (if any): a label of -1 indexes the
        # last entry.
        colors = np.array(colors + ["#000000"])

        plt.scatter(data[:, 1], data[:, 0], color=colors[labels])
        plt.xlabel("NO-WGA ")
        plt.ylabel("WGA")
        plt.title("KMeans, {0} clusters".format(nclusters))
        plt.show()
    

In [None]:
def gmm_clustering(clusters, data, random_state=None):
    """Fit a Gaussian mixture for each component count and plot the labels.

    For every value in ``clusters`` a full-covariance ``GaussianMixture`` is
    fitted on ``data``, each sample is assigned to a component with
    ``predict``, and a scatter plot coloured by that label is shown.

    Parameters
    ----------
    clusters : iterable of int
        Component counts to try, one fit per value.
    data : array-like of shape (n_samples, n_features)
        Feature matrix. Column 0 is plotted on the y-axis ("WGA") and
        column 1 on the x-axis ("NO-WGA"), which is the column order
        produced by ``make_data_array``. The plotted values are taken from
        ``data`` itself rather than from notebook-level variables.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``GaussianMixture``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    None
    """
    data = np.asarray(data)

    palette = ['#377eb8', '#ff7f00', '#4daf4a',
               '#f781bf', '#a65628', '#984ea3',
               '#999999', '#e41a1c', '#dede00']

    for nclusters in clusters:

        print("Number of clusters ", nclusters)
        gmm = mixture.GaussianMixture(n_components=nclusters,
                                      covariance_type='full',
                                      random_state=random_state)
        gmm.fit(data)
        labels = gmm.predict(data)

        # One colour per label, cycling through the palette when there are
        # more labels than colours (built without itertools, which this
        # notebook never imports).
        n_labels = int(max(labels)) + 1
        colors = [palette[i % len(palette)] for i in range(n_labels)]

        # add black color for outliers (if any): a label of -1 indexes the
        # last entry.
        colors = np.array(colors + ["#000000"])

        plt.scatter(data[:, 1], data[:, 0], color=colors[labels])
        plt.xlabel("NO-WGA ")
        plt.ylabel("WGA")
        plt.title("GMM, {0} components".format(nclusters))
        plt.show()
    

In [None]:
# Cluster / mixture-component counts passed to every clustering call below.
N_CLUSTERS = [2, 3, 4, 5]

## Two means 

In [None]:
# Feature matrix with two columns per row: [WGA, NO-WGA].
data = np.array(
    make_data_array(
        wga_mu=wga_mu, no_wga_mu=no_wga_mu, gc=None,
        use_ratio=False, use_gc=False,
    )
)

# One row per input value, two features each.
assert data.shape == (len(wga_mu), 2)

### KMeans Clustering

In [7]:
# Run KMeans on the two-column matrix once per entry of N_CLUSTERS.
kmeans_clustering(clusters=N_CLUSTERS, data=data)


NameError: name 'N_CLUSTERS' is not defined

### GMM Clustering

In [None]:
# Run the Gaussian mixture on the two-column matrix once per entry of N_CLUSTERS.
gmm_clustering(clusters=N_CLUSTERS, data=data)


## Two means + Ratio

In [None]:
# Feature matrix with three columns per row: [WGA, NO-WGA, ratio].
data = np.array(
    make_data_array(
        wga_mu=wga_mu, no_wga_mu=no_wga_mu, gc=None,
        use_ratio=True, use_gc=False,
    )
)

# One row per input value, three features each.
assert data.shape == (len(wga_mu), 3)

### KMeans

In [None]:
# Run KMeans on the [WGA, NO-WGA, ratio] matrix once per entry of N_CLUSTERS.
kmeans_clustering(clusters=N_CLUSTERS, data=data)

### GMM

In [None]:
# Run the Gaussian mixture on the [WGA, NO-WGA, ratio] matrix once per entry of N_CLUSTERS.
gmm_clustering(clusters=N_CLUSTERS, data=data)

## Two means + GC

In [None]:
# Feature matrix with three columns per row: [WGA, NO-WGA, GC].
data = np.array(
    make_data_array(
        wga_mu=wga_mu, no_wga_mu=no_wga_mu, gc=gc,
        use_ratio=False, use_gc=True,
    )
)

# One row per input value, three features each.
assert data.shape == (len(wga_mu), 3)

### KMeans

In [None]:
# Run KMeans on the [WGA, NO-WGA, GC] matrix once per entry of N_CLUSTERS.
kmeans_clustering(clusters=N_CLUSTERS, data=data)
 

### GMM

In [None]:
# Run the Gaussian mixture on the [WGA, NO-WGA, GC] matrix once per entry of N_CLUSTERS.
gmm_clustering(clusters=N_CLUSTERS, data=data)

## Two means + GC + Ratio

In [None]:
# Feature matrix with four columns per row: [WGA, NO-WGA, ratio, GC].
data = np.array(
    make_data_array(
        wga_mu=wga_mu, no_wga_mu=no_wga_mu, gc=gc,
        use_ratio=True, use_gc=True,
    )
)

# One row per input value, four features each.
assert data.shape == (len(wga_mu), 4)

### KMeans

In [None]:
# Run KMeans on the [WGA, NO-WGA, ratio, GC] matrix once per entry of N_CLUSTERS.
kmeans_clustering(clusters=N_CLUSTERS, data=data)

### GMM

In [None]:
# Run the Gaussian mixture on the [WGA, NO-WGA, ratio, GC] matrix once per entry of N_CLUSTERS.
gmm_clustering(clusters=N_CLUSTERS, data=data)