Data source: Mice Protein Expression dataset, UCI Machine Learning Repository — https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression

In [130]:
import pandas as pd
from sklearn import decomposition
from sklearn.cluster import KMeans, Birch


import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Apply the 'ggplot' style sheet to matplotlib figures.
matplotlib.style.use('ggplot')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline 

In [65]:
# Read the spreadsheet and keep only rows without missing values.
# Load and filter happen in one expression, so re-running the cell always
# rebuilds raw_df from the file.
data_path = "../data/external/data_geneMice/Data_Cortex_Nuclear.xls"
raw_df = pd.read_excel(data_path).dropna()

In [66]:
# Preview the first rows of the frame after rows with missing values were dropped.
raw_df.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
75,3415_1,0.649781,0.828696,0.405862,2.921435,5.167979,0.207174,0.17664,3.728084,0.239283,...,0.129363,0.486912,0.125152,0.146865,0.143517,1.627181,Control,Memantine,C/S,c-CS-m
76,3415_2,0.616481,0.841974,0.388584,2.862575,5.194163,0.223433,0.167725,3.64824,0.22103,...,0.143084,0.467833,0.112857,0.161132,0.145719,1.562096,Control,Memantine,C/S,c-CS-m
77,3415_3,0.637424,0.852882,0.400561,2.968155,5.35082,0.20879,0.173261,3.814545,0.2223,...,0.147673,0.462501,0.116433,0.160594,0.142879,1.571868,Control,Memantine,C/S,c-CS-m
78,3415_4,0.576815,0.75539,0.348346,2.624901,4.727509,0.205892,0.161192,3.77853,0.194153,...,0.12129,0.47911,0.102831,0.144238,0.141681,1.646608,Control,Memantine,C/S,c-CS-m
79,3415_5,0.542545,0.757917,0.350051,2.634509,4.735602,0.210526,0.165671,3.871971,0.194297,...,0.142617,0.438354,0.110614,0.155667,0.146408,1.607631,Control,Memantine,C/S,c-CS-m


In [123]:
def scatter_3d(df, x = 0, y = 1, z = 2, filename = 'simple-3d-scatter'):
    """Plot three columns of ``df`` as a 3-D marker scatter via ``py.iplot``.

    Args:
        df: DataFrame that must expose a ``color`` column (read as
            ``df.color``) in addition to the three coordinate columns.
        x, y, z: column keys of ``df`` used for the three axes.
        filename: name passed to ``py.iplot`` for the uploaded figure.
            Previously hard-coded, so every call shared the same name; pass a
            distinct value per figure to keep them apart.
            NOTE(review): uploads under one name presumably replace each
            other on the plotly account -- confirm against the plotly docs.

    Returns:
        Whatever ``py.iplot`` returns for the built figure.
    """
    point_colors = df.color

    # Use fresh names instead of rebinding the x/y/z parameters.
    xs, ys, zs = df[x], df[y], df[z]

    points = go.Scatter3d(
        x=xs,
        y=ys,
        z=zs,
        mode='markers',
        showlegend=True,
        marker=dict(
            size=10,
            color=point_colors,
            colorscale='Jet',
            showscale=True,
            # Marker outline uses the same colour values as the fill.
            line=dict(
                color=point_colors,
                width=0.5,
                colorscale='Jet',
            ),
            opacity=1.0
        )
    )

    # Zero margins on every side of the plot area.
    layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
    fig = go.Figure(data=[points], layout=layout)
    return py.iplot(fig, filename=filename)

def scatter_2d(df, x = 0, y = 1, filename = 'simple-2d-scatter'):
    """Plot two columns of ``df`` as a 2-D marker scatter via ``py.iplot``.

    Args:
        df: DataFrame that must expose a ``color`` column (read as
            ``df.color``) in addition to the two coordinate columns.
        x, y: column keys of ``df`` used for the two axes.
        filename: name passed to ``py.iplot`` for the uploaded figure.
            The default used to be ``'simple-3d-scatter'`` (copied from the
            3-D helper), so 2-D and 3-D figures were uploaded under the same
            name; it is now ``'simple-2d-scatter'``.
            NOTE(review): uploads under one name presumably replace each
            other on the plotly account -- confirm against the plotly docs.

    Returns:
        Whatever ``py.iplot`` returns for the built figure.
    """
    point_colors = df.color

    # Use fresh names instead of rebinding the x/y parameters.
    xs, ys = df[x], df[y]

    points = go.Scatter(
        x=xs,
        y=ys,
        mode='markers',
        showlegend=True,
        marker=dict(
            size=10,
            color=point_colors,
            colorscale='Jet',
            showscale=True,
            # Marker outline uses the same colour values as the fill.
            line=dict(
                color=point_colors,
                width=0.5,
                colorscale='Jet',
            ),
            opacity=1.0
        )
    )

    # Zero margins on every side of the plot area.
    layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
    fig = go.Figure(data=[points], layout=layout)
    return py.iplot(fig, filename=filename)

In [198]:
# Identifiers of the supported clustering algorithms.
key_kmeans = 'kmeans'
key_birch = 'birch'
# Column names added to the PCA frame before plotting; the scatter helpers
# read the colour column as ``df.color``.
key_colour = 'color'
key_labels = 'labels'

class Exploration(object):
    """Compare PCA projections and clusterings of a data set with its labels.

    The input frame is split into label columns (``df_class``) and the
    remaining attribute columns (``df_attributes``). ``pca`` projects the
    attributes, ``cluster`` fits a clusterer on that projection, and the
    ``pca_scatter_*`` / ``compare_class_clusters`` methods hand frames to the
    module-level ``scatter_2d`` / ``scatter_3d`` helpers.

    ``set_class_to_explore`` must be called before any method that needs the
    active label column.
    """

    def __init__(self, raw_df, target_cols):
        """Split ``raw_df`` into label columns and attribute columns.

        Args:
            raw_df: DataFrame holding both attributes and labels.
            target_cols: column names treated as labels; they are removed
                from the attribute frame.
        """
        self.df_class = raw_df[target_cols]
        self.df_attributes = raw_df.drop(target_cols, axis=1)

    def set_class_to_explore(self, key):
        """Select which column of ``df_class`` later methods use as the label."""
        self.target_key = key

    def _n_classes(self):
        """Number of distinct values in the active label column."""
        return self.df_class[self.target_key].nunique()

    def _class_int_labels(self):
        """Integer code per row for the active label column.

        Codes follow the sorted order of the distinct labels, so the
        label -> integer mapping is the same on every run (iterating a
        ``set`` of strings is not).
        """
        codes, _ = pd.factorize(self.df_class[self.target_key], sort=True)
        return np.asarray(codes, dtype=int)

    def _pca_components(self):
        """The PCA frame without the plotting columns added by the scatter methods."""
        return self.df_pca.drop([key_labels, key_colour], axis=1, errors='ignore')

    def _plot_projection(self, n):
        """Send ``df_pca`` to the 3-D helper when ``n == 3``, else to the 2-D one."""
        if n == 3:
            return scatter_3d(self.df_pca)
        return scatter_2d(self.df_pca)

    def pca(self, n):
        """Project the attributes onto ``n`` principal components.

        The result is stored in ``self.df_pca`` with integer column names
        ``0..n-1`` and the same row index as ``df_attributes``.
        """
        X = self.df_attributes
        pca = decomposition.PCA(n_components=n)
        pca.fit(X)
        self.df_pca = pd.DataFrame(pca.transform(X), index=X.index)

    def _cluster_kmeans(self, random_state=0):
        """Fit KMeans on the PCA components, one cluster per distinct label."""
        # Fit on the component columns only: after a plot, df_pca also holds
        # the label/colour columns, which must not influence the clustering.
        features = self._pca_components().values
        kmeans = KMeans(n_clusters=self._n_classes(), random_state=random_state).fit(features)
        self.cluster_results = kmeans

    def _cluster_birch(self, random_state=0):
        """Fit Birch on the PCA components, one cluster per distinct label.

        ``random_state`` is accepted for signature parity with
        ``_cluster_kmeans``; it is not passed to Birch.
        """
        features = self._pca_components().values
        birch = Birch(n_clusters=self._n_classes()).fit(features)
        self.cluster_results = birch

    def cluster(self, algo=key_kmeans):
        """Cluster the current PCA projection and store the fitted model.

        Args:
            algo: ``key_kmeans`` or ``key_birch``.

        Raises:
            ValueError: if ``algo`` is not one of the supported keys.
                Previously an unknown key was silently ignored, leaving
                ``cluster_results`` stale or undefined.
        """
        if algo == key_kmeans:
            self._cluster_kmeans()
        elif algo == key_birch:
            self._cluster_birch()
        else:
            raise ValueError(
                "Unknown clustering algorithm {!r}; expected {!r} or {!r}".format(
                    algo, key_kmeans, key_birch))

    def pca_scatter_cluster(self, n = 2, algo = key_birch):
        """Run PCA with ``n`` components, cluster it, and plot coloured by cluster id.

        Note that the default ``algo`` here is Birch, whereas ``cluster``
        defaults to KMeans.
        """
        self.pca(n)
        self.cluster(algo = algo)
        int_labels = self.cluster_results.labels_
        self.df_pca[key_labels] = ['Cluster {}'.format(l) for l in int_labels]
        self.df_pca[key_colour] = int_labels
        return self._plot_projection(n)

    def pca_scatter_class(self, n = 2):
        """Run PCA with ``n`` components and plot coloured by the active label column."""
        self.pca(n)
        class_values = self.df_class[self.target_key]
        # Assign by position (.values): assigning the Series itself would be
        # index-aligned and could mismatch rows against the projection.
        self.df_pca[key_labels] = class_values.values
        self.df_pca[key_colour] = self._class_int_labels()
        return self._plot_projection(n)

    def compare_class_clusters(self):
        """Scatter class id (x) against cluster id (y), jittered, coloured by cluster.

        Uses the most recent ``cluster_results``; each coordinate gets a
        small random offset from the module-level ``perturb`` so coincident
        points do not overlap exactly.
        """
        cluster_int_labels = np.array(self.cluster_results.labels_)
        class_int_labels = self._class_int_labels()
        df = pd.DataFrame()
        df[0] = [x + perturb() for x in class_int_labels]
        df[1] = [x + perturb() for x in cluster_int_labels]
        df[key_colour] = cluster_int_labels
        df[key_labels] = 0
        return scatter_2d(df)
        
        
def perturb(grid=0.1):
    """Return one random offset drawn uniformly from ``[-grid/2, grid/2)``.

    Args:
        grid: width of the interval the offset is taken from.

    Returns:
        A single float from ``np.random.uniform`` (global NumPy RNG state).
    """
    full_width_draw = np.random.uniform(low=-grid, high=grid)
    return full_width_draw / 2

In [193]:
# Scratch check: evaluate the same expression perturb() uses, for grid = 0.1.
grid = 0.1
np.random.uniform(-grid, grid) / 2

0.028966537813702828

In [199]:
# Columns treated as labels rather than attributes. 'MouseID' is listed as
# well, so Exploration removes it from the attribute frame.
class_labels = ['Genotype','Treatment','Behavior','class', 'MouseID']

explorer_1 = Exploration(raw_df, class_labels)

In [200]:
# 3-component PCA, clustered with the method's default algorithm (Birch) using
# one cluster per distinct 'class' value, plotted in 3-D coloured by cluster.
explorer_1.set_class_to_explore('class')
explorer_1.pca_scatter_cluster(n=3)

In [201]:
# Jittered scatter of class id (x) against cluster id (y) for the most recent clustering.
explorer_1.compare_class_clusters()

In [203]:
# 3-component PCA plotted in 3-D, coloured by the 'class' column.
explorer_1.set_class_to_explore('class')
explorer_1.pca_scatter_class(n=3)

In [202]:
# 2-component PCA plotted in 2-D, coloured by the 'class' column.
explorer_1.set_class_to_explore('class')
explorer_1.pca_scatter_class(n=2)

In [126]:
# 2-component PCA clustered with the default algorithm (Birch), plotted in 2-D coloured by cluster.
explorer_1.set_class_to_explore('class')
explorer_1.pca_scatter_cluster(n=2)

In [128]:
# Same 3-D cluster plot, with the cluster count taken from the distinct 'Behavior' values.
explorer_1.set_class_to_explore('Behavior')
explorer_1.pca_scatter_cluster(n=3)

In [129]:
# 3-component PCA plotted in 3-D, coloured by the 'Behavior' column.
explorer_1.set_class_to_explore('Behavior')
explorer_1.pca_scatter_class(n=3)