In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import fcluster

Load data

In [None]:
# Move into the directory that holds the rlog tables read by the cells below.
# The relative path is resolved to an absolute one only the first time this
# cell runs; re-running the cell would otherwise resolve it against the
# already-changed working directory and fail.
if 'DATA_DIR' not in globals():
    DATA_DIR = os.path.abspath('../../../Rat-Pilot/data/tag_directories/')
os.chdir(DATA_DIR)

In [None]:
# Load the tab-separated rlog table for all peaks.
all_exp = pd.read_csv('merged-all.rlog.txt', sep='\t')

# The first column is renamed to 'PeakID'; every column from position 19
# onward is renamed to the first whitespace-delimited token of its header
# (presumably the sample name -- confirm against the file header).
colnames = {all_exp.columns[0]: 'PeakID'}
colnames.update({col: col.split()[0] for col in all_exp.columns[19:]})
all_exp = all_exp.rename(columns=colnames)

# Keep the peak coordinates, the annotation and the renamed value columns.
value_cols = list(all_exp.columns[19:])
all_exp = all_exp[['PeakID', 'Chr', 'Start', 'End', 'Annotation'] + value_cols]

# Reduce each annotation to its first token, lower-cased. The vectorised
# string accessor replaces the row-wise `x[4]` lookup, which depended on
# positional fallback for a label-indexed row, and it leaves missing
# annotations as NaN instead of raising.
all_exp['Annotation'] = all_exp['Annotation'].str.split().str[0].str.lower()
all_exp.head()

In [None]:
# Load the tab-separated rlog table for distal peaks.
distal_exp = pd.read_csv('merged-distal.rlog.txt', sep='\t')

# Rename map: first column -> 'PeakID'; columns from position 19 onward ->
# the first whitespace-delimited token of their header.
colnames = {col: col.split()[0] for col in distal_exp.columns[19:]}
colnames[distal_exp.columns[0]] = 'PeakID'
distal_exp = distal_exp.rename(columns=colnames)

# Keep the peak coordinates and the renamed value columns; the annotation
# column is selected and then dropped, so it must exist in the input.
selected = ['PeakID', 'Chr', 'Start', 'End', 'Annotation'] + list(distal_exp.columns[19:])
distal_exp = distal_exp[selected].drop(columns='Annotation')
distal_exp.head()

Separate by annotation

In [None]:
# Peaks whose (lower-cased, first-token) annotation is 'promoter-tss'; the
# annotation column is dropped once it has served as the filter.
is_promoter = all_exp['Annotation'] == 'promoter-tss'
promoter = all_exp.loc[is_promoter].drop(columns='Annotation')
promoter.head()

Perform PCA

In [None]:
# Two-component PCA, reused for both the distal and the promoter fits below.
# The seed is fixed because, with the default svd_solver='auto', scikit-learn
# may choose the randomized SVD solver for large inputs, which would make the
# components and explained-variance ratios vary between runs.
pca = PCA(n_components=2, random_state=0)

In [None]:
# Fit on the transposed value columns (everything after the first four
# columns), so each of those columns becomes one observation for the PCA.
principal_components = pca.fit_transform(distal_exp.iloc[:, 4:].transpose())

In [None]:
# Tabulate the two component scores and label each row with the name of the
# distal_exp column it was computed from.
principal_df = pd.DataFrame(principal_components, columns=['PC 1', 'PC 2'])
principal_df['Sample'] = distal_exp.columns[4:]
principal_df.head()

In [None]:
# Scatter of the two component scores, one colour per sample (no regression
# line). The axis labels take their percentages from the fitted PCA rather
# than hard-coded numbers, so they cannot drift from the data.
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[:2]
sns.lmplot(x='PC 1', y='PC 2', data=principal_df, hue='Sample', fit_reg=False)
plt.title('Distal Peaks')
plt.xlabel('PC 1 ({:.0%})'.format(pc1_ratio))
plt.ylabel('PC 2 ({:.0%})'.format(pc2_ratio));

In [None]:
# Fraction of the total variance explained by each of the two components of
# the most recent fit (the distal-peak fit when the cells are run in order).
pca.explained_variance_ratio_

In [None]:
# Refit the same PCA object on the transposed promoter value columns
# (everything after the first four columns); this replaces the previous fit.
principal_components = pca.fit_transform(promoter.iloc[:, 4:].transpose())

In [None]:
# Tabulate the two component scores and label each row with the name of the
# promoter column it was computed from (overwrites the earlier principal_df).
principal_df = pd.DataFrame(principal_components, columns=['PC 1', 'PC 2'])
principal_df['Sample'] = promoter.columns[4:]
principal_df.head()

In [None]:
# Scatter of the two component scores, one colour per sample (no regression
# line). The axis labels take their percentages from the fitted PCA rather
# than hard-coded numbers, so they cannot drift from the data.
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[:2]
sns.lmplot(x='PC 1', y='PC 2', data=principal_df, hue='Sample', fit_reg=False)
plt.title('Promoter Peaks')
plt.xlabel('PC 1 ({:.0%})'.format(pc1_ratio))
plt.ylabel('PC 2 ({:.0%})'.format(pc2_ratio));

In [None]:
# Fraction of the total variance explained by each of the two components of
# the most recent fit (the promoter-peak fit when the cells are run in order).
pca.explained_variance_ratio_

Cluster Peaks

In [None]:
# Per-peak variance across the value columns (everything after the first
# four columns), stored as a temporary 'var' column for the filter below.
distal_exp['var'] = distal_exp.iloc[:, 4:].var(axis=1)

In [None]:
# Keep the twentieth of the peaks with the largest variance, discard the
# helper column and index the result by peak id. This overwrites distal_exp,
# so the load cell must be re-run to get the full table back.
distal_exp = (
    distal_exp
    .nlargest(len(distal_exp) // 20, 'var')
    .drop(columns='var')
    .set_index('PeakID')
)

In [None]:
# Clustered heatmap of the high-variance distal peaks, with a row-colour
# strip showing flat clusters cut from the row dendrogram.
plt.rcParams['figure.figsize'] = (10, 8)
# NOTE(review): clustermap has its own `figsize` argument; confirm whether the
# rcParams setting above is what actually controls the heatmap size.

# After set_index('PeakID') the first three columns are the peak coordinates;
# the remaining columns are the values that get clustered.
value_cols = distal_exp.columns[3:]

# First pass: only its row/column linkages are needed, so the figure it draws
# is closed immediately instead of being left behind as a blank figure.
cluster = sns.clustermap(distal_exp[value_cols], z_score=0, yticklabels=False)
plt.close()

# Cut the row dendrogram at distance 2. The labels are kept in a separate
# Series (aligned to distal_exp's index) so distal_exp itself is never
# modified by this cell.
cluster_ids = pd.Series(
    fcluster(cluster.dendrogram_row.linkage, 2, 'distance'),
    index=distal_exp.index,
    name='cluster',
)

# Request exactly one colour per cluster. The default palette has a fixed
# length, so zipping it directly would leave any extra clusters without a
# colour; asking for n_colors cycles the palette when there are more clusters
# than distinct colours.
cluster_labels = cluster_ids.unique()
pal = sns.color_palette(n_colors=len(cluster_labels))
lut = dict(zip(cluster_labels, pal))
row_colors = cluster_ids.map(lut)

# Second pass: reuse the linkages from the first pass and add the colour strip.
sns.clustermap(distal_exp[value_cols],
               row_linkage=cluster.dendrogram_row.linkage,
               col_linkage=cluster.dendrogram_col.linkage,
               row_colors=row_colors, z_score=0, yticklabels=False);