In [None]:
import scipy, patsy
from gpmultipy import dataset
import pandas as pd

In [None]:
%pylab

In [None]:
%matplotlib inline

In [None]:
# Load the three experiments and tag each with a batch id so that rows can
# still be told apart once the metadata tables are concatenated.
ds1 = dataset.DataSet("../data/normalized/ura3-pq-replicate/")
ds1.meta['batch'] = 1

ds2 = dataset.DataSet("../data/pq-osmo-control/")
# Mirror 'mM PQ' under the attribute-friendly name 'mM_PQ', then treat a
# missing concentration as 0. The fill goes through .loc rather than chained
# indexing (frame.col[mask] = ...), which assigns through an intermediate
# Series and is not guaranteed to write back to the frame.
ds2.meta['mM_PQ'] = ds2.meta['mM PQ']
ds2.meta.loc[ds2.meta['mM_PQ'].isnull(), 'mM_PQ'] = 0
ds2.meta['batch'] = 2

ds3 = dataset.DataSet("../data/pq-osmo-combo/")
# Same column mirroring and zero-fill as for ds2.
ds3.meta['mM_PQ'] = ds3.meta['mM PQ']
ds3.meta.loc[ds3.meta['mM_PQ'].isnull(), 'mM_PQ'] = 0
ds3.meta['batch'] = 3

In [None]:
# Stack the per-experiment metadata tables row-wise.
meta = pd.concat([ds1.meta, ds2.meta, ds3.meta])

# Rows without an 'M NaCl' entry receive the value 4.2.
# NOTE(review): 4.2 looks like a sentinel -- the later "remove osmo stress"
# filter keeps exactly the rows equal to it; confirm this is intended.
missing_nacl = meta['M NaCl'].isnull()
meta.loc[missing_nacl, 'M NaCl'] = 4.2

meta.head()

In [None]:
# Sorted distinct PQ concentrations, three decimals each, as one string.
', '.join('%.3lf' % conc for conc in sorted(meta.mM_PQ.unique()))

In [None]:
# Join the measurement tables side by side. `axis` is passed by keyword:
# recent pandas releases no longer accept it positionally in pd.concat.
data = pd.concat((ds1.data, ds2.data, ds3.data), axis=1)
data.head()

In [None]:
# remove osmo stress

# Keep the metadata rows whose 'M NaCl' equals 4.2 and the data columns
# selected by the same mask.
select = meta['M NaCl'].eq(4.2)
meta = meta.loc[select]
data = data.loc[:, select]

In [None]:
# remove edges

# Well ids to discard. Built from numpy ranges instead of `range + range`,
# which only works where range() returns a list (Python 2).
# NOTE(review): the ids look like the outer ring of a 10x10 grid for wells
# 101-200 and 201-300 -- confirm against the plate layout.
edge = np.concatenate([
    np.arange(101, 111), np.arange(111, 191, 10),
    np.arange(120, 191, 10), np.arange(191, 201),
    np.arange(201, 211), np.arange(211, 291, 10),
    np.arange(220, 291, 10), np.arange(291, 301),
])

select = ~meta.Well.isin(edge)

# Apply the same mask to metadata rows and data columns.
meta = meta[select]
data = data.loc[:,select]

In [None]:
# Write the filtered tables under data/; the following cell reloads that
# directory through dataset.DataSet. meta is written without its index.
data.to_csv("data/data.csv")
meta.to_csv("data/meta.csv",index=False)

In [None]:
# Reload the tables written to data/ as a single DataSet.
ds = dataset.DataSet('data/')

In [None]:
# One row per sample: metadata columns followed by the transposed data.
# `axis` is passed by keyword because recent pandas no longer accepts it
# positionally in pd.concat.
pivot = pd.concat((ds.meta, ds.data.T), axis=1, ignore_index=False)

# Round concentrations to two decimals before they are used as group keys.
pivot['mM_PQ'] = pivot['mM_PQ'].round(2)

# Keep only the ura3 strain and drop biological replicates E..N.
pivot = pivot[pivot.Strain=='ura3']
pivot = pivot[~(pivot.Bio.isin(list('EFGHIJKLMN')))]

pivot.head()

In [None]:
# Reshape to long format: one row per (sample, variable) pair, with the
# variable stored as 'time' and the value as 'od'.
# NOTE(review): value_vars is taken from ds.data.columns although pivot holds
# ds.data.T -- confirm these labels are the intended measurement columns.
melt = pd.melt(
    pivot,
    id_vars=ds.meta.columns.tolist(),
    value_vars=ds.data.columns.tolist(),
    var_name='time',
    value_name='od',
)
melt.head()

In [None]:
melt['time'] = melt['time'].astype(float)
melt['mM_PQ'] = melt['mM_PQ'].round(2)

# Same three filters as before (non-null od, ura3 only, no Bio E..N), applied
# as one mask. The explicit .copy() makes the result an independent frame so
# the column assignments below do not raise SettingWithCopyWarning.
keep = melt.od.notnull() & (melt.Strain == 'ura3') & ~melt.Bio.isin(list('EFGHIJKLMN'))
melt = melt[keep].copy()

# Replace the Bio labels with 1-based integer category codes.
melt['Bio'] = melt.Bio.astype('category').cat.codes + 1

# Work with log2(OD) from here on.
melt['od'] = np.log2(melt.od)

melt.shape

In [None]:
# Save the cleaned long-format table (log2 OD, integer Bio codes), without the index.
melt.to_csv("data/melt.csv",index=False)

In [None]:
# Growth curves at 0 mM PQ, one line per (batch, well), coloured by batch.
batch_colors = {1: 'g', 2: 'r'}  # any other batch is drawn in blue

select = melt[melt.mM_PQ==0.0]
g = select.groupby(['batch','Well'])

plt.figure(figsize=(10,6))

for k, temp in g:
    batch, well = k
    temp = temp.sort_values('time')
    plt.plot(temp.time, temp.od, c=batch_colors.get(batch, 'b'), alpha=.6)

# All curves share one axes and the limits do not depend on the group, so
# they are set once here instead of on every loop iteration.
plt.ylim(melt.od.min(), melt.od.max())

plt.ylabel("log(OD)",fontsize=20)
plt.xlabel("time (h)",fontsize=20)
plt.tight_layout()

plt.savefig("figures/batch-0mMPQ.pdf",bbox_inches='tight')

In [None]:
# One panel per PQ concentration; every (batch, Bio, well) curve is drawn in
# the panel of its concentration and coloured by batch.
g = melt.groupby(['mM_PQ','batch','Bio','Well'])
pqvals = np.sort(melt.mM_PQ.unique())

# Loop-invariant lookups, computed once.
panel_of = dict((pq, i) for i, pq in enumerate(pqvals.tolist()))
od_lo, od_hi = melt.od.min(), melt.od.max()
batch_colors = {1: 'g', 2: 'r'}  # any other batch is drawn in blue

# Two rows; at least the original five columns, more when there are more than
# ten concentrations (the fixed 2x5 grid would otherwise reject panel 11).
n_cols = max(5, (len(pqvals) + 1) // 2)

plt.figure(figsize=(20,8))

for k, temp in g:
    pq, batch, bio, well = k

    plt.subplot(2, n_cols, panel_of[pq] + 1)
    plt.title(pq)

    temp = temp.sort_values('time')

    plt.plot(temp.time, temp.od, c=batch_colors.get(batch, 'b'), alpha=.6)
    plt.ylim(od_lo, od_hi)

plt.tight_layout()
plt.savefig("figures/data.pdf")

In [None]:
# Compare the 0 mM PQ curves of batch 1 (green) and batch 2 (red).
g = pivot.groupby(['batch','mM_PQ'])

# NOTE(review): column 10 onward is assumed to be where the measurement
# columns start in `pivot` -- confirm against the number of metadata columns.
d1 = g.get_group((1,0.0)).iloc[:,10:]
d2 = g.get_group((2,0.0)).iloc[:,10:]

plt.plot(d1.columns,d1.T,c='g');
# Use d2's own columns as its x axis (previously d1.columns was reused here).
plt.plot(d2.columns,d2.T,c='r');

In [None]:
# Stack the batch-1 and batch-2 curves (0 mM PQ) into one table.
combined = pd.concat((d1,d2))
combined.head()

In [None]:
# (rows, columns) of the stacked table.
combined.shape

In [None]:
# X: everything from column 10 onward in pivot; Y: the two covariates used
# to colour and arrange the PCA plots.
# NOTE(review): 10 is assumed to be the number of metadata columns -- confirm.
X = pivot.iloc[:, 10:]
Y = pivot.loc[:, ['mM_PQ', 'batch']]

In [None]:
# Preview of the PCA input matrix.
X.head()

In [None]:
# Preview of the covariates (mM_PQ, batch) aligned with the rows of X.
Y.head()

In [None]:
import sklearn.decomposition

In [None]:
# Fit a PCA with default settings on X and keep the per-row scores.
pca = sklearn.decomposition.PCA()
reduced = pca.fit_transform(X)
reduced.shape

In [None]:
# Explained variance per component, on a logarithmic y axis.
plt.plot(pca.explained_variance_)
plt.semilogy()

In [None]:
# Per-column mean of X as stored by the fitted PCA.
plt.plot(pca.mean_)

In [None]:
# Scores on components 0 and 1, coloured by batch.
plt.scatter(reduced[:,0],reduced[:,1],c=Y.batch);

In [None]:
# Scores on components 1 and 2, coloured by batch.
plt.scatter(reduced[:,1],reduced[:,2],c=Y.batch);

In [None]:
# Scatter each pair of consecutive components (i, i+1), coloured by batch.
# Up to 20 pairs as before, capped by the number of available components so
# that column i+1 always exists (otherwise indexing raises IndexError).
n_pairs = min(20, reduced.shape[1] - 1)

for i in range(n_pairs):
    plt.figure()
    plt.xlabel(i)
    plt.ylabel(i+1)
    plt.scatter(reduced[:,i],reduced[:,i+1],c=Y.batch);

In [None]:
# Scores on components 0 and 1, coloured by PQ concentration.
plt.scatter(reduced[:,0],reduced[:,1],c=Y.mM_PQ);

In [None]:
# Map all scores back through the components; the PCA mean is not added here.
plt.plot(np.dot(reduced, pca.components_).T);

In [None]:
# Same back-projection with the PCA mean added to every curve.
plt.plot(np.dot(reduced, pca.components_).T+pca.mean_[:,None]);

In [None]:
# Back-project using only components r and above (the first r are left out),
# then add the PCA mean.
r = 1
tail_scores = reduced[:, r:]
tail_axes = pca.components_[r:, :]
plt.plot(tail_scores.dot(tail_axes).T + pca.mean_[:, None]);

In [None]:
# Components used for the partial back-projection. Earlier variants tried:
#   [0] + range(2, pca.n_components_)   and   [0] + range(2, 3)
selected = [0]

scores_sel = reduced[:, selected]
axes_sel = pca.components_[selected, :]
plt.plot(np.dot(scores_sel, axes_sel).T + pca.mean_[:, None]);

In [None]:
# Top row: raw curves per PQ concentration. Bottom row: the same samples
# back-projected from the components listed in `selected`, plus the PCA mean.
# Lines are coloured by batch.

# Loop-invariant values, computed once instead of on every row.
times = pivot.columns[10:]
n_pq = pqvals.shape[0]
panel_of = dict((pq, i) for i, pq in enumerate(pqvals.tolist()))
batch_colors = {1: 'g', 2: 'r'}  # any other batch is drawn in blue

plt.figure(figsize=(20,8))

for i in range(X.shape[0]):

    pq,batch = Y.iloc[i,]
    ind = panel_of[pq]
    color = batch_colors.get(batch, 'b')

    d = X.iloc[i,:]

    plt.subplot(2,n_pq,ind+1)
    plt.title(pq)
    plt.plot(times,d.T, c=color)
    plt.semilogy(basey=2)
    plt.ylim(2**-4,2**0)

    plt.subplot(2,n_pq,n_pq+ind+1)
    # Project this single sample (as a 1-row matrix) onto the fitted components.
    trans = pca.transform(d.values[None,:])

    plt.plot(times,np.dot(trans[:,selected], pca.components_[selected,:]).T+pca.mean_[:,None],c=color)
    plt.semilogy(basey=2)
    plt.ylim(2**-4,2**0)

plt.tight_layout()

In [None]:
# Colour returned by the current default colormap at position 0.1.
plt.get_cmap()(.1)

In [None]:
# Row 0: raw curves per PQ concentration. Rows 1..R: each sample's
# contribution from component r alone (pca.mean_ is not added back).
# Lines are coloured by batch through the default colormap.
R = 4

# Loop-invariant values, computed once.
cmap = plt.get_cmap()
times = pivot.columns[10:]
n_pq = pqvals.shape[0]
panel_of = dict((pq, i) for i, pq in enumerate(pqvals.tolist()))

plt.figure(figsize=(20,4*(R+1)))

for i in range(X.shape[0]):

    pq,batch = Y.iloc[i,]
    ind = panel_of[pq]
    color = cmap(batch*1./3)

    d = X.iloc[i,:]

    plt.subplot(R+1,n_pq,ind+1)
    plt.title(pq)
    plt.plot(times,d.T, c=color)
    plt.semilogy(basey=2)
    plt.ylim(2**-4,2**0)

    # The projection depends only on the sample, not on r, so it is computed
    # once per row rather than once per component.
    trans = pca.transform(d.values[None,:])

    for r in range(R):
        plt.subplot(R+1,n_pq,n_pq*(r+1)+ind+1)
        plt.plot(times,np.dot(trans[:,[r]], pca.components_[[r],:]).T,c=color)

plt.tight_layout()

In [None]:
from dtw import dtw

In [None]:
# Column-wise mean of the batch-1 curves.
d1.mean()

In [None]:
# Align the mean batch-1 curve to the mean batch-2 curve with dynamic time
# warping, using the infinity norm as the point-wise distance.
mean1 = d1.mean().values
mean2 = d2.mean().values

dist, cost, acc, path = dtw(mean1[:,None], mean2[:,None], dist=lambda x, y: norm(x - y, ord=inf))
# alternative tried earlier: dtw(mean1[:,None], mean2[:,None], dist=norm)

# Accumulated-cost matrix with the warping path drawn in white.
plt.imshow(acc.T, origin='lower', cmap=cm.gray, interpolation='nearest')
plt.plot(path[0], path[1], 'w')
plt.xlim((-0.5, acc.shape[0]-0.5))
plt.ylim((-0.5, acc.shape[1]-0.5))

# Both mean curves re-indexed along the warping path.
plt.figure()
plt.plot(mean1[path[0]])
plt.plot(mean2[path[1]])

In [None]:
# Build response and design matrices from the long-format table; od, time and
# mM_PQ are standardized inside the formula and the intercept is removed (+ 0).
ygp, xgp = patsy.dmatrices('standardize(od) ~ standardize(time) + standardize(mM_PQ) + batch + Bio + 0', melt)
xgp

In [None]:
# Distinct values in the last design-matrix column.
# NOTE(review): presumably the Bio term, given the formula order -- confirm.
np.unique(xgp[:,-1])

In [None]:
# Last design-matrix column against the second to last.
# NOTE(review): presumably Bio vs batch, given the formula order -- confirm.
plt.scatter(xgp[:,-1],xgp[:,-2])

In [None]:
# Wrap the design matrix in a DataFrame labelled with patsy's column names.
# This rebinds xgp, so re-running the cell fails: the DataFrame has no
# design_info attribute.
xgp = pd.DataFrame(xgp, columns = xgp.design_info.column_names)

In [None]:
# g = melt.groupby(['mM_PQ','batch','Bio'])
# pqvals = melt.mM_PQ.unique(); pqvals.sort()

In [None]:
# for k, in g:
#     print i,k

In [None]:
# NOTE(review): this cell held only the unfinished statement `g = `, which is
# a SyntaxError and stops a Run All. The right-hand side below is taken from
# the commented-out groupby two cells above -- confirm it is the intended one.
g = melt.groupby(['mM_PQ','batch','Bio'])

In [None]:


# x,y,effect,labels = ds.build(Strain='ura3',scale='range',effects=['Well','Bio','mM_PQ'])
x,y,effect,labels = ds.build(Strain='ura3',scale='range',Bio=['B','C','D'],effects=['Well','batch','Bio','mM_PQ'])

# remove early time points
x = x[6:,:]
y = y[6:,:]

# observations that are on an edge are removed
# Built from numpy ranges instead of `range + range`, which only works where
# range() returns a list (Python 2).
edge = np.concatenate([
    np.arange(101, 111), np.arange(111, 191, 10),
    np.arange(120, 191, 10), np.arange(191, 201),
    np.arange(201, 211), np.arange(211, 291, 10),
    np.arange(220, 291, 10), np.arange(291, 301),
])

# Distance from each entry of labels[0] to the nearest edge id; 0 means the
# entry is itself an edge id.
dist = np.array([min(abs(l-edge)) for l in labels[0]])
position = (dist==0).astype(int)

select = dist > 0

In [None]:
# Display the effect table returned by ds.build.
effect

In [None]:
# Drop the observations excluded by `select` from both y (columns) and
# effect (rows), and move y to log2 scale.
# Note: this cell rebinds y and effect, so it must not be run twice in a row.
effect = effect.iloc[select, :]
y = np.log2(y[:, select])

# scale to 0 within bio reps
# for i,p in enumerate(effect.Bio.unique()):
#     select = effect.Bio==p
#     y[:,select] -= y[0,select].mean()

# Standardize over all entries at once (global mean and std).
y = (y - y.mean()) / y.std()

y.shape

In [None]:
# Third entry of the labels returned by ds.build.
labels[2]

In [None]:
# Give effect a 0..n-1 index so that each group's index can be used directly
# as column positions into y.
effect.index = range(len(effect))

gpq = effect.groupby(['mM_PQ'])
gall = effect.groupby(['mM_PQ','Bio'])

In [None]:
# One panel per (mM_PQ, Bio) group, all panels on the same y range.
# NOTE(review): the preceding ds.build call lists effects as
# ['Well','batch','Bio','mM_PQ']; if labels follows that order, labels[1] and
# labels[2] are batch and Bio rather than Bio and mM_PQ -- confirm the grid.
n_rows, n_cols = len(labels[2]), len(labels[1])
y_lo, y_hi = y.min()*1.05, y.max()*1.05

plt.figure(figsize=(n_cols*4, n_rows*4))

for (pq, bio), v in gall:
    plt.subplot(n_rows, n_cols, pq*n_cols + bio + 1)

    # v.index holds the positions of this group's columns in y
    plt.plot(x[:,0], y[:,v.index])
    plt.ylim(y_lo, y_hi)

In [None]:
# One panel per mM_PQ code; curves inside a panel are coloured by Bio code.
# NOTE(review): the preceding ds.build call lists effects as
# ['Well','batch','Bio','mM_PQ']; if labels follows that order, labels[2] is
# Bio rather than mM_PQ -- confirm the number of panels.
n_panels = len(labels[2])
y_lo, y_hi = y.min()*1.05, y.max()*1.05

plt.figure(figsize=(n_panels*4,4))
cmap = plt.get_cmap()

for (pq, bio), v in gall:
    plt.subplot(1, n_panels, pq + 1)

    plt.plot(x[:,0], y[:,v.index], color = cmap((bio+1)*1./3))
    plt.ylim(y_lo, y_hi)

In [None]:
# Rebuild the design for batch 2 only, then repeat the edge filter and the
# log2 / standardization steps.
ds = dataset.DataSet('data/')

# x,y,effect,labels = ds.build(Strain='ura3',scale='range',effects=['Well','Bio','mM_PQ'])
x,y,effect,labels = ds.build(Strain='ura3',scale='range',batch=2,effects=['Well','Bio','mM_PQ'])

# remove early time points
x = x[6:,:]
y = y[6:,:]

# observations that are on an edge are removed
# Built from numpy ranges instead of `range + range`, which only works where
# range() returns a list (Python 2).
edge = np.concatenate([
    np.arange(101, 111), np.arange(111, 191, 10),
    np.arange(120, 191, 10), np.arange(191, 201),
    np.arange(201, 211), np.arange(211, 291, 10),
    np.arange(220, 291, 10), np.arange(291, 301),
])

# Distance from each entry of labels[0] to the nearest edge id; 0 means the
# entry is itself an edge id.
dist = np.array([min(abs(l-edge)) for l in labels[0]])
position = (dist==0).astype(int)

select = dist > 0
y = y[:,select]
effect = effect.iloc[select,:]

# move to log2 scale
y = np.log2(y)

# scale to 0 within bio reps
# for i,p in enumerate(effect.Bio.unique()):
#     select = effect.Bio==p
#     y[:,select] -= y[0,select].mean()

# standardize over all entries at once (global mean and std)
y = (y-y.mean())/y.std()

y.shape

In [None]:
# Give effect a 0..n-1 index so that each group's index can be used directly
# as column positions into y.
effect.index = range(len(effect))

gpq = effect.groupby(['mM_PQ'])
gall = effect.groupby(['mM_PQ','Bio'])

In [None]:
# One panel per mM_PQ code; curves inside a panel are coloured by Bio code.
n_panels = len(labels[2])
y_lo, y_hi = y.min()*1.05, y.max()*1.05

plt.figure(figsize=(n_panels*4,4))
cmap = plt.get_cmap()

for (pq, bio), v in gall:
    plt.subplot(1, n_panels, pq + 1)

    # v.index holds the positions of this group's columns in y
    plt.plot(x[:,0], y[:,v.index], color = cmap((bio+1)*1./3))
    plt.ylim(y_lo, y_hi)