---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.1'
      jupytext_version: 1.1.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

Data import

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

import dill
import time

from matplotlib.font_manager import FontProperties
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
from sklearn.metrics import r2_score

from multiprocessing import Pool
import multiprocessing as mp
from functools import partial
plt.style.use('seaborn')
dill.load_session('13 09-10Apr.jupyterData')
f = pd.read_csv('result_CAMERA.csv').rename({'Unnamed: 0': 'F_index'}, axis=1)
f.head()
fd = pd.concat([f.iloc[:, [0]], f.iloc[:, 1:9]], axis=1)
fd.columns
into = pd.concat([f.iloc[:, [0]], f.iloc[:, 9:-3]], axis=1)
into.columns
fd2 = pd.read_csv('fd_centroid.csv').rename({'Unnamed: 0':'F_index'}, axis=1)
into2 = pd.read_csv('int_centroid.csv').rename({'Unnamed: 0':'F_index'}, axis=1)
into2.head()
CAMERA = pd.concat([f.iloc[:, [0]], f.iloc[:, -3:]], axis=1)
CAMERA.columns
into.shape

Number of samples involved in each feature

fd['X1'].value_counts().sort_index(ascending=False)
fd.columns

First look at the feature distribution

plt.style.use('seaborn')
plt.figure(figsize=(15,10))
scatters = plt.scatter(fd['rt'], fd['mz'], alpha=.2)
ax = plt.gca()
ax.set_xlabel('RT(s)', fontsize=15)
ax.set_ylabel('m/z', fontsize=15)
ax.set_title('Distribution of Features', fontsize=20)
#plt.savefig('dist1.png')
scatters.get_facecolors()

Filters

into.columns
filters = into.iloc[:, 0:7]
filters = filters.iloc[:,[0,1,6,2,3,4,5]]
filters = filters.rename({'Filter_01.mzXML':  '0.10',
                          'Fiter_015.mzXML': '0.15',
                          'Filter_03.mzXML':  '0.30', 
                          'Filter_05.mzXML':  '0.50', 
                          'Filter_075.mzXML': '0.75',
                          'Filter_1.mzXML':   '1.00'}, 
                          axis=1)
filters.head()
def get_R2(x):
    # expected dilution ratios of the six filter samples
    y_i = np.array([0.1, 0.15, 0.3, 0.5, 0.75, 1])
    # normalize the six intensities (x[0] is F_index) by the undiluted 1.00 sample
    y_hat = x[1:] / x[-1]
    return r2_score(y_i, y_hat)
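A quick sanity check on a synthetic row (hypothetical values, same column labels as filters): intensities that scale exactly with the dilution ratios should give an R² of 1.

row = pd.Series([0, 0.1, 0.15, 0.3, 0.5, 0.75, 1.0],
                index=['F_index', '0.10', '0.15', '0.30', '0.50', '0.75', '1.00'])
get_R2(row)  # -> 1.0: the normalized intensities match the dilution ratios exactly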
mp.cpu_count()

Multiprocess apply

def deco(fun):
    # timing decorator: print the elapsed seconds of each call
    def wrapper(*args, **kwargs):
        s = time.time()
        results = fun(*args, **kwargs)
        print(time.time()-s)
        return results
    return wrapper

def process(data, fun, axis):
    # worker: apply fun over one chunk of the DataFrame
    return data.apply(fun, axis)

def best_number_of_processes(data):
    import multiprocessing as mp
    
    CPUs = mp.cpu_count()
    rows = data.shape[0]
    if rows < 10:
        return 2
    elif rows < 100:
        return 4
    else:
        return CPUs
@deco
def multiprocessing_apply(data, fun, processes=None, axis=1):
    from multiprocessing import Pool
    from functools import partial
    
    if not processes:
        processes = best_number_of_processes(data)
        
    with Pool(processes=processes) as pool:
        if data.isnull().sum().sum():
            print("There are NAs")
        
        # split into one chunk per process, apply in parallel, then reassemble
        data = np.array_split(data, processes)
        data = pool.map(partial(process, fun=fun, axis=axis), data)
        
        return pd.concat(data)
plt.style.use('seaborn')
r2 = multiprocessing_apply(filters.dropna(), get_R2)
sns.distplot(r2[r2>0], color='maroon', bins=50)
plt.xlabel('$R^2$')
test = filters.copy()
def subbin(nums):
    # enumerate all subsets of nums by reading each integer < 2**N as a bitmask
    results = []
    N = len(nums)
    for i in range(2**N):
        pivot = bin(i)[2:][::-1]
        re = []
        for j in range(len(pivot)):
            if int(pivot[j]):
                re.append(nums[j])
        results.append(re)
        
    return results
def subsets(nums):
    # power set by iterative doubling: each element either joins or skips every existing subset
    output = [[]]
    for i in range(len(nums)):
        for j in range(len(output)):
            output.append(output[j]+[nums[i]])
            
    return output
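Worked example: the power set of a three-element list has 2**3 = 8 subsets.

subsets([0, 1, 2])
# [[], [0], [1], [0, 1], [2], [0, 2], [1, 2], [0, 1, 2]]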

min_three = lambda x: len(x) >= 3  # keep only subsets with at least three points

def get_R2_modify(x):
    # too many missing dilutions: flag the feature as unusable
    if x.isnull().sum() > 3:
        return -1

    x = x.fillna(-1)[1:7]
    y_hat = x / x[-1]
    temp = y_hat
    y_hat = list(y_hat)
    
    y_i = [0.1, 0.15, 0.3, 0.5, 0.75, 1]
    # drop the points that were NA (marked -1) from both vectors
    pivot = 0
    for i in range(6):
        if x[i] == -1:
            y_hat.pop(i-pivot)
            y_i.pop(i-pivot)
            pivot += 1

    # search every subset of >= 3 remaining points for the best R2
    pivots = list(range(len(y_i)))
    subs = subsets(pivots)
    
    subs = list(filter(min_three, subs))
    y_i = np.array(y_i)
    y_hat = np.array(y_hat)

    # map normalized intensities back to their column names for reporting
    d = {}
    for var, conc in zip(temp.values, temp.index):
        d[var] = conc  
    re = float('-inf')
    
    for sub in subs:
        p = r2_score(y_i[sub], y_hat[sub])
        if p > re:
            re = p
            indices = []
            for y in y_hat[sub]:
                indices.append(d[y])         
    return re, indices
    #return max(map(lambda index: r2_score(y_i[index], y_hat[index]), subs))
r2 = multiprocessing_apply(test, get_R2_modify)
r2 = pd.DataFrame([r2.str[0], r2.str[1]]).T
r2[0] = r2[0].astype('float')
r2[r2[0] > .8][1].value_counts()
r2[r2[0] > .8].shape
sns.distplot(r2[r2[0] > 0][0], color='maroon')
def first_drop(x):
    # keep a feature only when its missing dilutions form a contiguous run at
    # the low-concentration end (0.50, 0.75 and 1.00 must all be present)
    x = x.fillna(-1)
    x = x[1:7]
    pivot = True
    if x[3] == -1 or x[4] == -1 or x[5] == -1:
        return False
    # scan from the highest dilution down: once a gap starts, nothing may follow it
    for i in range(5,-1,-1):
        
        if pivot:
            if x[i] == -1:
                pivot = False
        else:
            if not x[i] == -1:
                return False
    
    return True
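Two hypothetical rows to illustrate the rule: only a contiguous run of leading NAs passes.

cols = ['F_index', '0.10', '0.15', '0.30', '0.50', '0.75', '1.00']
first_drop(pd.Series([0, np.nan, np.nan, 3, 5, 7, 10], index=cols))  # True: gap only at the low end
first_drop(pd.Series([0, 1, np.nan, 3, 5, 7, 10], index=cols))       # False: hole in the middle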
def get_R2_modify(x):
    # simpler version, replacing the subset search above: drop the leading
    # missing dilutions (first_drop guarantees they are contiguous) and
    # score only the remaining points
    x = x.fillna(-1)
    x = x[1:7]

    y_i = np.array([0.1, 0.15, 0.3, 0.5, 0.75, 1])
    
    y_hat = x / x[-1]
    
    # find the first present dilution
    pivot = 0
    for i in range(6):
        if not x[i] == -1:
            pivot = i
            break
           
    y_i = y_i[pivot:]
    y_hat = y_hat[pivot:]
    return r2_score(y_i, y_hat)
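Sanity check on a synthetic row (hypothetical values): a feature missing only the 0.10 dilution is scored on the five remaining points.

row = pd.Series([0, np.nan, 0.15, 0.3, 0.5, 0.75, 1.0],
                index=['F_index', '0.10', '0.15', '0.30', '0.50', '0.75', '1.00'])
get_R2_modify(row)  # -> 1.0 on the five points left after dropping the leading gap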
test2 = filters.copy()
test2['normal'] = test2.apply(first_drop, axis=1)
print(test2.shape)
test2 = test2[test2['normal'] == True]
print(test2.shape)
test2['R2'] = test2.apply(get_R2_modify, axis=1)
test2[test2['R2'] > .8].shape
sns.distplot(test2[test2['R2']>0]['R2'])
index2 = test2[test2['R2'] > .8].index
index1 = r2[r2[0] > .8].index
index2.shape, index1.shape
(index1 ^ index2).shape
(index1 | index2).shape
(index1 & index2).shape
2589+2472
(index2 ^ index1).shape
fd['R2'] = r2[0]
fd['pass'] = fd['R2'] > .8
fd[~fd['pass']]['mz'].shape
r2[r2[0] > .8][1].value_counts()
t = pd.concat([filters.iloc[:, 1:], r2], axis=1)
t[1] = t.apply(lambda x: str(x[1]), axis=1)
t[t[0]> .8].groupby(1).mean().iloc[:, :-1].plot()
plt.xticks(rotation=30)
t[t[0]>.8][1].value_counts()
t[t[0]> .8].groupby(1).mean().iloc[:, :-1].astype('int').reindex(t[t[0]>.8][1].value_counts().index).transform(lambda x: np.log(x))
t[t[0]> .8].groupby(1).mean().iloc[:, :-1].astype('int').reindex(t[t[0]>.8][1].value_counts().index)
sum_t = t[t[0]>.1].iloc[:, :6].sum()
print(sum_t)
sum_t = sum_t / sum_t[-1]
r2_score([.1, .15, .3, .5, .75, 1.], sum_t)
t.shape
plt.figure(figsize=(15,10))
ax1 = plt.axes([0.9, 0.1, 0.1, .8])
sns.kdeplot(fd[~fd['pass']]['mz'], shade=True, color=[0.29803922, 0.44705882, 0.69019608], vertical=True, ax=ax1)
sns.kdeplot(fd[fd['pass']]['mz'], shade=True, color='darkred', vertical=True, ax=ax1)
ax1.set_xticks([])
ax1.set_yticks([])
ax1.set_ylabel('')
ax1.legend('')

ax2 = plt.axes([0.1, 0.1, 0.8, 0.8])
sns.scatterplot('rt', 'mz', hue='pass', data=fd, alpha=.3, 
                palette=[[0.29803922, 0.44705882, 0.69019608],'darkred'], 
                edgecolor='w', ax=ax2,
                legend=False)
ax2.set_xlabel('RT(s)', fontsize=22)
ax2.set_ylabel('m/z', fontsize=22)
ax2.tick_params(labelsize=15)

'''top = 900
left = 920
ax2.text(left+40, top, 'Not passed', fontsize=25)
ax2.text(left+40, top-60, 'Passed', fontsize=25)
ax2.scatter(left, top+15, c=[[0.29803922, 0.44705882, 0.69019608]], s=300)
ax2.scatter(left, top-45, c='darkred', s=300)'''

ax3 = plt.axes([0.1, 0.9, 0.8, 0.1])
sns.kdeplot(fd[~fd['pass']]['rt'], shade=True, color=[0.29803922, 0.44705882, 0.69019608], ax=ax3)
sns.kdeplot(fd[fd['pass']]['rt'], shade=True, color='darkred', ax=ax3)
ax3.set_xticks([])
ax3.set_yticks([])
ax3.set_xlabel('')
ax3.legend('')

ax4 = plt.axes([0.9, 0.9, 0.1, 0.1])
ax4.set_xticks([])
ax4.set_yticks([]) 

#plt.savefig('distribution.png')

Injection sequence

snames = ['QC 2-3', 'QC 1-1', 'QC 2-2', 'QC 4-2', 'QC 1-3', 'QC 5-2', 'QC 5-1', 'QC 2-1', 
         'Sample 4-2', 'Filter 0.3', 'Sample 1-2', 'QC 3-1',
         'Sample 5-2', 'Sample 3-2', 'Sample 1-3', 'QC 1-2',
         'Sample 5-1', 'Sample 2-2', 'Sample 5-3', 'QC 5-3',
         'Filter 0.75', 'Sample 3-3', 'Sample 4-1', 'QC 3-3',
         'Filter 1.0', 'Sample 2-3', 'Sample 4-3', 'QC 3-2', 
         'Filter 0.1', 'Filter 0.5', 'Sample 1-1', 'QC 4-3', 
         'Sample 2-1', 'Filter 0.15', 'Sample 3-1', 'QC 4-1']
snames_df = pd.DataFrame(snames, columns=['Sname'])
def classify(x):
    return x[0][:2]
snames_df['Class'] = snames_df.apply(classify, axis=1)
snames_df.head()
fp1 = FontProperties(fname="../movingdisk/GitHub/python_study/fontawesome-free-5.7.2-desktop/otfs/Font Awesome 5 Free-Solid-900.otf")

cmap = dict(QC='darkblue', Sa='peru', Fi='c')

plt.style.use(plt.style.available[6])

plt.figure(figsize=(16,7.5))

ax=plt.gca()

for i in range(snames_df.shape[0]):
    x = (i % 8)*2.5 + 1.5
    y = -(i // 8)
    cls = snames_df.iloc[i,1]
    sname = snames_df.iloc[i,0]
    ax.text(x, y, '\uf492', fontproperties=fp1, size=50, color=cmap[cls],
           ha='center', va='center')
    ax.text(x, y-0.4, sname, size=20, color=cmap[cls],
           ha='center', va='center')
ax.set_xlim(0,20.5)
ax.set_ylim(-5,0.6)
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
#plt.savefig('Injection sequence.png')
QCs_eq = into.iloc[:, [12, 7, 11, 17, 9, 20, 19, 10]]
QCs_nor = into.iloc[:, [13, 8, 21, 15, 14, 18, 16]]
Samples = into.iloc[:, 22:]
Samples_ordered = Samples.iloc[:, [10, 1, 13, 7, 2, 12, 4, 14, 8, 9, 5, 11, 0, 3, 6]]
def RSD(x):
    # return the order of magnitude of the mean and the relative standard deviation
    return int(np.log10(np.mean(x))), np.std(x, ddof=1) / np.mean(x)
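Worked example on hypothetical replicate intensities: the mean is 10^6 and the spread is 10%.

RSD(np.array([1.0e6, 1.1e6, 0.9e6]))  # -> (6, 0.1)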
QCs_nor.dropna(how='all').shape
rsd = QCs_nor.dropna(how='all').fillna(1).apply(RSD, axis=1).str[1]
sns.distplot(rsd)
index2
(rsd.index & index2).shape
index2.shape
rsd.shape
cv = rsd.loc[index2&rsd.index]
index2.shape
cv = pd.cut(cv, bins=(0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, np.inf), labels=('<5', '5-10', '10-15', '15-20', '20-25', '25-30', '>30')).value_counts()
cv = cv.sort_index()
cv
plt.bar(x=range(cv.shape[0]), height=cv)
plt.gca().set_xticklabels(cv.index.values)
fd['pass'].value_counts()
rsd[rsd<.2].shape
from matplotlib_venn import venn2
A = set(t[t[0] > .8].index)
B = set(rsd[rsd<.2].index)
venn2([A, B], set_labels=('$R^2>.8$', '$RSD<.2$'))
ax = plt.gca()
C = set(test2[test2['R2'] > .8].index)
B = set(rsd[rsd<.2].index)
venn2([C, B], set_labels=('$R^2>.8$', '$RSD<.2$'))
ax = plt.gca()
(test2[test2['R2'] > .8].index & rsd[rsd<.2].index).shape
(t[t[0] > .8].index & rsd[rsd<.2].index).shape
((test2[test2['R2'] > .8].index & rsd[rsd<.2].index) | (t[t[0] > .8].index & rsd[rsd<.2].index)).shape
newindex = test2[test2['R2'] > .8].index
newindex.shape
test2[test2['R2'] > .8].index.shape
t[t[0] > .8].index.shape

3D Peak Figure

conc = '1.00'
threeD = pd.merge(fd, filters, on='F_index').loc[:,['mz', 'rt', conc]].dropna()

mzmin, mzmax = threeD['mz'].min(), threeD['mz'].max()

mzmin = round(mzmin)
mzmax = round(mzmax)

rt_range = np.arange(1, 1501, 1500/500)
mz_range = np.arange(mzmin, mzmax, (mzmax-mzmin)/500)

threeD['RT'] = pd.cut(threeD['rt'], bins=500, labels=rt_range)
threeD['mz'] = pd.cut(threeD['mz'], bins=500, labels=mz_range)

threeD = threeD[[conc, 'RT', 'mz']]

x = rt_range
y = mz_range
x, y = np.meshgrid(x, y)

mat = threeD.groupby(['RT', 'mz']).mean()

mat = mat.unstack().fillna(1).T

z = mat.values.ravel().reshape(500,500)

#z = np.log(z)
z.shape
plt.contourf(x, y, z, cmap=plt.cm.BuPu)
z.min()
plt.hist(z[z!=0], bins=20)
from mpl_toolkits.mplot3d import axes3d, Axes3D #<-- Note the capitalization! 
fig = plt.figure(figsize=(12,10))

ax = Axes3D(fig)

surf = ax.plot_surface(x, y, z, cmap=plt.cm.Greens, 
                       rstride=10, cstride=10, 
                      edgecolor=None, linewidth=0)

fig.colorbar(surf, shrink=.5)

os.system('mkdir peak3D')
for angle in range(0, 360, 2):
    ax.view_init(30, angle)
    plt.savefig('peak3D/peak_%.3d.png' % angle)
os.system('magick -delay 10 -loop 0 peak3D/peak_*.png peak3D.gif')
from mpl_toolkits.mplot3d import axes3d, Axes3D
plt.style.use('ggplot')
fig = plt.figure(figsize=(10,6))

ax = Axes3D(fig)

surf = ax.plot_surface(x, y, z, 
                       cmap=plt.cm.BuGn, 
                       rstride=10, cstride=10,
                      edgecolor=None, linewidth=0)
fig.set_facecolor('lightgrey')
fig.colorbar(surf, shrink=.6)
ax.set_facecolor('w')

os.system('mkdir peak3D_v1.1')
for angle in range(0, 360, 2):
    ax.view_init(30, angle)
    ax.set_xlabel('Retention Time', fontsize=20)
    ax.set_ylabel('m/z', fontsize=20)
    ax.set_zlabel('Intensity', fontsize=20)
    
    fig.savefig('peak3D_v1.1/peak_%.3d.png' % angle)

os.system('magick -delay 10 -loop 0 peak3D_v1.1/peak_*.png peak3D_v1.1.gif')
dill.dump_session('2_17.pyData')
peaktable = pd.read_csv('cp_centroid.csv').rename({'Unnamed: 0':'Peak_index'}, axis=1)

peaktable = peaktable[peaktable['sample'] == 5]

peaktable = peaktable[['mz', 'rt', 'into']]

peaktable.isnull().sum()
bins_rt = 1000
bins_mz = 500

rts = np.arange(peaktable['rt'].min(), peaktable['rt'].max(), (peaktable['rt'].max()-peaktable['rt'].min())/bins_rt)
mzs = np.arange(peaktable['mz'].min(), peaktable['mz'].max(), (peaktable['mz'].max()-peaktable['mz'].min())/bins_mz)

peaktable['rt'] = pd.cut(peaktable['rt'], bins_rt)
peaktable['mz'] = pd.cut(peaktable['mz'], bins_mz)
def get_med(x):
    # representative value for each (mz, rt) bin: the interval's right edge
    return x[0].right, x[1].right
peaktable['mz'], peaktable['rt'] = peaktable.apply(get_med, axis=1).str
raw_mat = peaktable.groupby(['rt','mz']).mean()
raw_mat = raw_mat.unstack()
raw_mat = raw_mat.fillna(1)
raw_mat.shape
rt0 = peaktable['rt'].sort_values().unique()
mz0 = peaktable['mz'].sort_values().unique()
rt0, mz0 = np.meshgrid(rt0, mz0)
z0 = raw_mat.values.ravel().reshape(844,452).T
z0 = np.log(z0)
rt0.shape, mz0.shape, z0.shape
fig = plt.figure(figsize=(12,10))

ax = Axes3D(fig)

surf = ax.plot_surface(rt0, mz0, z0, cmap=plt.cm.Greens, 
                       #rstride=1, cstride=1, 
                      #edgecolor=None, linewidth=0
                      )

fig.colorbar(surf, shrink=.5)

QC

  • DataSets: QCs_eq, QCs_nor, Samples, Samples_ordered
  • Indexes: noise_index, pass_index, pass_modified_index
  • Fun: RSD (returns 'mag' and 'RSD')
QCs_nor.columns
def RSD_modified(x):
    # like RSD(), but also return the mean itself
    return int(np.log10(np.mean(x))), np.mean(x), np.std(x, ddof=1) / np.mean(x)
plt.style.use('seaborn')
pass_index = test2[test2['R2'] > .8].index
QCs_total = QCs_nor.dropna().copy()
QCs_pass = QCs_nor.copy().loc[pass_index].dropna()
QCs_noise = QCs_nor.copy().loc[QCs_nor.index ^ pass_index].dropna()
QCs_noise.shape
QCs_pass.shape
QCs_total = QCs_nor.dropna()
QCs_pass = QCs_nor.loc[pass_index | pass_modified_index].dropna()
QCs_noise = QCs_nor.loc[noise_index].dropna()
QCs_total['mag'], QCs_total['mean'], QCs_total['RSD'] = QCs_total.apply(RSD_modified, axis=1).str
QCs_pass['mag'], QCs_pass['mean'], QCs_pass['RSD'] = QCs_pass.apply(RSD_modified, axis=1).str
QCs_noise['mag'], QCs_noise['mean'], QCs_noise['RSD'] = QCs_noise.apply(RSD_modified, axis=1).str

QCs_total.shape, QCs_pass.shape, QCs_noise.shape
sns.distplot(QCs_noise['RSD'], label='noise', hist=False, kde_kws={'shade':'True'})
sns.distplot(QCs_pass['RSD'], label='passed', hist=False, kde_kws={'shade':'True'})
plt.legend(fontsize=20)
ax= plt.gca()
#ax.set_xlim(-.2,1)
QCs_noise[QCs_noise['RSD'] < .2].shape[0]
QCs_pass[QCs_pass['RSD'] < .2].shape[0]
plt.figure(figsize=(15,6))
sns.boxplot('mag', 'RSD', data=QCs_pass, whis=np.inf)
sns.stripplot('mag', 'RSD', data=QCs_pass, jitter=True, edgecolor='k', linewidth=1, alpha=.25, dodge=True)
#plt.savefig('box_strip.png')
plt.figure(figsize=(15,6))
sns.boxplot('mag', 'RSD', data=QCs_noise, whis=np.inf)
sns.stripplot('mag', 'RSD', data=QCs_noise, jitter=True, edgecolor='k', linewidth=1, alpha=.25, dodge=True)
#plt.savefig('box_strip_noise.png')
plt.style.use('seaborn')
fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

sns.kdeplot(np.log(QCs_pass['mean']), QCs_pass['RSD'], shade=True, cmap=plt.cm.Blues, cut=.5, ax=ax1)
ax1.scatter(np.log(QCs_pass['mean']), QCs_pass['RSD'], alpha=.2, s=10)

sns.kdeplot(np.log(QCs_noise['mean']), QCs_noise['RSD'], shade=True, cmap=plt.cm.Blues, cut=.5, ax=ax2)
ax2.scatter(np.log(QCs_noise['mean']), QCs_noise['RSD'], alpha=.2, s=10)

for i, ax in enumerate([ax1, ax2]):
    ax.set_xticks(np.log([1e5, 1e6, 1e7, 1e8, 1e9]))
    ax.set_xticklabels(['$10^5$', '$10^6$', '$10^7$', '$10^8$', '$10^9$'], fontsize=15)
    ax.set_xlim(11.5, 22)
    ax.set_ylim(0,2)
    ax.yaxis.set_tick_params(labelsize=15)
    ax.set_ylabel('RSD', fontsize=22)
    ax.set_xlabel('')
ax2.set_ylabel('')
ax2.set_yticklabels([])

# ax3
n_peaks_pass = QCs_pass.groupby('mag').aggregate('count').values.T[0] / QCs_pass.shape[0] *100
n_peaks_pass = np.round(n_peaks_pass, 2)
percent_pass = []
for n in n_peaks_pass:
    percent_pass.append('%.2f%%' % n)
bars = ax3.bar(np.arange(6), n_peaks_pass, alpha=1)
c = bars[0].get_facecolor()
for i in range(6):
    ax3.text(i, n_peaks_pass[i]+1.5, percent_pass[i], color=c[:3], ha='center', va='center', fontsize=15)
    
pass_02 = []
for i in range(6):
    per = QCs_pass[(QCs_pass['RSD'] <.2) & (QCs_pass['mag'] == i+4)].shape[0]
    per = per / QCs_pass.groupby('mag').aggregate('count').values.T[0][i] * 100
    pass_02.append(per)
    
ax3.plot(np.arange(6), pass_02, c='maroon', marker='o')

pass_02_str = []
for per in pass_02:
    pass_02_str.append('%.2f%%' % per)

loc_off = [[0,10], [-.5,0], [-.4,3], [-.3,4], [.5,-2], [-.5,0]]
for i in range(6):
    ax3.text(i+loc_off[i][0], pass_02[i]+loc_off[i][1], pass_02_str[i],
            va='center', ha='center',
            fontsize=15, color='maroon')
ax3.set_xticks(np.arange(6))
ax3.set_xticklabels(['$10^4$', '$10^5$', '$10^6$', '$10^7$', '$10^8$', '$10^9$'], fontsize=15)
ax3.set_ylim(0, 105)

# ax4 (use the axes explicitly rather than relying on plt's current axes)
n_peaks_noise = QCs_noise.groupby('mag').aggregate('count').values.T[0] / QCs_noise.shape[0] *100
n_peaks_noise = np.round(n_peaks_noise, 2)
percent_noise = []
length = len(n_peaks_noise)

for n in n_peaks_noise:
    percent_noise.append('%.2f%%' % n)
    
bars = ax4.bar(np.arange(length), n_peaks_noise, alpha=1)
c = bars[0].get_facecolor()
for i in range(length):
    ax4.text(i, n_peaks_noise[i]+1.5, percent_noise[i], color=c[:3], ha='center', va='center', fontsize=15)
    
noise_02 = []
for i in range(length):
    per = QCs_noise[(QCs_noise['RSD'] <.2) & (QCs_noise['mag'] == i+5)].shape[0]
    per = per / QCs_noise.groupby('mag').aggregate('count').values.T[0][i] * 100
    noise_02.append(per)
    
ax4.plot(np.arange(length), noise_02, c='maroon', marker='o')

noise_02_str = []
for per in noise_02:
    noise_02_str.append('%.2f%%' % per)

loc_off = [[-.2,-4], [.5,-1], [.4,2], [.5,3], [.5,2], [.5,0], [0.5,4]]
for i in range(length):
    ax4.text(i+loc_off[i][0], noise_02[i]+loc_off[i][1], noise_02_str[i],
            va='center', ha='center',
            fontsize=15, color='maroon')

ax4.set_xticks(np.arange(length))
ax4.set_xticklabels(['$10^5$', '$10^6$', '$10^7$', '$10^8$', '$10^9$','$10^{10}$'], fontsize=15)

for i, ax in enumerate([ax3, ax4]):
    ax.set_ylabel('% peaks', fontsize=22)
    ax.yaxis.set_tick_params(labelsize=15)
    labels = ['(Passed)', '(Not Passed)']
    ax.set_xlabel('Intensities\n%s' % labels[i], fontsize=22)
    

rect = Rectangle((4.5, 85), 1, 2.5)
ax4.add_patch(rect)
ax4.text(6, 86, '%peaks', ha='center', va='center', fontsize=15)
line = Line2D([4.5, 5.5], [77, 77], color='maroon', marker='o')
ax4.text(6.1, 76, 'RSD \nwithin 20%', ha='center', va='center', fontsize=15)
ax4.add_line(line)
ax4.set_ylabel('')
ax4.set_yticklabels([])
ax4.set_ylim(0,105)

plt.tight_layout()
#plt.savefig('compare.png')
QCs_pass[QCs_pass['RSD'] >.2].shape
QCs_final = QCs_pass[QCs_pass['RSD'] <.2]
QCs_final = QCs_eq.loc[QCs_final.index].join(QCs_final).dropna().iloc[:, :15]
QCs_final.shape
QCs_final.columns
from sklearn.decomposition import PCA
QCs_PCA = PCA(n_components=8).fit_transform(np.log(QCs_final.T))
QCs_PCA.shape
Samples_final = Samples.loc[QCs_final.index]  # select rows by label, not position
allsamples = QCs_final.join(Samples_final).dropna()

seq = [0, 1, 2, 3, 4, 5, 6, 7, 
      25, 16, 8, 
      28, 22, 17, 9,
      27, 19, 29, 10, 
      23, 24, 11, 
      20, 26, 12, 
      15, 13, 
      18, 21, 14]

allsamples = allsamples.iloc[:,seq]
allsamples.columns
PCA_allsamples = PCA(n_components=8).fit_transform(np.log(allsamples.T))
def deal_names(cols):
    re = []
    for i, name in enumerate(cols):
        if i in [0, 1, 2]:
            re.append('QC-eq-%s' % str(i+1))
        elif i in [3, 4, 5, 6, 7]:
            re.append('QC-eq')
        elif name.startswith('Sa'):
            name = name[:10]
            re.append('Sa-%s' % name[-3])
        else:
            re.append('QC-nor')
    
    return re
names = deal_names(allsamples.columns)
colormap = {'QC-eq-1': ['darkred',True], 'QC-eq-2':['red', True], 'QC-eq-3':['lightcoral', True], 
            'QC-eq': ['gold', True], 'QC-nor': ['olive', True], 
           'Sa-1': ['plum', True], 'Sa-2': ['skyblue', True], 'Sa-3': ['teal', True], 'Sa-4': ['forestgreen', True], 'Sa-5': ['peru', True]}
colormap
pca_mat = pd.DataFrame(PCA_allsamples)

pca_mat['class'] = names
pca_mat['cate'] = pca_mat['class'].str[:2]
pca_mat.head()
plt.figure(figsize=(12,8))
scatters = sns.scatterplot(0, 1, hue='class', style='cate', data=pca_mat, s=200)
plt.legend(fontsize=18, bbox_to_anchor=[1,1, .2, 0])
plt.xlabel('Component 1', fontsize=18)
plt.ylabel('Component 2', fontsize=18)

#plt.savefig('pca_all.png')
def deal_names(cols):
    re = []
    for i, name in enumerate(cols):
        if i in [0, 1, 2]:
            re.append('QC-eq-%s' % str(i+1))
        elif i in [3, 4, 5, 6, 7]:
            re.append('QC-eq')
        elif name.startswith('Sa'):
            name = name[:10]
            re.append('Sa-%s' % name[-3])
        else:
            re.append('QC-nor')
    
    return re

def draw_pca(PCA_allsamples):
    names = deal_names(allsamples.columns)
    colormap = {'QC-eq-1': ['darkred',True], 'QC-eq-2':['red', True], 'QC-eq-3':['lightcoral', True], 
            'QC-eq': ['gold', True], 'QC-nor': ['olive', True], 
           'Sa-1': ['plum', True], 'Sa-2': ['skyblue', True], 'Sa-3': ['teal', True], 'Sa-4': ['forestgreen', True], 'Sa-5': ['peru', True]}
    pca_mat = pd.DataFrame(PCA_allsamples)

    pca_mat['class'] = names
    pca_mat['cate'] = pca_mat['class'].str[:2]
    
    plt.figure(figsize=(12,8))
    scatters = sns.scatterplot(0, 1, hue='class', style='cate', data=pca_mat, s=200)
    plt.legend(fontsize=18, bbox_to_anchor=[1,1, .2, 0])
    plt.xlabel('Component 1', fontsize=18)
    plt.ylabel('Component 2', fontsize=18) 
draw_pca(PCA().fit_transform(allsamples.T))
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
ax3 = fig.add_subplot(313)

ax1.plot(pca_mat[0], ':o', c='k', label='Samples')
ax1.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][0], marker='*', s=300, c='tab:red', zorder=10, label='QCs')
ax1.set_xticklabels([])
ax1.set_title('PCA component 1', fontsize=20)
ax1.hlines(pca_mat[pca_mat['cate'] == 'QC'][0].mean(), -10, 40, linestyle='--', color='forestgreen', label='mean')
ax1.set_xlim(-1,32)
ax1.legend(fontsize=15)

ax2.plot(pca_mat[1], ':o', c='k', )
ax2.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][1], marker='*', s=300, c='tab:red', zorder=10)
ax2.set_xticklabels([])
ax2.set_title('PCA component 2', fontsize=20)
ax2.hlines(pca_mat[pca_mat['cate'] == 'QC'][1].mean(), -10, 40, linestyle='--', color='forestgreen')
ax2.set_xlim(-1,32)

ax3.plot(pca_mat[2], ':o', c='k', )
ax3.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][2], marker='*', s=300, c='tab:red', zorder=10)
ax3.set_title('PCA component 3', fontsize=20)
ax3.hlines(pca_mat[pca_mat['cate'] == 'QC'][2].mean(), -10, 40, linestyle='--', color='forestgreen')
ax3.set_xlim(-1,32)
#plt.savefig('PCA_123.png')
lm_test = test.sort_values(by='R2_modified', ascending=False).iloc[[2]][['0.10', '0.15', '0.30', '0.50', '0.75', '1.00']].T
lm_test['x'] = [0.1, 0.15, 0.3, 0.5, 0.75, 1]
lm_test = lm_test.rename({lm_test.columns[0]:'y'}, axis=1)
test.sort_values(by='R2_modified', ascending=False).iloc[[2000]]
sns.lmplot('x', 'y', data=lm_test)
plt.yticks([])
plt.xticks([])
plt.xlabel('')
plt.ylabel('')
plt.savefig('filter_illus.png')
def get_R2_final(x):
    x = x.fillna(-1)

    y_i = np.array([0.1, 0.15, 0.3, 0.5, 0.75, 1])
    
    y_hat = x / x[-1]
    
    pivot = 0
    for i in range(6):
        if not  x[i] == -1:
            pivot = i
            break
           
    y_i = y_i[pivot:]
    y_hat = y_hat[pivot:]
    return r2_score(y_i, y_hat)
fig = plt.figure(figsize=(16,12))
axs = []
df = test.sort_values(by='R2_modified', ascending=False)[:2000].sample(12)
indices = df.index[:12]
for i in range(12):
    axs.append(fig.add_subplot(3, 4, i+1))

for i, ax in enumerate(axs):
    lm_test = df.loc[[indices[i]]][['0.10', '0.15', '0.30', '0.50', '0.75', '1.00']].T
    lm_test['x'] = [0.1, 0.15, 0.3, 0.5, 0.75, 1]
    lm_test = lm_test.rename({lm_test.columns[0]:'y'}, axis=1)
    lm_test['y'] = lm_test['y'] / lm_test['y'][-1]
    
    sns.regplot('x', 'y', data=lm_test, ax=ax)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xlim(-.05, 1.05)
    ax.set_ylim(-.05, 1.05)
    
    R2 = get_R2_final(lm_test['y'])
    R2 = round(R2, 5)
    
    index = indices[i]
    
    ax.text(.2, .8, '$R^2$: %s\nNo.%s' % (R2, index), fontsize=20)
    
plt.tight_layout()
plt.savefig('12lmplot.png', dpi=96)
xx = np.linspace(0, 25, 1000)
yy = np.zeros(1000)
for i, x in enumerate(xx):
    if x < 2:
        yy[i] = 2
    if 2 <= x < 8:
        yy[i] = 2 + (x-2) * 8
    if 8 <= x < 16:
        yy[i] = 50
    if 16 <= x < 17:
        yy[i] = 50 + (x-16) * 48
    if 17 <= x < 21:
        yy[i] = 98
    if 21 <= x < 21.1:
        yy[i] = 98 - (x-21) * 960
    if x >= 21.1:
        yy[i] = 2
plt.figure(figsize=(16, 4))
plt.plot(xx, yy, c='darkblue')
plt.fill_between(xx, 0, yy, facecolor='forestgreen', alpha=.5, label='organic phase')
plt.fill_between(xx, yy, 100, facecolor='beige', alpha=1, label='inorganic phase')
plt.xlim(0,25)
plt.ylim(0,100)
plt.legend()
plt.xlabel('Time (min)')
plt.ylabel('%')
plt.tight_layout()
plt.savefig('gradient.png')
now = time.strftime('%H %M-%d%h')
dill.dump_session('%s.jupyterData' % now)

PCA

def deal_names(cols):
    re = []
    for i, name in enumerate(cols):
        if i in [0, 1, 2]:
            re.append('QC-eq-%s' % str(i+1))
        elif i in [3, 4, 5, 6, 7]:
            re.append('QC-eq')
        elif name.startswith('Sa'):
            name = name[:10]
            re.append('Sa-%s' % name[-3])
        else:
            re.append('QC-nor')
    
    return re

def draw_pca(mat):
    PCA_allsamples = PCA(whiten=False).fit_transform(mat)
    names = deal_names(allsamples.columns)
    colormap = {'QC-eq-1': ['darkred',True], 'QC-eq-2':['red', True], 'QC-eq-3':['lightcoral', True], 
            'QC-eq': ['gold', True], 'QC-nor': ['olive', True], 
           'Sa-1': ['plum', True], 'Sa-2': ['skyblue', True], 'Sa-3': ['teal', True], 'Sa-4': ['forestgreen', True], 'Sa-5': ['peru', True]}
    pca_mat = pd.DataFrame(PCA_allsamples)

    pca_mat['class'] = names
    pca_mat['cate'] = pca_mat['class'].str[:2]
    
    fig, axs = plt.subplots(1,2, figsize=(16,6))
    
    ax = axs[0]
    scatters = sns.scatterplot(0, 1, hue='class', style='cate', data=pca_mat, s=200, ax=ax)
    ax.legend(fontsize=18, bbox_to_anchor=[1,1, .2, 0])
    ax.set_xlabel('Component 1', fontsize=18)
    ax.set_ylabel('Component 2', fontsize=18) 
    
    ax = axs[1]
    sns.scatterplot(0, 1, hue='class', style='class', data=pca_mat[pca_mat['cate']=='QC'], s=200, ax=ax)
    ax.legend(fontsize=18)
    ax.set_xlabel('Component 1', fontsize=18)
    ax.set_ylabel('Component 2', fontsize=18)
    plt.tight_layout()
    return pca_mat

def draw_trend(pca_mat):
    fig = plt.figure(figsize=(16,12))
    ax1 = fig.add_subplot(311)
    ax2 = fig.add_subplot(312)
    ax3 = fig.add_subplot(313)

    ax1.plot(pca_mat[0], ':o', c='k', label='Samples')
    ax1.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][0], marker='*', s=300, c='tab:red', zorder=10, label='QCs')
    ax1.set_xticklabels([])
    ax1.set_title('PCA component 1', fontsize=20)
    ax1.hlines(pca_mat[pca_mat['cate'] == 'QC'][0].mean(), -10, 40, linestyle='--', color='forestgreen', label='mean')
    ax1.set_xlim(-1,32)
    ax1.legend(fontsize=15)

    ax2.plot(pca_mat[1], ':o', c='k', )
    ax2.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][1], marker='*', s=300, c='tab:red', zorder=10)
    ax2.set_xticklabels([])
    ax2.set_title('PCA component 2', fontsize=20)
    ax2.hlines(pca_mat[pca_mat['cate'] == 'QC'][1].mean(), -10, 40, linestyle='--', color='forestgreen')
    ax2.set_xlim(-1,32)

    ax3.plot(pca_mat[2], ':o', c='k', )
    ax3.scatter(pca_mat[pca_mat['cate'] == 'QC'].index, pca_mat[pca_mat['cate'] == 'QC'][2], marker='*', s=300, c='tab:red', zorder=10)
    ax3.set_title('PCA component 3', fontsize=20)
    ax3.hlines(pca_mat[pca_mat['cate'] == 'QC'][2].mean(), -10, 40, linestyle='--', color='forestgreen')
    ax3.set_xlim(-1,32)
pca_mat = draw_pca(allsamples.T)
draw_trend(pca_mat)
pca_mat = draw_pca(np.log(allsamples.T))
draw_trend(pca_mat)
f = allsamples.T.values
f

Balance by conc (total intensity per sample), dropna

f1 = f / (f.sum(1) / f.sum(1).max())[:, np.newaxis]
f1.shape
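A toy check of this row normalization (hypothetical 2x2 matrix): every sample (row) is rescaled so its total matches the largest row total.

toy = np.array([[1., 1.], [2., 2.]])
toy / (toy.sum(1) / toy.sum(1).max())[:, np.newaxis]
# -> [[2., 2.], [2., 2.]]: both rows now sum to the maximum total, 4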

Balance only

f11 = f1
pca_mat = draw_pca(f11)
draw_trend(pca_mat)

Balance first, then log

f12 = np.log(f1)
pca_mat = draw_pca(f12)  # f12 is already log-transformed; don't log it a second time
draw_trend(pca_mat)

Log first, then balance

f13 = np.log(f)
f13 = f13 / (f13.sum(1) / f13.sum(1).max())[:, np.newaxis]
pca_mat = draw_pca(f13)
draw_trend(pca_mat)

Balance feature intensity, dropna

Balance features first

sum_median = np.median(f.max(0))
np.median(f.sum(0))
factors = f.sum(0) / sum_median
factors.shape
f21 = (f / factors)
plt.plot(f.sum(0))
pca_mat = draw_pca(f21)
draw_trend(pca_mat)

Log after balancing

f22 = np.log(f21)
pca_mat = draw_pca(f22)
draw_trend(pca_mat)

Log first, then balance features

f.shape

StandardScaler

from sklearn.preprocessing import StandardScaler
f0 = StandardScaler().fit_transform(f)

StandardScaler only

pca_mat = draw_pca(f0)
draw_trend(pca_mat)

StandardScaler after log

f01 = StandardScaler().fit_transform(np.log(f))
pca_mat = draw_pca(f01)
draw_trend(pca_mat)

StandardScaler after balancing conc (no log)

Perhaps the best option

f3 = f / (f.sum(1) / f.sum(1).max())[:, np.newaxis]
f33 = StandardScaler().fit_transform(f3)
pca_mat = draw_pca(f33);
plt.savefig('best_pca.png')
draw_trend(pca_mat)
plt.savefig('best_trend.png')

Balance conc, then log, then StandardScaler

f34 = StandardScaler().fit_transform(np.log(f3))
pca_mat = draw_pca(f34)
def draw_all(df):
    f = df.T.values
    f = f / (f.sum(1) / f.sum(1).max())[:, np.newaxis]
    f = StandardScaler().fit_transform(f)
    
    pca_mat = draw_pca(f)
    draw_trend(pca_mat)

Testing different pass sets

1361 dropna

draw_all(allsamples)
i5000 = t[t[0] > .8].index
i2000 = test2[test2['R2'] > .8].index
ints = into.iloc[:, 1:]
ints = ints.loc[:, allsamples.columns]
ints.loc[i2000]
QCs_nor.columns
QCs_nor.dropna(how='all').shape
index = allsamples.index
qc = QCs_nor.copy().dropna().T.values
pivot = QCs_nor.copy().loc[index].T.values
qc = qc / (pivot.sum(1) / pivot.sum(1).max())[:, np.newaxis]
qc = pd.DataFrame(qc, index=QCs_nor.dropna().columns, columns=QCs_nor.dropna().index).T
qc['mag'], qc['mean'], qc['RSD'] = qc.apply(RSD_modified, axis=1).str
QCs_pass[QCs_pass['RSD']<.2].shape
allsamples.index
QCs_pass.head()
_ = qc.loc[qc.index & i2000]
_[_['RSD'] < .2].shape
_.shape
8591 in _.index
_.shape

1426 dropna

draw_all(ints.loc[_[_['RSD'] < .2].index].dropna())
draw_all(ints.loc[_[_['RSD'] < .2].index].fillna(0))
f_t = pd.DataFrame(f3, index=allsamples.columns, columns=allsamples.index).T
f_t.columns = names
f_t.columns
s1 = f_t['Sa-1']
s2 = f_t['Sa-2']
s3 = f_t['Sa-3']
s4 = f_t['Sa-4']
s5 = f_t['Sa-5']

T-test

s1 and s2

import statsmodels.api as sm
a, b = pd.concat([s1.T, s2.T]), pd.concat([s3.T, s4.T, s5.T])
a, b = s1.T, s2.T
ttest = sm.stats.ttest_ind(a, b)

pvalues_fdr_bh = sm.stats.multipletests(ttest[1], alpha=.05, method='fdr_bh')[1]
pvalues_fdr_bh

pvalues = -np.log10(pvalues_fdr_bh)

times = np.log2(a.mean(0) / b.mean(0)).values

ttest[1][ttest[1] < .05].shape, pvalues_fdr_bh[pvalues_fdr_bh < .05].shape
volcano = pd.DataFrame([pvalues_fdr_bh, a.mean(0)/b.mean(0)]).T
volcano.index = a.columns
volcano.head()
times.shape, pvalues.shape
def volcano_plot(times, pvalues, **kwargs):
    # thin wrapper so the volcano scatter can be drawn with different styles below
    sns.scatterplot(times, pvalues, **kwargs)
plt.figure(figsize=(12,8))
volcano_plot(times, pvalues, zorder=10)
fc = 2
plt.fill_between(np.linspace(-100, -np.log2(fc)), np.ones([50])*-np.log10(5e-2), np.ones([50])*-np.log10(5e-6), facecolor='coral', alpha=.5, zorder=1)
plt.fill_between(np.linspace(np.log2(fc), 100), np.ones([50])*-np.log10(5e-2), np.ones([50])*-np.log10(5e-6), facecolor='coral', alpha=.5, zorder=1)
plt.xlim(-7,7)
plt.ylim(-.5, 5.2)
plt.text(-6.5, 2, 'p<0.05\nfold change > %s'% fc, fontsize=16, ha='left', va='center', color='r')
plt.text(3.7, 2, 'p<0.05\nfold change > %s' % fc, fontsize=16, ha='left', va='center', color='r')
plt.xlabel('$log_{2}(Fold Change)$', fontsize=15)
plt.ylabel('$-log_{10}(p-value)$', fontsize=15)
plt.savefig('volcano2.png')
plt.style.use('seaborn')
plt.figure(figsize=(10,8))
volcano_plot(times, pvalues)
ttest[1].argsort()
s1.iloc[273]
s2.iloc[273]
df = pd.read_csv('OPLS.csv', index_col=0)

PLS

from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=2)
res = pls.fit(pd.concat([s1.T, s2.T, s3.T, s4.T, s5.T]), [0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1])
pls_mat = pd.DataFrame(res.x_scores_)

pls_mat['c'] = [0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1]
plt.figure(figsize=(8,6))
sns.scatterplot(0, 1, data=pls_mat, hue='c', s=200, legend=False)
plt.savefig('pls.png')
VIPs = pd.read_csv('vip_from_pls_da.csv', index_col=0)

VIPs[(VIPs['comp 1'] > 1)&(VIPs['comp 2'] > 1)]

RandomForest

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=20000, n_jobs=-1, max_depth=1)
rf.fit(df.iloc[:, :-1].values, df.iloc[:, -1])
fi = rf.feature_importances_
fi = pd.DataFrame(fi).rename({0: 'var'}, axis=1)
fi['c'] = pd.cut(fi['var'], 10)
fi['c'] = np.where(fi['c'] == pd.cut(fi['var'], 10).value_counts().index[0], 0, 1)
pd.cut(fi['var'], 10).value_counts()
plt.figure(figsize=(25,5))
sns.scatterplot(x=range(fi.shape[0]), y='var', data=fi, hue='c', size='var')
#plt.savefig('feature_importance.png')
fi.index = df.columns[:-1]
fd_final = fd.loc[allsamples.index]
sns.scatterplot(x='rt', y='mz', data=fd_final)
fd_final = fd_final.join(fi, on=fi.index)
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_final[fd_final['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=fd_final[fd_final['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_final[fd_final['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=fd_final[fd_final['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
plt.savefig('scatter_rf.png')
sns.kdeplot(fi['var'], shade=True)

Isotopes, Adducts and Neutral loss

Find isotopes

mzs = fd_final['mz'].values
mat = pd.DataFrame(mzs[:, None] - mzs)
def iso(x):
    # walk the pairwise m/z differences below the diagonal (nearest first) and flag
    # near-integer multiples of the 13C-12C mass difference (1.0034 Da)
    x = x.values[:x.name][::-1]
    l = len(x)
    res = {}
    Da = 1.0034
    
    for i, x0 in enumerate(x):
        i = l-i-1
        if x0 > 5.1:
            # differences only grow from here; stop the scan at ~5 Da
            break
        n = round(x0)
        sub = np.abs(Da*n - x0)
        if sub < 0.01 and round(x0)>0:
            res[i] = int(round(x0))
            
    if res:
        return res
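The core test in iso() is whether a difference sits within 0.01 Da of an integer multiple of 1.0034 Da; a few hypothetical differences:

for diff in (1.0033, 2.0071, 1.05):
    n = round(diff)
    print(diff, abs(1.0034 * n - diff) < 0.01)  # True, True, False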
def match_rt(x):
    # isotope candidates must also co-elute: keep only pairs within 0.5 s of RT
    d = x[0]    
    for x0 in list(d):
        sub = fd_final.iloc[x0, 4] - fd_final.iloc[x.name, 4]
        if np.abs(sub) > .5:
            d.pop(x0)
    if d:
        return d
iso_res = pd.DataFrame(mat.apply(iso, axis=1)).dropna()

iso_res = iso_res.apply(match_rt, axis=1).dropna()

iso_res.shape
def flatten(l):
    res = []
    for l0 in l:
        if isinstance(l0, list):
            res.extend(flatten(l0))
        else:
            res.append(l0)
            
    return res
def info(index, msg=True):
    # report an isotope group: the parent feature plus its matched isotopes
    # (first version; superseded below by one that also returns the mass-defect residual)
    vice_list = flatten([list(item) for item in iso_res.values])
    if index in iso_res.index:
        pass
    elif index in vice_list:
        index = iso_res[pd.notna(iso_res.str.get(index))].index.values[0]
        print('Index in vice list')
    else:
        raise IndexError('No isotopes found for index: %s' % index)

    indices = [index]
    keys = list(iso_res.loc[index])
    indices.extend(keys)
    #F_index = fd_final.iloc[index, 0]
    
    for key in keys:
        #host = fd_final.iloc[key, 0]
        var = mat.iloc[index, key]
        var = var - round(var) * 0.0034
        if msg:
            print('ParentIndex->{0}, HostIndex->{1}, Sub<{2:.5f}>'.format(index, indices[1:], var))
            #print('ParentIndex->{0}, HostIndex->{1}, Sub<{2:.5f}>'.format(F_index, host, var))
    res = fd_final.iloc[indices, [1, 4, 11]]
    ints = pd.DataFrame(QCs_nor.iloc[res.index].mean(1)).rename({0:'Intensities in QCs'}, axis=1)
    return res.join(ints)
def info(index, msg=True):
    # second version: also collect and return the mass-defect residual of each pair
    vice_list = flatten([list(item) for item in iso_res.values])
    if index in iso_res.index:
        pass
    elif index in vice_list:
        index = iso_res[pd.notna(iso_res.str.get(index))].index.values[0]
        print('Index in vice list')
    else:
        raise IndexError('No isotopes found for index: %s' % index)

    indices = [index]
    keys = list(iso_res.loc[index])
    indices.extend(keys)
    #F_index = fd_final.iloc[index, 0]
    vals = []
    for key in keys:
        #host = fd_final.iloc[key, 0]
        var = mat.iloc[index, key]
        var = var - round(var) * 0.0034
        vals.append(var)
        if msg:
            print('ParentIndex->{0}, HostIndex->{1}, Sub<{2:.5f}>'.format(index, indices[1:], var))
            #print('ParentIndex->{0}, HostIndex->{1}, Sub<{2:.5f}>'.format(F_index, host, var))
    res = fd_final.iloc[indices, [1, 4, 11]]
    ints = pd.DataFrame(QCs_nor.iloc[res.index].mean(1)).rename({0:'Intensities in QCs'}, axis=1)
    return res.join(ints), vals[0]
info(1300)
def make_iso_table(data):
    res = []
    for i, d in data.iteritems():
        tb = info(i, msg=False)[0]  # info() returns (table, sub); keep the table
        
        tb['fd_index'] = tb.index
        i = [i]
        i.extend(list(d))
        tb['iso_index'] = i
        tb['group'] = i[0]
        res.append(tb)
    return pd.concat(res).drop_duplicates()
iso_table = make_iso_table(iso_res)
l = flatten([list(item) for item in iso_res.values])

iso_res.index & np.array(l)
iso_res.index
plt.figure(figsize=(12, 8))
for i in iso_res.index:
    if np.random.random() > .65:
        r = info(i, msg=False)[0]
        sns.scatterplot('rt', 'mz', data=r, s=100, alpha=.7)
plt.figure(figsize=(12, 8))
for i in iso_res.index:
    if np.random.random() > .65:
        r = info(i, msg=False)[0]
        sns.scatterplot('rt', 'mz', data=r, s=100, alpha=.7)
plt.savefig('isotope_pairs.png')
info(37)
def get_sub(x):
    # take the mass-defect residual (second element of info()'s return) for the group parent
    return info(int(x['group']), msg=False)[1]
iso_table['sub'] = iso_table.apply(get_sub, axis=1)
iso_table['sub_normalized'] = iso_table['sub'] / np.round(iso_table['sub'])
iso_table['sub_group'] = pd.cut(iso_table['sub_normalized'], bins=(0.991, 0.999, 1.001, 1.010), labels=('<0.999', '0.999-1.001', '>1.001'))
plt.figure(figsize=(20,5))
#plt.scatter(iso_table['rt'], iso_table['sub_normalized'], marker='s', facecolor='w', edgecolor='r', linewidth=1, alpha=.7)
sns.scatterplot(x='rt', y='sub_normalized', data=iso_table, hue='sub_group', s=80)
plt.savefig('isotope_scatter.png')

Adducts

def deco(fun):
    def wrapper(*args, **kwargs):
        s = time.time()
        results = fun(*args, **kwargs)
        print(time.time()-s)
        return results
    return wrapper

def process(data, fun, axis, **kwargs):
    return data.apply(fun, axis, **kwargs)

def best_number_of_processes(data):
    import multiprocessing as mp
    
    CPUs = mp.cpu_count()
    rows = data.shape[0]
    if rows < 10:
        return 2
    elif rows < 100:
        return 4
    else:
        return CPUs
@deco
def multiprocessing_apply(data, fun, processes=None, axis=1, **kwargs):
    from multiprocessing import Pool
    from functools import partial
    
    if not processes:
        processes = best_number_of_processes(data)
        
    with Pool(processes=processes) as pool:
        if data.isnull().sum().sum():
            print("There are NA")
        
        data = np.array_split(data, processes)
        data = pool.map(partial(process, fun=fun, axis=axis, **kwargs), data)
        
        return pd.concat(data)
adducts = pd.read_html('adducts.html')[0]

adducts = adducts.dropna().drop([0, 3]).drop([3,5,6], axis=1)

pos_adds = adducts.loc[:39]
neg_adds = adducts.loc[40:]

pos_adds.shape, neg_adds.shape
rts = fd_final['rt'].values
def find_close_rt(x, rt):
    # keep the row only if it elutes within 0.5 s of the query RT
    if np.abs(x['rt'] - rt) < .5:
        return x

def partial_find_close_rt(rt):
    return fd_final.apply(find_close_rt, rt=rt, axis=1, result_type='broadcast').dropna()

@deco
def multi_find_close_rt(rts):
    with Pool(processes=12) as pool:
        res = pool.map(partial_find_close_rt, rts)
        res = list(res)
    return res
res_after_find_rt = multi_find_close_rt(rts)

sum([len(r) for r in res_after_find_rt])
def parse_mz(formula, mz):
    # formulas look like 'M+...', '2M+...' or 'M/2+...';
    # recover the neutral mass M from the observed m/z
    left, right = formula.split('+')
    if '/' in left:
        # 'M/n': multiply the charge back in
        left = int(left.split('/')[1])
        
    elif not left.startswith('M'):
        # 'nM': divide the multimer out
        left = 1 / int(left[0])
    else:
        left = 1
    
    right = float(right)
    return (mz-right) * left

def multi_parse_mz(formulas, strucs, mz):
    return {struc: parse_mz(formula, mz) for struc, formula in zip(strucs, formulas)}
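Two hypothetical checks of the formula parsing (the real formula strings come from adducts.html): [M+H]+ subtracts a proton, while [2M+H]+ also divides the dimer out.

parse_mz('M+1.007276', 151.0)   # -> ~149.99: neutral mass behind [M+H]+
parse_mz('2M+1.007276', 301.0)  # -> ~150.00: (mz - 1.007276) / 2 for [2M+H]+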
def comb(a, length=2):
    # all subsets of a with the given length, via bitmask enumeration
    N = 2**len(a)
    res = []
    for i in range(N):
        pivot = bin(i)[2:][::-1]
        re = []
        for j in range(len(pivot)):
            if int(pivot[j]):
                re.append(a[j])
        if len(re) == length:
            res.append(re)
        
    return res
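Worked example: comb() enumerates the unordered pairs of an index set.

comb([10, 20, 30])  # -> [[10, 20], [10, 30], [20, 30]]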

#@deco
def find_parent_ion_by_adducts(df, ESI_mode='pos'):
    if ESI_mode == 'pos':
        table = pos_adds
    elif ESI_mode == 'neg':
        table = neg_adds
    else:
        print('ESI_mode param can just be either pos or neg!')
        return -1
    
    ms = {}
    if df.shape[0] == 1:
        return None
    
    mzs = df['mz'].values
    indices = df.index
    formulas = table[1]
    strucs = table[0]
    res = pd.DataFrame([multi_parse_mz(formulas, strucs, mz) for mz in mzs]).T
    res = res.rename({i:index for i, index in zip(range(len(indices)), indices)}, axis=1)
    
    c = comb(indices)
    for x1, x2 in c:
        #print(x1, x2)
        m = res[x1].values[:, None] - res[x2].values
        m = pd.DataFrame(m)
        m = m.apply(find_adducts, axis=1).dropna().values
        
        if len(m):
            ms['%s,%s' % (x1, x2)] = m

    if ms:
        return ms, res
    else:
        return None
def find_adducts(x):
    # collect the (row, column) pairs whose neutral-mass difference is within 0.01 Da
    l = len(x)
    res = np.array([-1, -1])
    
    for i, x0 in enumerate(x):
        pivot = np.array([x.name, i])
        if np.abs(x0) < 0.01:
            res = np.vstack([res, pivot])
          
    if res.size > 2:
        return res[1:, :]
@deco
def multi_find_adducts(data):
    with Pool(processes=12) as pool:
        res = pool.map(find_parent_ion_by_adducts, data)
    return res
res_after_find_adducts = multi_find_adducts(res_after_find_rt)
from collections import defaultdict

def make_adducts_table(data):
    if data is None:
        return None
    
    dic, table = data
    raw_table = fd_final.loc[table.columns]
    res = defaultdict(list)
    for key, vals in dic.items():
        l, r = [int(i) for i in key.split(',')]
        for val in vals:
            val = np.ravel(val)
            l_strucs = table.index[val[0]]
            r_strucs = table.index[val[1]]
            
            l_mz = table.loc[l_strucs, l]
            r_mz = table.loc[r_strucs, r]
            res['l_index'].append(l)
            res['r_index'].append(r)
            res['l_originM/Z'].append(raw_table.loc[l, 'mz'])
            res['r_originM/Z'].append(raw_table.loc[r, 'mz'])
            res['l_struc'].append(l_strucs)
            res['r_struc'].append(r_strucs)
            res['l_M/Z'].append(l_mz)
            res['r_M/Z'].append(r_mz)
            res['M/Z_differ'].append(np.abs(l_mz-r_mz))
            res['rt_differ'].append(np.abs(raw_table.loc[l, 'rt'] - raw_table.loc[r, 'rt']))
    return pd.DataFrame(res)

@deco
def multi_make_adducts_table(data):
    with Pool(processes=6) as pool:
        res = pool.map(make_adducts_table, data)
        res = pd.concat(list(res))
        res = res.drop_duplicates()
        res.index = np.arange(res.shape[0])
    return res
adducts_table = multi_make_adducts_table(res_after_find_adducts)
adducts_table[['M/Z_differ', 'rt_differ']].corr()
adducts_table.head()
plt.figure(figsize=(12, 8))
sns.kdeplot(adducts_table['rt_differ'], adducts_table['M/Z_differ'], shade=True)
plt.figure(figsize=(12, 8))
sns.kdeplot(adducts_table['rt_differ'], adducts_table['M/Z_differ'], shade=True, shade_lowest=False)
sns.scatterplot(adducts_table['rt_differ'], adducts_table['M/Z_differ'], color='steelblue', size=25, alpha=.5)
sns.kdeplot(adducts_table['M/Z_differ'], shade=True)
at = adducts_table[adducts_table['M/Z_differ'] < 0.005]
plt.figure(figsize=(12, 8))
sns.kdeplot(at['rt_differ'], at['M/Z_differ'], shade=True)
plt.figure(figsize=(12, 8))
sns.kdeplot(at['rt_differ'], at['M/Z_differ'], shade=True, shade_lowest=False)
sns.scatterplot(at['rt_differ'], at['M/Z_differ'], color='w', edgecolor='steelblue', size=30, legend=False)

How many pairs in total:

(at['l_index'].astype('str').str[:] + at['r_index'].astype('str').str[:]).unique().size
(adducts_table['l_index'].astype('str').str[:] + adducts_table['r_index'].astype('str').str[:]).unique().size
adducts_table.shape
at.to_csv('adducts.csv', index=None)

Neutral Loss

nloss = \
'''1 hydrogen H2 2.0157
2 water H2O 18.0106
3 methane CH4 16.0313
4 ethene C2H4 28.0313
5 ethine C2H2 26.0157
6 butene C4H8 56.0626
7 pentene C5H8 68.0626
8 benzene C6H6 78.0470
9 formaldehyde CH2O 30.0106
10 carbon monoxide CO 27.9949
11 formic acid CH2O2 46.0055
12 carbon dioxide CO2 43.9898
13 acetic acid C2H4O2 60.0211
14 ketene C2H2O 42.0106
15 propionic acid C3H6O2 74.0368
16 malonic acid C3H4O4 104.0110
17 malonic anhydride C3H2O3 86.0004
18 pentose equivalent C5H8O4 132.0423
19 deoxyhexose equivalent C6H10O4 146.0579
20 hexose equivalent C6H10O5 162.0528
21 hexuronic acid equivalent C6H8O6 176.0321 
22 ammonia NH3 17.0266
23 methylamine CH5N 31.0422
24 methylimine CH3N 29.0266
25 trimethylamine C3H9N 59.0735
26 cyanic acid CHNO 43.0058
27 urea CH4N2O 60.0324
28 phosphonic acid H3PO3 81.9820
29 phosphoric acid H3PO4 97.9769
30 metaphosphoric acid HPO3 79.9663
31 dihydrogen vinyl phosphate C2H5O4P 123.9926
32 hydrogen sulfide H2S 33.9877
33 sulfur S 31.9721
34 sulfur dioxide SO2 63.9619
35 sulfur trioxide SO3 79.9568
36 sulfuric acid H2SO4 97.9674 '''
import re
compound, structure, mass_weight = pd.DataFrame(nloss.split('\n'))[0].str.strip().str.findall(r'[\d]* (.*) (.*) (.*)').str[0].str
neutral_loss = pd.DataFrame([compound, structure, mass_weight]).T
neutral_loss.columns = ['compound', 'structure', 'mass weight']
neutral_loss['mass weight'] = neutral_loss['mass weight'].astype(float)
neutral_loss.head()
def find_neutral_loss(s):
    # return the first table entry whose loss matches the gap within 0.005 Da
    for i, val in s.iteritems():
        val = np.abs(val)
        if val < 0.005:
            return True, i, val
    return None, None, None
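Sanity check with a hypothetical m/z pair 18.0106 Da apart: the water row of the table (index 1) is matched within tolerance.

gap = 250.1200 - 232.1094  # hypothetical r_mz - l_mz
find_neutral_loss(gap - neutral_loss['mass weight'])  # -> (True, 1, ~0.0)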
def find_parent_ion_by_neutral_loss(data):
    mzs = data['mz'].values
    indices = data.index
    d = {i:mz for i, mz in zip(indices, mzs)}
    res = []
    combs = comb(indices)
    for c in combs:
        # l is always smaller than r
        l, r = c
        temp = d[r] - (d[l] + neutral_loss['mass weight'])
        #return temp
        pivot, struc_i, sub = find_neutral_loss(temp)
        if pivot:
            df = pd.DataFrame([l, r, d[l], d[r], 
                                  neutral_loss['structure'][struc_i], 
                                  neutral_loss['mass weight'][struc_i],
                                  sub, np.abs(data.loc[l, 'rt']- data.loc[r, 'rt'])
                              ]).T
            df.columns = ['l_index', 'r_index', 'l_mz', 'r_mz', 
                          'nl_struc', 'mass weight', 'mz_differ', 'rt_differ']
            res.append(df)
    if res:
        return pd.concat(res)
    else:
        return None
def multi_find_neutral_loss(data):
    with Pool(processes=12) as pool:
        res = pool.map(find_parent_ion_by_neutral_loss, data)
        return pd.concat(res).drop_duplicates()
res_after_find_neutral_loss = multi_find_neutral_loss(res_after_find_rt)
res_after_find_neutral_loss.head()
plt.figure(figsize=(10, 6))
sns.kdeplot(res_after_find_neutral_loss['rt_differ'], res_after_find_neutral_loss['mz_differ'], shade=True, shade_lowest=False)
sns.scatterplot('rt_differ', 'mz_differ', data=res_after_find_neutral_loss)
plt.figure(figsize=(10, 6))
sns.kdeplot(res_after_find_neutral_loss['rt_differ'], res_after_find_neutral_loss['mz_differ'], shade=True, shade_lowest=False)
sns.scatterplot('rt_differ', 'mz_differ', data=res_after_find_neutral_loss)
plt.figure(figsize=(10, 6))
sns.kdeplot(res_after_find_neutral_loss['rt_differ'], res_after_find_neutral_loss['mz_differ'], shade=True, shade_lowest=False, cmap='Blues')
sns.scatterplot('rt_differ', 'mz_differ', data=res_after_find_neutral_loss)
xlim = plt.gca().get_xlim()
ylim = plt.gca().get_ylim()
plt.hlines(0.001, -.5, 1.5, colors='navy', linestyles='dashed', alpha=.8)
plt.vlines(0.5, -1, 1, colors='goldenrod', linestyles='dashed', alpha=.8)
plt.xlim(xlim)
plt.ylim(ylim)
plt.text(-.1, 0.0012, '↓0.001↓', ha='center', va='center', size=20, color='navy')
plt.text(.6, 0.0012, '←0.5←', ha='center', va='center', size=20, color='goldenrod')
nl = res_after_find_neutral_loss[(res_after_find_neutral_loss['mz_differ'] < 0.001) & (res_after_find_neutral_loss['rt_differ'] < .5)]
nl['nl_struc'].value_counts()
nl.to_csv('neutral_loss.csv', index=None)

Aggregation

res_after_find_neutral_loss['nl_struc'].value_counts()
adducts_table['l_struc'].value_counts()[:5]
adducts_table['r_struc'].value_counts()[:5]
at['l_struc'].value_counts()[:5]
at['r_struc'].value_counts()[:5]
a = dict(at['l_struc'].value_counts())
b = dict(at['r_struc'].value_counts())
for c in list(b):
    if c in a:
        a[c] += b[c]
    else:
        a[c] = b[c]
pd.Series(a)
a = adducts_table[(adducts_table['l_struc'].str.contains('ACN')) | (adducts_table['r_struc'].str.contains('ACN'))]
np.intersect1d((a['l_index'].values),(a['r_index'].values))
adducts_table[adducts_table['l_index'] == 872]

All indices involved in isotopes, adducts or neutral losses

iso_is = set(iso_table['fd_index'])
len(iso_is)
adducts_is = set(adducts_table['l_index'].values)
adducts_is.update(adducts_table['r_index'].values)
len(adducts_is)
nl_is = set(res_after_find_neutral_loss['l_index'].values)
nl_is.update(res_after_find_neutral_loss['r_index'])
len(nl_is)
from matplotlib_venn import venn3
plt.figure(figsize=(10, 6))
venn3([iso_is, adducts_is, nl_is], ('Isotope', 'Adduct', 'Neutral loss'))
all_is = set.union(iso_is, adducts_is, nl_is)
single_fd = fd_final.copy().loc[(fd_final.index ^ pd.Index(all_is))]
into_agg = into.copy().loc[fd_final.index]
fd_need_agg = fd_final.copy().loc[pd.Index(all_is)]
fd_need_agg['isoadnl'] = 0
fd_need_agg['drop'] = 1
into_agg.shape, fd_need_agg.shape
def agg_iso(iso_table, fd_table, into_table, how='sum'):
    fd_need_agg = fd_table.copy()
    into_agg = into_table.copy()
    cols = into_agg.columns
    if not (how == 'sum' or how == 'drop'):
        raise ValueError('how must be either "sum" or "drop"')
    prime_list = []
    for group in iso_table['group'].unique():
        temp = iso_table[iso_table['group']==group].copy()
        prime_list.append(temp.iloc[-1, 4])
    
    for group in iso_table['group'].unique():
        temp = iso_table[iso_table['group']==group].copy()
        if temp.iloc[0, 0] < temp.iloc[-1,0]:
            raise IndexError('invalid order in group:%s' % group)
        prime = temp.index[-1]
        free = temp.index[:-1]
        
        total = temp.index
        
        pivot = False
        for f in free:
            if not f in prime_list:
                fd_need_agg.loc[f, 'drop'] = np.nan
                pivot = True
        if pivot:
            fd_need_agg.loc[prime, 'isoadnl'] = 1
        
        if how=='sum':
            for f in free:
                if not f in prime_list:
                    into_agg.loc[prime, cols[1:]] = into_agg.loc[prime, cols[1:]] + into_agg.loc[f, cols[1:]]
    fd_need_agg = fd_need_agg.dropna()
    into_agg = into_agg.loc[fd_need_agg.index]
    return fd_need_agg, into_agg
def agg_iso(iso_table, fd_table, into_table, how='sum', conservative=True):
    fd_need_agg = fd_table.copy()
    into_agg = into_table.copy()
    cols = into_agg.columns
    if not (how == 'sum' or how == 'drop'):
        raise ValueError('how must be either "sum" or "drop"')
    prime_list = []
    for group in iso_table['group'].unique():
        temp = iso_table[iso_table['group']==group].copy()
        prime_list.append(temp.iloc[-1, 4])
    
    for group in iso_table['group'].unique():
        temp = iso_table[iso_table['group']==group].copy()
        if temp.iloc[0, 0] < temp.iloc[-1,0]:
            raise IndexError('invalid order in group:%s' % group)
        prime = temp.index[-1]
        free = temp.index[:-1]
        
        total = temp.index
        
        pivot = False
        
        val = fd_need_agg.loc[prime, 'var']
        for f in free:
            if conservative and fd_need_agg.loc[f, 'var'] > val:
                break
            if not f in prime_list:
                fd_need_agg.loc[f, 'drop'] = np.nan
                pivot = True
            if how=='sum':
                if not f in prime_list:
                    into_agg.loc[prime, cols[1:]] = into_agg.loc[prime, cols[1:]] + into_agg.loc[f, cols[1:]]
        if pivot:
            fd_need_agg.loc[prime, 'isoadnl'] = 1
        
    fd_need_agg = fd_need_agg.dropna()
    into_agg = into_agg.loc[fd_need_agg.index]
    return fd_need_agg, into_agg
fd_iso, into_iso = agg_iso(iso_table, fd_need_agg, into_agg, conservative=True)
fd_iso['isoadnl'].value_counts()
fd_iso.shape
def agg_ad(fd_iso, into_iso, adducts_table):
    fd_ad = fd_iso.copy()
    into_ad = into_iso.copy()
    cols = into_ad.columns
    indices = fd_ad.index
    # note: pairs are taken from the filtered table `at`, not from the adducts_table argument
    pairs = at[['l_index', 'r_index']].drop_duplicates().values
    fd_ad['pivot'] = 0
    for pair in pairs:
        if pair[0] in indices and pair[1] in indices:
            temp = fd_ad.loc[pair]
            fd_ad.loc[temp.index[-1], 'drop'] = np.nan
            into_ad.loc[temp.index[0], cols[1:]] += into_ad.loc[temp.index[-1], cols[1:]]
            fd_ad.loc[temp.index[0], 'pivot'] = 2
            
    fd_ad['isoadnl'] = fd_ad['isoadnl'] + fd_ad['pivot']
    fd_ad = fd_ad.drop('pivot', axis=1)
    fd_ad = fd_ad.dropna()
    into_ad = into_ad.loc[fd_ad.index]

    return fd_ad, into_ad
fd_ad, into_ad = agg_ad(fd_iso, into_iso, adducts_table)
fd_ad.shape, into_ad.shape
fd_ad['isoadnl'].value_counts()
def agg_nl(fd_ad, into_ad, res_after_find_neutral_loss):
    """Fuse neutral-loss pairs: keep the left feature, absorb the right one."""
    fd_nl = fd_ad.copy()
    into_nl = into_ad.copy()
    cols = into_nl.columns
    indices = fd_nl.index
    fd_nl['pivot'] = 0
    pairs = res_after_find_neutral_loss[['l_index', 'r_index']].values
    for pair in pairs:
        if pair[0] in indices and pair[1] in indices:
            temp = fd_nl.loc[pair]
            fd_nl.loc[temp.index[-1], 'drop'] = np.nan   # mark the right feature for removal
            into_nl.loc[temp.index[0], cols[1:]] += into_nl.loc[temp.index[-1], cols[1:]]
            fd_nl.loc[temp.index[0], 'pivot'] = 4        # neutral-loss flag
    fd_nl['isoadnl'] = fd_nl['isoadnl'] + fd_nl['pivot']
    fd_nl = fd_nl.drop('pivot', axis=1)
    fd_nl = fd_nl.dropna()
    into_nl = into_nl.loc[fd_nl.index]
    return fd_nl, into_nl
fd_nl, into_nl = agg_nl(fd_ad, into_ad, res_after_find_neutral_loss)
fd_nl.shape, into_nl.shape
fd_nl['isoadnl'].value_counts()
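The `isoadnl` column is effectively a bit flag: 1 marks an isotope fusion, 2 an adduct fusion, and 4 a neutral-loss fusion, so a count for, say, 3 corresponds to features that absorbed both isotopes and an adduct. A minimal decoding sketch (the helper name is ours, not part of the pipeline):

def decode_isoadnl(flag):
    # map each set bit back to the fusion type it encodes
    names = {1: 'isotope', 2: 'adduct', 4: 'neutral loss'}
    return [v for k, v in names.items() if int(flag) & k] or ['none']

decode_isoadnl(3)  # -> ['isotope', 'adduct']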
fd_nl = fd_nl.drop('drop', axis=1)
single_fd['isoadnl'] = 0
fd_after_agg = pd.concat([single_fd, fd_nl])
as_after_agg = allsamples.loc[fd_after_agg.index]
fagg = as_after_agg.T.values

fagg = fagg / (fagg.sum(1) / fagg.sum(1).max())[:, np.newaxis]  # scale each sample's total intensity to the largest one
fagg = StandardScaler().fit_transform(fagg)                     # then autoscale every feature
pca_mat = draw_pca(fagg)
draw_trend(pca_mat)

Variable selection

VIPs = pd.read_csv('vip_from_pls_da.csv', index_col=0)

vip_index = VIPs[(VIPs['comp 1'] > 1)&(VIPs['comp 2'] > 1)].index
vip_index = set(vip_index)
volcano = pd.DataFrame([pvalues_fdr_bh, np.abs(times)]).T
volcano.index = a.columns
volcano.columns = ['p_val', 'fold_change']
t_index = set(volcano[(volcano['p_val'] < .05)&(volcano['fold_change'] > 2)].index)
rf_index = set(fi[fi['c'] == 1].index.astype(float))
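Each selector yields a set of feature indices, so the variables supported by all three methods are just the set intersection (the `consensus` name here is ours):

consensus = vip_index & t_index & rf_index
len(vip_index), len(t_index), len(rf_index), len(consensus)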
from matplotlib_venn import venn3, venn3_circles
plt.figure(figsize=(10,10))
cs = ('lightskyblue', 'darkred', 'forestgreen')
v = venn3((vip_index, t_index, rf_index), ('VIP', 'Volcano\nplot', 'RandomForest'), cs)
c = venn3_circles((vip_index, t_index, rf_index), linestyle='solid', linewidth=2, color='grey')
for p in v.subset_labels:
    if p:
        plt.setp(p, fontsize=18)
for i, p in enumerate(v.set_labels):
    if p:
        plt.setp(p, fontsize=18, color=cs[i], fontweight='bold', fontstyle='italic', ha='center')
plt.gca().set_facecolor('seashell')
plt.gca().set_axis_on()
plt.savefig('venn3.png')
pca_mat = draw_pca(f)
plt.close()
total_ints = allsamples.T
total_ints.index = names
total_ints = total_ints.loc[~total_ints.index.str.startswith('QC-eq')]
total_ints = pd.DataFrame(total_ints.sum(1))
import scipy.stats as stats
def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
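A quick sanity check of the helper on toy data; with the default confidence it returns the mean plus the bounds of a 95% t-interval:

mean_confidence_interval([9.8, 10.1, 10.0, 10.3, 9.9])
# -> approximately (10.02, 9.78, 10.26)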
total_data = total_ints.pivot_table(0, index=total_ints.index, aggfunc=('mean', 'std', 'min', 'max', mean_confidence_interval))['mean_confidence_interval']
total_data = pd.DataFrame(total_data)
total_data['mean'] = total_data['mean_confidence_interval'].str[0]
total_data['down'] = total_data['mean_confidence_interval'].str[1]
total_data['up'] = total_data['mean_confidence_interval'].str[2]
total_data = total_data.drop('mean_confidence_interval', axis=1)
total_data
total_ints['cate'] = total_ints.index
total_ints = total_ints.rename({0:'int'}, axis=1)
groups = ['a', 'b', 'c', 'd', 'e', 'c']
g = sns.catplot(x='cate', y='int', data=total_ints, kind='bar',
           palette='muted')
fig = g.fig
fig.set_size_inches(12, 6)
g.despine(left=True)
axes = fig.axes[0]
axes.set_xticklabels(['Sample4', 'Sample1', 'QC', 'Sample5', 'Sample3', 'Sample2'], fontsize=15)
axes.set_xlabel('Class', fontsize=18)
axes.set_ylabel('Total Intensities', fontsize=18)
for i, label in enumerate(groups):
    plt.text(i, total_data.loc[['Sa-4', 'Sa-1', 'QC-nor', 'Sa-5', 'Sa-3', 'Sa-2']].iloc[i, 0]+5e9, label,
             ha='center', va='center', fontsize=15, fontweight='bold')
plt.savefig('total_intensities.png')
fd_after_agg.shape
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_after_agg[fd_after_agg['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=fd_after_agg[fd_after_agg['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
#plt.savefig('scatter_rf_agg.png')
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_after_agg[fd_after_agg['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=fd_after_agg[fd_after_agg['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
plt.savefig('scatter_rf_agg.png')
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_need_agg[fd_need_agg['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=fd_need_agg[fd_need_agg['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
#plt.savefig('scatter_rf_agg.png')
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=single_fd[single_fd['c']==0], alpha=.2, )
sns.scatterplot(x='rt', y='mz', data=single_fd[single_fd['c']==1], size='var')
plt.xlim(0, 1000)
plt.ylim(50, 1000)
#plt.savefig('scatter_rf_agg.png')
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=fd_final[fd_final['c']==0], alpha=.2, )

sns.scatterplot(x='rt', y='mz', data=single_fd[single_fd['c']==1], size='var', legend=False)
sns.scatterplot(x='rt', y='mz', data=fd_need_agg[fd_need_agg['c']==1], size='var', legend=False)
temp_index = fd_need_agg.index.symmetric_difference(fd_nl.index)  # features present before fusion but dropped afterwards
temp_fd = fd_final.loc[temp_index]
sns.scatterplot(x='rt', y='mz', data=temp_fd[temp_fd['c']==1], size='var', legend=False)
ax=plt.gca()
l = ['Not important variables', 'Monoions', 'Fused ions', 'Ions dropped after fusion']
for i, p in enumerate(ax.get_children()[:4]):
    color = p.get_facecolors()
    plt.text(750, 900-i*50, l[i], fontsize=15,
            ha='left', va='center')
    plt.scatter(700, 900-i*50, c=color, s=100)
    
plt.xlim(0, 1000)
plt.ylim(50, 1000)
plt.savefig('scatter_rf_all.png')
fd_need_agg.shape, fd_nl.shape
def find(rt=None, mz=None, thld=1):
    # look up features in fd_final lying within thld of the given rt and/or mz
    fd = fd_final.iloc[:, [1, 4, -4, -2, -1]].copy()
    if mz is not None:
        fd = fd[(fd['mz'] > mz - thld/2) & (fd['mz'] < mz + thld/2)]
    if rt is not None:
        fd = fd[(fd['rt'] > rt - thld/2) & (fd['rt'] < rt + thld/2)]
    return fd
QCs_nor.loc[[2187, 2407]].mean(1)
i = find(644, thld=1)
i
iso_table.loc[i.index.intersection(iso_table.index)]
adducts_table[np.isin(adducts_table['l_index'], i.index) | np.isin(adducts_table['r_index'], i.index)]
res_after_find_neutral_loss[np.isin(res_after_find_neutral_loss['l_index'], i.index) | np.isin(res_after_find_neutral_loss['r_index'], i.index)]
i
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12,6))
sns.scatterplot(x='rt', y='mz', data=i, hue='c', size='var')
plt.xlim(643.5, 645)
plt.ylim(150, 400)
#plt.vlines(i['rt'].mean(), 0,1000, linestyles='dashed', colors='coral', alpha=.5)
#plt.vlines(i['rt'].mean()-.5, 0,1000, linestyles='solid', colors='lightcoral')
#plt.vlines(i['rt'].mean()+.5, 0,1000, linestyles='solid', colors='lightcoral')
for idx, row in i.iterrows():
    plt.text(row['rt']+.02, row['mz'], idx, ha='left', va='center')
#plt.grid(which='both', alpha=.5)
pd.Series(np.concatenate([adducts_table['l_struc'].values, adducts_table['r_struc'].values])).value_counts()
res_after_find_neutral_loss['nl_struc'].value_counts()

More and more..

qc = QCs_nor.loc[pass_index].copy()
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = qc.median(1)
qc['std'] = np.std(qc.iloc[:, :7], 1, ddof=1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot('rank', 'std', data=qc)
LOESS = sm.nonparametric.lowess(qc['std'], qc['rank'])
ax.plot(LOESS[:, 0], LOESS[:, 1], color='r', linewidth=2, linestyle='solid')
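The row-wise `apply` used for the RSD is easy to read but slow on thousands of features; an equivalent vectorized form (same values up to floating-point error, the `rsd_fast` name is ours) would be:

qc_raw = QCs_nor.loc[pass_index]
rsd_fast = qc_raw.std(1, ddof=1) / qc_raw.mean(1)  # matches qc['RSD']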
qc = QCs_nor.loc[pass_index].copy()
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = qc.median(1)
qc['std'] = np.std(qc.iloc[:, :7], 1, ddof=1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, [ax, ax2] = plt.subplots(1, 2, figsize=(12,6))
sns.scatterplot('median', 'std', data=qc, ax=ax)
LOESS = sm.nonparametric.lowess(qc['std'], qc['median'])
ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
ax2.plot(LOESS[:, 0], LOESS[:, 1])
qc = QCs_nor.loc[pass_index].copy()
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = np.log(qc.median(1))
qc['std'] = np.std(qc.iloc[:, :7], 1, ddof=1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, [ax,ax2] = plt.subplots(1,2,figsize=(12,6))
sns.scatterplot('median', 'std', data=qc, ax=ax)
LOESS = sm.nonparametric.lowess(qc['std'], qc['median'])
ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
ax2.plot(LOESS[:,0], LOESS[:, 1])
plt.plot(LOESS[:,0], LOESS[:,1])
qc.head()
qc = QCs_nor.loc[pass_index].copy()
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = qc.median(1)
qc['std'] = np.std(qc.iloc[:, :7], 1, ddof=1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot('rank', 'RSD', data=qc)
#LOESS = sm.nonparametric.lowess(qc['std'], qc['rank'])
#ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
qc = np.log(QCs_nor.loc[pass_index].copy())
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = qc.median(1)
qc['std'] = np.std(qc.iloc[:, :7], 1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot('rank', 'std', data=qc)
LOESS = sm.nonparametric.lowess(qc['std'], qc['rank'])
ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
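`sm.nonparametric.lowess(endog, exog)` returns a two-column array of (x, fitted y) pairs already sorted by x, which is why its output can be fed straight to `ax.plot`. A toy illustration:

x = np.linspace(0, 10, 50)
y = np.sin(x) + np.random.normal(scale=0.2, size=50)
fit = sm.nonparametric.lowess(y, x)  # column 0: sorted x, column 1: smoothed y
plt.plot(fit[:, 0], fit[:, 1])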

Pseudo imitation

qc = QCs_nor.loc[pass_index].copy()
qc['RSD'] = qc.apply(lambda x: np.std(x, ddof=1)/np.mean(x), axis=1)
qc['median'] = qc.median(1)
qc['std'] = np.std(qc.iloc[:, :7], 1, ddof=1)
qc = qc.dropna(subset=['std'])
qc['rank'] = qc['median'].argsort().argsort()

fig, ax = plt.subplots(figsize=(12,6))
sns.scatterplot('rank', 'std', data=qc)
LOESS = sm.nonparametric.lowess(qc['RSD'], qc['rank'])
ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
qc = QCs_nor.loc[pass_index].copy()
tqc = qc['QC_3_3.mzXML'].copy()
for i in range(7):
    qc.iloc[:, i] = np.log2(qc.iloc[:, i] / tqc)

fig, ax = plt.subplots(figsize=(8,6))
ax.set_ylim(-2,2)
sns.boxplot('variable', 'value', data=qc.melt(value_vars=qc.columns[:7]), linewidth=1, ax=ax)
ax.tick_params('x', labelrotation=30)
qc['cv'] = pd.cut(qc['RSD'], bins=(0, 0.1, 0.15, 0.20, 0.25, 0.30, 100), labels=('<10%', '10~15%', '15~20%', '20~25%', '25~30%', '>30%'))
sns.countplot('cv', data=qc, palette='winter_r')
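Note that `pd.cut` uses right-closed bins by default, so a feature whose RSD is exactly 0.10 lands in the '<10%' bucket:

pd.cut([0.05, 0.10, 0.12], bins=(0, 0.10, 0.15), labels=('<10%', '10~15%'))
# 0.10 -> '<10%' because the intervals are (0, 0.10], (0.10, 0.15]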
ft = filters.iloc[:, 1:]
ft = ft.loc[allsamples.index]
combs = comb(ft.columns)
fig, axs = plt.subplots(8, 2, figsize=(18,40))
axs = axs.ravel()
for i, c in enumerate(combs):
    ax = axs[i]
    ax.set_facecolor('floralwhite')
    ftt = pd.DataFrame((ft[c[0]]) / (ft[c[1]])).rename({0:'fd'}, axis=1)
    ftt['fd'] = np.log2(ftt['fd'])
    ftt['rank'] = QCs_nor.mean(1).fillna(0).argsort().argsort()
    ftt['index'] = ftt.index
    ftt['median'] = np.log(QCs_nor.median(1))
    sns.scatterplot('median', 'fd', data=ftt, ax=ax)
    
    LOESS = sm.nonparametric.lowess(ftt['fd'], ftt['median'])
    ax.plot(LOESS[:, 0], LOESS[:, 1], color='k', linewidth=2, linestyle='dashed')
    ax.set_xticks([])
    ax.set_xlabel('')
    change = np.log2(float(c[0]) / float(c[1]))  # column names are the nominal dilution factors
    ax.set_ylim(change-2, change+2)
    ax.text(20, change+1, 'Theory: %s\n%s<>%s' % (round(change,2), c[0], c[1]), fontsize=18, color='r')
    ax.hlines(change, 0, 50, color='r')
    ax.set_xlim(ftt['median'].min()-1, ftt['median'].max()+1)
    print(c, 'DONE!', end='----')
plt.tight_layout()
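`comb` comes from earlier in the notebook; we assume it yields the pairwise combinations of the filter columns, i.e. the standard-library equivalent of:

from itertools import combinations
combs = list(combinations(ft.columns, 2))  # 15 pairs from 6 dilutions fill the 8x2 grid (last axis stays empty)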
a = allsamples.join(filters.iloc[:, 1:]).fillna(0)
a = into.iloc[:, 1:]

a = a.iloc[:, [11, 6, 10, 16, 8, 19, 18, 9, 
          31, 1, 22, 12, 
          34, 28, 23, 7, 
          33, 25, 35, 20, 
          3, 29, 30, 14, 
          4, 26, 32, 13, 
          0, 2, 21, 17, 
          24, 5, 27, 15]]
def deal_names2(cols):
    re = []
    for i, name in enumerate(cols):
        if i in [0, 1, 2]:
            re.append('QC-eq-%s' % str(i+1))
        elif i in [3, 4, 5, 6, 7]:
            re.append('QC-eq')
            
        elif name.startswith('Sa'):
            name = name[:10]
            re.append('Sa-%s' % name[-3])
            
        elif name.startswith('QC'):
            re.append('QC-nor')
        else:
            import re as r
            name = r.findall(r'.*_(\d*)', name)[0]
            name = list('{:0<3}'.format(name))
            name.insert(1, '.')
            name = ''.join(name)
            
            re.append('Fi%s' % name)
    
    return re
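# Worked example of the filter branch above (annotation added for clarity):
#   'Fiter_015.mzXML' -> digits '015' -> '{:0<3}' pad -> '015' -> insert '.' -> '0.15' -> 'Fi0.15'
#   'Filter_1.mzXML'  -> digits '1'   -> '{:0<3}' pad -> '100' -> insert '.' -> '1.00' -> 'Fi1.00'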

def NOR_total_ion(mat):
    # scale every sample to the median total ion current of the QC injections
    # (rows 11, 15, 19, ... 35 are, under the column reordering above, presumably the QC-nor samples)
    p = np.median(mat[[11, 15, 19, 23, 27, 31, 35], :].sum(1))
    return mat / ((mat.sum(1) / p)[:, np.newaxis])

def NOR_median(mat):
    # median fold-change normalization: reference profile = QC median spectrum,
    # per-sample factor = median ratio of the sample to that reference
    p = np.median(mat[[11, 15, 19, 23, 27, 31, 35], :], 0)
    p = np.median(mat / p, 1)
    return mat / p[:, np.newaxis]

def NOR_loess(mat):
    # placeholder: LOESS-based normalization was not implemented
    pass
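NOR_median is a median fold-change (PQN-style) scheme: divide every sample by a reference profile, take the median of those ratios as the sample's dilution factor, and divide it out. A toy run with invented values:

ref = np.array([1.0, 2.0, 4.0])      # reference profile, e.g. the QC median
sample = np.array([2.0, 4.0, 8.0])   # a sample measured at exactly twice the reference
factor = np.median(sample / ref)     # -> 2.0
sample / factor                      # -> array([1., 2., 4.])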
def draw_pca2(df, normalization_method='median', order=True):
    df = df.dropna(how='all').fillna(10)
    mat = df.T.values
    
    if normalization_method=='median':
        mat = NOR_median(mat)
    elif normalization_method=='total_ion':
        mat = NOR_total_ion(mat)
    elif normalization_method is None:
        pass
    else:
        print('invalid method string, skip normalization')
    #mat = ((mat - mat.mean(0)) / ((mat.std(0, ddof=1))**1))
    PCA_allsamples = PCA(n_components=5, whiten=False).fit_transform(mat)
    names = deal_names2(df.columns)
    colormap = {'QC-eq-1': 'brown', 
                'QC-eq-2': 'darkred', 
                'QC-eq-3': 'red', 
                'QC-eq': 'indianred', 
                'QC-nor': 'lightcoral', 
                'Sa-1': 'plum', 
                'Sa-2': 'skyblue', 
                'Sa-3': 'teal', 
                'Sa-4': 'forestgreen', 
                'Sa-5': 'peru', 
                'Fi0.10': '#E8E8E8',
                'Fi0.15': '#CFCFCF',
                'Fi0.30': '#B5B5B5',
                'Fi0.50': '#9C9C9C',
                'Fi0.75': '#4F4F4F',
                'Fi1.00': '#1C1C1C'
               
               }
    pca_mat = pd.DataFrame(PCA_allsamples)

    pca_mat['class'] = names
    pca_mat['cate'] = pca_mat['class'].str[:2]
    
    fig, axs = plt.subplots(1,2, figsize=(16,6))
    
    ax = axs[0]
    scatters = sns.scatterplot(0, 1, hue='class', style='cate', data=pca_mat, s=200, ax=ax, palette=colormap, alpha=.7)
    if order:
        for i in range(8, 36):
            if names[i] == 'QC-nor':
                continue
            ax.text(pca_mat.iloc[i, 0]+2, pca_mat.iloc[i, 1], i, fontsize=15, ha='left', va='center')
    ax.legend(fontsize=10, bbox_to_anchor=[1,1, .2, 0])
    ax.set_xlabel('Component 1', fontsize=18)
    ax.set_ylabel('Component 2', fontsize=18) 
    
    ax = axs[1]
    sns.scatterplot(0, 1, hue='class', style='cate', data=pca_mat[(pca_mat['cate']=='QC') | (pca_mat['cate'] == 'Fi')], s=200, ax=ax, palette=colormap)
    ax.legend(fontsize=10)
    ax.set_xlabel('Component 1', fontsize=18)
    ax.set_ylabel('Component 2', fontsize=18)
    plt.tight_layout()
    return pca_mat, mat
s, d = draw_pca2(a.loc[pass_index], normalization_method='median')
QCs_nor = QCs_nor.loc[pass_index]
m = QCs_nor.median(1)

fa = QCs_nor.div(m, axis=0).median().values  # per-sample median fold change against the feature medians

qcs = QCs_nor / fa
qcs = QCs_nor / (QCs_nor.mean() / QCs_nor.mean().max())
orsd = QCs_nor.apply(lambda x: np.std(x, ddof=1)/np.mean(x), 1)
nrsd = qcs.apply(lambda x: np.std(x, ddof=1) / np.mean(x), 1)
sns.kdeplot(orsd, shade=True, label='old')
sns.kdeplot(nrsd, shade=True, label='new')
plt.legend()
a = a.loc[pass_index]
a0 = np.log2(a.div(a.median(1), axis=0))
a1 = a / a.div(a.median(1), axis=0).median()
a1 = np.log2(a1.div(a1.median(1), axis=0))
plt.figure(figsize=(25,5))
sns.boxplot(x='variable', y='value', data=a0.melt(), linewidth=.5)
plt.ylim(-4, 4)
plt.gca().tick_params('x', rotation=30)
plt.figure(figsize=(25,5))
sns.boxplot(x='variable', y='value', data=a1.melt(), linewidth=.5)
plt.ylim(-4, 4)
plt.gca().tick_params('x', rotation=30)
a2 = pd.DataFrame(d).T
a2.index = a1.index
a2.columns = a1.columns
a2 = np.log2(a2.div(a2.median(1), axis=0))
plt.figure(figsize=(25,5))
sns.boxplot(x='variable', y='value', data=a2.melt(), linewidth=.5)
plt.ylim(-4, 4)
plt.gca().tick_params('x', rotation=30)
loess = sm.nonparametric.lowess