# ROI Extraction

In [1]:
import sys
sys.path.append('../codes')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

In [4]:
import numpy as np
import pandas as pd
import pylab as plt

from VMSfunctions.Common import *
from VMSfunctions.DataGenerator import *
from VMSfunctions.Chromatograms import *

### 1. Load Justin's beer data

In [5]:
# Container for the raw spectra: filled by ds.load_data() below and read later through ds.file_spectra.
ds = DataSource()

In [6]:
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs on the original
# author's machine; consider deriving it from a configurable data directory.
mzml_path = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Data\\Beers_4Beers_compared\\Positive\\samples\\mzML'
# Load the mzML files under mzml_path (the log output below shows one line per loaded file).
ds.load_data(mzml_path)

INFO:DataGenerator:Loading Beer_1_full1.mzML
INFO:DataGenerator:Loading Beer_1_full2.mzML
INFO:DataGenerator:Loading Beer_1_full3.mzML
INFO:DataGenerator:Loading Beer_2_full1.mzML
INFO:DataGenerator:Loading Beer_2_full2.mzML
INFO:DataGenerator:Loading Beer_2_full3.mzML
INFO:DataGenerator:Loading Beer_3_full1.mzML
INFO:DataGenerator:Loading Beer_3_full2.mzML
INFO:DataGenerator:Loading Beer_3_full3.mzML
INFO:DataGenerator:Loading Beer_QC_full1.mzML
INFO:DataGenerator:Loading Beer_QC_full2.mzML
INFO:DataGenerator:Loading Beer_QC_full3.mzML


In [7]:
# CSV holding the regions of interest (ROIs); it is read into a dataframe in the next cell.
# NOTE(review): absolute, machine-specific path -- same portability concern as mzml_path.
roi_file = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Data\\Beers_4Beers_compared\\rois.csv'
# ds.extract_roi(roi_file)

In [8]:
# One row per ROI; the cells below read the columns
# file, mode, pickedPeak, mzmin, mzmax, rtmin, rtmax, scmin and scmax.
df = pd.read_csv(roi_file)

In [9]:
# Distinct values of the 'file' column; used below as the keys of all_rois and of ds.file_spectra.
unique_filenames = df['file'].unique()

In [10]:
from tqdm import tqdm
# Registers progress_apply on pandas objects; it is used in the next cell.
tqdm.pandas()

In [11]:
# Serialise every dataframe row to a JSON string (axis=1 -> the lambda receives one row at a time).
# NOTE(review): slow -- the progress bar below reports over three minutes for 394970 rows, and in
# the cells visible here only res[0] is looked at afterwards.
res = df.progress_apply(lambda x: x.to_json(), axis=1)

100%|███████████████████████████████████████████████████████████████████| 394970/394970 [03:17<00:00, 2001.22it/s]


In [None]:
# Peek at the JSON string produced for the row labelled 0.
res[0]

### 2. Load ROI data exported from XCMS

In [None]:
# Build one ROI table per file.
# all_rois maps file name -> dict with the keys 'rois', 'mzmin', 'mzmax', 'rtmin', 'rtmax'.
# Every value is a numpy array with one entry per ROI of that file, in dataframe row order,
# so that position k in each bound array describes rois[k].
all_rois = {}
for filename in unique_filenames:
    print('Processing ROIs for %s' % filename)

    # Keep only the rows that belong to this file. Iterating over the whole dataframe here
    # would hand every file a copy of every ROI and repeat the full scan once per file.
    file_df = df[df['file'] == filename]
    n_rows = file_df.shape[0]

    # convert each row of the dataframe to roi objects
    rois = []
    for pos, (_, row) in enumerate(file_df.iterrows()):
        # progress is counted within the current file, not over the dataframe index
        if pos % 10000 == 0:
            print('%6d/%6d' % (pos, n_rows))
        roi = RegionOfInterest(row['file'], row['mode'], row['pickedPeak'],
                               (row['mzmin'], row['mzmax']),
                               (row['rtmin'], row['rtmax']),
                               (row['scmin'], row['scmax']))
        rois.append(roi)

    # The bound arrays come straight from the columns; np.array() copies them so the table
    # does not keep a reference into the dataframe.
    rois_data = {
        'rois': np.array(rois),
        'mzmin': np.array(file_df['mzmin']),
        'mzmax': np.array(file_df['mzmax']),
        'rtmin': np.array(file_df['rtmin']),
        'rtmax': np.array(file_df['rtmax'])
    }
    all_rois[filename] = rois_data

    print('%d ROIs added' % len(rois_data['rois']))

In [None]:
def get_containing_rois(p, rois_data):
    """Return the ROIs whose m/z and RT windows both contain the peak ``p``.

    Args:
        p: object with ``mz`` and ``rt`` attributes.
        rois_data: dict with the keys 'rois', 'mzmin', 'mzmax', 'rtmin' and 'rtmax';
            the four bound entries are compared element-wise against ``p`` and the
            resulting positions are used to index ``rois_data['rois']``.

    Returns:
        The entries of ``rois_data['rois']`` at the matching positions.
        All four bounds are inclusive.
    """
    mz_inside = (rois_data['mzmin'] <= p.mz) & (p.mz <= rois_data['mzmax'])
    rt_inside = (rois_data['rtmin'] <= p.rt) & (p.rt <= rois_data['rtmax'])
    hits = np.nonzero(mz_inside & rt_inside)[0]
    return rois_data['rois'][hits]

In [None]:
# Assign every raw peak of every file to the ROIs of that same file that contain it.
# NOTE(review): roi.add(p) is called on the objects stored in all_rois, so re-running this cell
# without rebuilding all_rois presumably adds the same peaks a second time -- confirm.
for filename in unique_filenames:
    spectra = ds.file_spectra[filename]
    # Use the ROI table of the file being processed. Relying on whatever `rois_data` happens to
    # be left in the namespace would send the peaks of every file to a single file's ROIs.
    rois_data = all_rois[filename]
    n_spectra = len(spectra)
    for scan_id, spectrum in spectra.items():
        print('%4d/%d processing spectrum %s' % (scan_id, n_spectra, spectrum))
        rt = ds._get_rt(spectrum)
        for mz, intensity in spectrum.peaks('raw'):
            p = PeakSample(mz, rt, intensity, spectrum.ms_level)
            for roi in get_containing_rois(p, rois_data):
                roi.add(p)

In [None]:
# Persist the populated ROI tables; the plotting section below reloads them from this file.
save_obj(all_rois, '../models/all_rois.p')

### Plot some ROIs we found

In [None]:
# Reload the ROI tables written by the save_obj cell above.
# NOTE(review): if load_obj unpickles the file, only load files you trust -- TODO confirm the format.
all_rois = load_obj('../models/all_rois.p')

In [None]:
# Pick the first key of all_rois as the file to inspect.
file_name = list(all_rois.keys())[0]

In [None]:
# Show which file was picked.
file_name

In [None]:
# ROI table of the chosen file: a dict with the keys 'rois', 'mzmin', 'mzmax', 'rtmin', 'rtmax'.
rois_data = all_rois[file_name]

In [None]:
# Dump the m/z, RT and intensity lists plus the pickedPeak flag of the first ten ROIs,
# one value per line, with a blank line after each ROI.
for roi in rois_data['rois'][:10]:
    peaks = roi.peaks
    mzs = [p.mz for p in peaks]
    rts = [p.rt for p in peaks]
    intensities = [p.intensity for p in peaks]
    # the trailing '' reproduces the empty line that separates two ROIs
    print(mzs, rts, intensities, roi.pickedPeak, '', sep='\n')

In [None]:
# Split the ROIs of the chosen file by their pickedPeak flag, keeping the original order.
true_rois, false_rois = [], []
for roi in rois_data['rois']:
    bucket = true_rois if roi.pickedPeak else false_rois
    bucket.append(roi)

In [None]:
# Number of ROIs with a truthy pickedPeak flag, then the number without.
print(len(true_rois))
print(len(false_rois))

In [None]:
def plot_roi(roi_list, min_length, title, min_rt=180, max_plots=3):
    """Plot intensity against retention time for the first few qualifying ROIs.

    An ROI qualifies when it has more than ``min_length`` peaks and the RT of its
    first peak is greater than ``min_rt``. Every qualifying ROI is drawn in its own
    figure (``plt.show()`` is called per ROI); plotting stops after ``max_plots``
    figures.

    Args:
        roi_list: iterable of objects exposing ``peaks``; each peak has ``rt`` and
            ``intensity`` attributes.
        min_length: an ROI needs strictly more peaks than this to be plotted.
        title: title given to every figure.
        min_rt: the first peak's RT must be strictly greater than this value
            (same units as ``p.rt``). Defaults to 180, the value that used to be
            hard-coded.
        max_plots: maximum number of figures to draw. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        None.
    """
    count = 0
    for roi in roi_list:
        if count >= max_plots:
            break
        rts = [p.rt for p in roi.peaks]
        # `rts` is checked for emptiness first so rts[0] can never raise, even when
        # min_length is negative.
        if rts and len(rts) > min_length and rts[0] > min_rt:
            intensities = [p.intensity for p in roi.peaks]
            plt.plot(rts, intensities)
            plt.xlabel('RT')
            plt.ylabel('Intensity')
            plt.title(title)
            plt.show()
            count += 1

In [None]:
# ROIs flagged as picked peaks; only ROIs with more than 30 peaks are drawn.
plot_roi(true_rois, 30, 'ROI detected as peak')

In [None]:
# ROIs not flagged as picked peaks; only ROIs with more than 500 peaks are drawn.
plot_roi(false_rois, 500, 'ROI detected as not peak')

Check that the chromatograms built from the picked-peak ROIs also appear among the chromatograms we exported before (`beer_ms1_peaks.csv.gz`).

In [None]:
def to_chromatogram(roi):
    """Convert an ROI into an EmpiricalChromatogram.

    Args:
        roi: object exposing ``peaks``; each peak has ``mz``, ``rt`` and
            ``intensity`` attributes.

    Returns:
        ``EmpiricalChromatogram(rts, mzs, intensities)`` built from the ROI's peaks
        in their stored order, or None when the ROI has fewer than two peaks.
    """
    peaks = roi.peaks
    if len(peaks) < 2:
        return None

    mzs, rts, intensities = [], [], []
    for peak in peaks:
        mzs.append(peak.mz)
        rts.append(peak.rt)
        intensities.append(peak.intensity)
    return EmpiricalChromatogram(rts, mzs, intensities)

In [None]:
def rois_to_chromatograms(rois):
    """Convert ROIs to chromatograms, dropping the ones that cannot be converted.

    Args:
        rois: iterable of ROI objects accepted by ``to_chromatogram``.

    Returns:
        List of the non-None results of ``to_chromatogram``, in input order.
    """
    converted = (to_chromatogram(roi) for roi in rois)
    return [chrom for chrom in converted if chrom is not None]

In [None]:
# Chromatograms built from the picked-peak ROIs only; ROIs with fewer than two peaks are dropped.
extracted_chromatograms = rois_to_chromatograms(true_rois)

In [None]:
# Reference chromatograms to compare against.
# NOTE(review): the file is presumably the peak table exported from XCMS earlier -- confirm.
xcms_output = '../models/beer_ms1_peaks.csv.gz'
exported_chromatograms = ChromatogramCreator(xcms_output).chromatograms

In [None]:
# Number of chromatograms built from the ROIs.
len(extracted_chromatograms)

In [None]:
# Number of reference chromatograms.
len(exported_chromatograms)

In [None]:
def check(chrom, to_search):
    """Find the first element of ``to_search`` that compares equal to ``chrom``.

    Args:
        chrom: the object to look for; compared with ``==``.
        to_search: sequence of candidates.

    Returns:
        The position of the first match (which is also printed), or None when
        nothing matches.
    """
    # enumerate keeps the index and the item in step with each other
    for i, item in enumerate(to_search):
        if chrom == item:
            print('Found', chrom, 'at', i)
            return i
    return None

In [None]:
# Look up the extracted chromatogram at index 1000 among the reference chromatograms.
check(extracted_chromatograms[1000], exported_chromatograms)

In [None]:
# Same lookup for the extracted chromatogram at index 1001.
check(extracted_chromatograms[1001], exported_chromatograms)

In [None]:
# Same lookup for the extracted chromatogram at index 1002.
check(extracted_chromatograms[1002], exported_chromatograms)

In [None]:
# Inspect one extracted chromatogram: plot raw intensity against raw RT, then print its raw arrays.
i = 1000
plt.plot(extracted_chromatograms[i].raw_rts, extracted_chromatograms[i].raw_intensities)
print(extracted_chromatograms[i].raw_rts)
print(extracted_chromatograms[i].raw_intensities)
print(extracted_chromatograms[i].raw_mzs)

In [None]:
# Print the index of the first reference chromatogram whose first raw RT lies strictly
# between 266.140 and 266.150, then stop searching.
for i, chrom in enumerate(exported_chromatograms):
    rt = chrom.raw_rts[0]
    if 266.140 < rt < 266.150:
        print(i)
        break

In [None]:
# NOTE(review): 1882 is hard-coded -- presumably the index printed by the search in the previous
# cell; confirm before re-running on different data.
i = 1882
plt.plot(exported_chromatograms[i].raw_rts, exported_chromatograms[i].raw_intensities)