In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import glob
import os
import peakutils
import datetime
import warnings
from jupyterthemes import jtplot

In [6]:
jtplot.style('default', grid=False)
from rga.rga_sw import *

In [None]:
from rga.rga_sw import 

# Playground for developing class ProcessScanDF

##  Test file (only 11 spectra)

In [51]:
filename = f'../rga_data/20190715_154154_Faraday_Background_20ms_200amu_scan.asc'

##### Check malfunctioning of previous method (process_raw_rga_data)
Since we enter manually the "resolution" (which is actually the number of points per amu), 
the number buffer lines and so on, and assume that the scan width is 200 (while for some reason it stops at 199,63)
then the whole thing fails:

In [75]:
df = process_raw_rga_data(filename, n_spectra=10, resolution=8, buffer_lines=8)

1600
            mass              ion_current
0     Scan Width                   200,00
1     Start Time  7/15/2019 03:43:27.334 
2     Mass [amu]          Ion Current [A]
3           0,00            9,594746e-014
4           0,13            9,260449e-014
5           0,25            2,219874e-012
6           0,38            2,688014e-012
7           0,50            9,255827e-013
8           0,63            1,806799e-012
9           0,75            2,895810e-012
10          0,88            9,155655e-013
11          1,00            1,281121e-012
12          1,13            1,431247e-012
13          1,25            4,302735e-012
14          1,38            2,462607e-012
15          1,50            4,064911e-012
16          1,63            1,173407e-012
17          1,75            8,254930e-013
18          1,88            1,093342e-012
19          2,00            2,362524e-012
20          2,13            5,126308e-012
21          2,25            4,830841e-012
22          2,38            5

ValueError: unconverted data remains:  

In [240]:
n_spectra = 10
df1 = pd.concat([pd.read_csv(filename, sep='\t', skiprows = i * ( Nrows + buffer_lines ) + buffer_lines + 2,
                             nrows = Nrows, header=1, decimal=',', encoding='ascii', 
                             names=['a', 'ion_current']) for i in range(1, n_spectra)], axis=1)
assert (df1.a.iloc[0] == first_mass).all()

assert (round(df1.a.iloc[-1]) == scan_width).all(), "The read spectra do not fit to the expected width"


##### So we write the following

## Method to automatically compute all the spectra metadata: 

In [388]:
def retrieve_metadata(self):
    """Compute from file occurrences the following parameters:
        - skiprows0: number of header lines before first spectrum data
        - Nrows : number of rows each spectrum occupies
        - buffer_lines : number of buffer lines between one spectrum and the next
        - n_spectra : total number of spectra found
        
    """
    of = open(self.filename)
    head = [next(of) for x in range(200*60)]
    repet = 0

    for i,line in enumerate(head):
        if 'Mass [amu]' in line:
            repet += 1
            if repet == 1:
                self.skipRows0 = i
#                 print('skipRows0 = ', skipRows0)
            elif repet == 2:
                self.buffer_lines = i - Nrows - self.skipRows0
#                 print('buffer_lines = ', buffer_lines)

        if 'Task Name' in line and repet == 1:
            self.Nrows = i - self.skipRows0 - 2
#         return [Nrows, skipRows0, buffer_lines]

10
['7/15/2019 03:43:27.334 ' '7/15/2019 03:43:32.260 '
 '7/15/2019 03:43:37.187 ' '7/15/2019 03:43:42.113 '
 '7/15/2019 03:43:47.043 ' '7/15/2019 03:43:51.976 '
 '7/15/2019 03:43:56.910 ' '7/15/2019 03:44:01.843 '
 '7/15/2019 03:44:06.775 ' '7/15/2019 03:44:11.708 '
 '7/15/2019 03:44:16.645 ']


## Now we divide the different tasks into specific functions and organize them into a class 

### This is a shorter version for testing

In [333]:
from dataclasses import dataclass

@dataclass
class ProcessScanDF:
    """Method class Process Raw Scan DataFrame"""
    filename : str
    test : bool = False
    Nrows : int = 0
    skipRows0 : int = 0
    buffer_lines : int = 0
        
    def retrieve_metadata(self):
        """Compute from file occurrences the following parameters:
            - skiprows0: number of header lines before first spectrum data
            - Nrows : number of rows each spectrum occupies
            - buffer_lines : number of buffer lines between one spectrum and the next
        """
        of = open(self.filename)
        head = [next(of) for x in range(200*60)]
        repet = 0

        for i,line in enumerate(head):
            if 'Mass [amu]' in line:
                repet += 1
                if repet == 1:
                    self.skipRows0 = i
    #                 print('skipRows0 = ', skipRows0)
                elif repet == 2:
                    self.buffer_lines = i - Nrows - self.skipRows0
    #                 print('buffer_lines = ', buffer_lines)

            if 'Task Name' in line and repet == 1:
                self.Nrows = i - self.skipRows0 - 2
#         return [Nrows, skipRows0, buffer_lines]
    
    def import_first_column(self)->pd.DataFrame:
        """Import first spectrum to stitch to the rest and keep 'mass [amu]' column"""
        
        df0 = pd.read_csv(self.filename, sep='\t', skiprows= self.skipRows0 - 2, nrows = self.Nrows,
                      header=1, decimal=",", names=['mass', 'ion_current'], encoding='ascii')
        return df0

    def main(self):
        df0 = self.import_first_column()
        print(df0)
        if self.test:
            print('Test is True')

### This is the complete one

In [425]:
from dataclasses import dataclass

@dataclass
class ProcessScanDF:
    """Method class Process Raw Scan DataFrame"""
    filename : str
    test : bool = False
    Nrows : int = 0
    skipRows0 : int = 0
    buffer_lines : int = 0
    n_spectra : int = 0
        
    def retrieve_metadata(self, head_size: int = 2000):
        """Compute from file occurrences the following parameters:
            - skiprows0: number of header lines before first spectrum data
            - Nrows : number of rows each spectrum occupies
            - buffer_lines : number of buffer lines between one spectrum and the next
            - n_spectra : total number of spectra found
        """
        of = open(self.filename)
        head = [next(of) for x in range(head_size)]
        repet = 0

        for i,line in enumerate(head):
            if 'Mass [amu]' in line:
                repet += 1
                if repet == 1:
                    self.skipRows0 = i
    #                 print('skipRows0 = ', skipRows0)
                elif repet == 2:
                    self.buffer_lines = i - self.Nrows - self.skipRows0
    #                 print('buffer_lines = ', buffer_lines)

            if 'Task Name' in line and repet == 1:
                self.Nrows = i - self.skipRows0 - 2
        
            number_lines = sum(1 for line in open(self.filename)) 
        if self.n_spectra == 0:
            self.n_spectra = round(number_lines/(self.Nrows+self.buffer_lines))
            print("Found ",n_spectra, " spectra")
                        
        def retrieve_advanced_metadata(head : list, skipRows0 : int, Nrows: int):
            """Compute first mass, scan width (max. 200 amu) and resolution (fraction of amu read) from file"""
            m0 = float(str.rsplit(head[self.skipRows0 + 1], sep='\t')[0].replace(',','.'))
            m1 = float(str.rsplit(head[self.skipRows0 + 2], sep='\t')[0].replace(',','.'))
            resolution = m1 - m0

            first_mass = float(str.rsplit(head[self.Nrows + self.skipRows0 + 3])[-1].replace(',', '.'))
            scan_width = float(str.rsplit(head[self.Nrows + self.skipRows0 + 4])[-1].replace(',', '.'))

            return(resolution, first_mass, scan_width)

#         return [Nrows, skipRows0, buffer_lines]

    def import_first_column(self)->pd.DataFrame:
        """Import first spectrum to stitch to the rest and keep 'mass [amu]' column"""

        df0 = pd.read_csv(self.filename, sep='\t', skiprows= self.skipRows0 - 2, nrows = self.Nrows,
                      header=1, decimal=",", names=['mass', 'ion_current'], encoding='ascii')
        return df0

    def import_main_no_test(self) -> pd.DataFrame:
        """Main dataframe importing function. Read n_spectra using metadata obtained previously
        %%timeit result: 259 ms ± 4.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
        """

        df1 = pd.concat([pd.read_csv(self.filename, sep='\t', skiprows = i * ( self.Nrows + self.buffer_lines ) + self.buffer_lines + 2,
                                     nrows = self.Nrows, header=1, decimal=',', encoding='ascii', 
                                     names=['a', 'ion_current']).drop('a', axis=1) for i in range(1, self.n_spectra)], axis=1)
        return df1

    def import_main_test(filename : str, buffer_lines: int, Nrows : int, n_spectra : int) -> pd.DataFrame:
        """Main dataframe importing function tested for first_mass and scan_range matching
        %%timeit result 254 ms ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
        """

        df1 = pd.concat([pd.read_csv(self.filename, sep='\t', skiprows = i * ( self.Nrows + self.buffer_lines ) + self.buffer_lines + 2,
                                     nrows = self.Nrows, header=1, decimal=',', encoding='ascii', 
                                     names=['a', 'ion_current']) for i in range(1, self.n_spectra)], axis=1)

        assert (df1.a.iloc[0] == first_mass).all(), "The read spectra do not match the expected first mass"
        assert (round(df1.a.iloc[-1]) == scan_width).all(), "The read spectra do not match the expected width"

        df1 = df1.drop(columns='a')
        return df1

    def read_start_datetimes(self) -> pd.DataFrame:
        """Read Start Time of each cycle and convert to datetime"""

        dtstr = np.array([pd.read_csv(self.filename, sep='\t', skiprows= i * (self.Nrows + self.buffer_lines) + self.buffer_lines + 2, 
                                      nrows=1, header=None).iloc[0,1] for i in range(self.n_spectra)])
        print(dtstr)
        try:
            dt = np.array([datetime.datetime.strptime(x, '%m/%d/%Y %H:%M:%S.%f ') for x in dtstr])
        except ValueError:
            dt = np.array([datetime.datetime.strptime(x, '%m/%d/%Y %H:%M:%S.%f') for x in dtstr])
            
        try:
            return dt
        except ValueError:
            return np.array([np.nan for x in dtstr])
    
    def export_df(self, df : pd.DataFrame):
        """Write stitched dataframe to '.txt' file with same basename"""
        
        outPath = os.path.dirname(self.filename)
        outName = os.path.splitext(os.path.basename(filename))[0]
        outFile = os.path.join(outPath, outName+'.txt')

        df.to_csv(outFile)
    
    def main(self):
        """Call of all functions, stitch dataframes and export to '.txt'
            - df0 : first cycle including mass column
            - df1 : rest of the cycles with mass column dropped
            - dt : datetimes to use as column names"""
#         [self.Nrows, self.skipRows0, self.buffer_lines] = self.retrieve_metadata()
        self.retrieve_metadata()
        df0 = self.import_first_column()
        
        ##
        #   number_lines = sum(1 for line in open(filename))
        #    
        #if n_spectra == None:
        #n_spectra = round(number_lines/Nrows)
        ##
        
        if self.test:
            df1 = self.import_main_test()
        else:
            df1 = self.import_main_no_test()

        dt = self.read_start_datetimes()

        df = pd.concat([df0,df1], axis=1)
        df.set_index(['mass'], inplace=True)
        df.columns = [dt]

        self.export_df(df)

## Here one can test the methods and call different local variables
The retrieve_metadata function seems to take too long

In [426]:
df = ProcessScanDF(filename)
df.filename

'../rga_data/20190715_154154_Faraday_Background_20ms_200amu_scan.asc'

In [427]:
df.retrieve_metadata()

Found  10  spectra


In [410]:
df.Nrows

1598

In [394]:
df.import_main_no_test()

Unnamed: 0,ion_current,ion_current.1,ion_current.2,ion_current.3,ion_current.4,ion_current.5,ion_current.6,ion_current.7,ion_current.8,ion_current.9
0,7.923713e-14,2.544498e-13,1.793658e-13,8.758847e-14,6.256671e-14,4.176946e-15,-1.250791e-14,1.292925e-13,1.710246e-13,1.626623e-13
1,-1.175880e-13,1.526785e-13,2.027080e-13,1.426422e-13,1.025983e-13,-7.499679e-15,1.326354e-13,6.257831e-14,1.927081e-13,5.255818e-14
2,-2.494104e-12,3.636799e-12,4.645752e-12,3.258844e-12,2.297547e-12,-8.827667e-14,3.161140e-12,1.526423e-12,4.538053e-12,1.138463e-12
3,4.302731e-12,3.917170e-12,1.145778e-12,4.699342e-13,-7.091556e-13,1.934514e-12,3.261344e-12,2.380151e-12,3.879522e-12,-1.244860e-12
4,6.355421e-12,1.759161e-12,4.728354e-12,9.231649e-13,9.932337e-13,1.609012e-12,-2.521654e-12,5.676825e-14,5.006186e-12,2.497435e-13
5,4.660673e-12,4.054896e-12,2.374926e-12,-1.364971e-12,1.448852e-12,2.520277e-12,9.519860e-15,8.105476e-12,4.799746e-13,5.976551e-13
6,5.700633e-13,1.578990e-12,6.798658e-12,-2.256197e-12,-2.935535e-13,3.095999e-12,-3.573047e-12,2.532682e-12,-2.334930e-13,1.696687e-12
7,2.212402e-12,-2.134699e-13,1.461280e-12,4.675296e-13,1.143369e-12,4.778374e-12,-1.482303e-13,1.966968e-12,3.529201e-12,6.176763e-13
8,2.452603e-12,2.935860e-12,-4.713030e-13,1.201014e-12,4.390393e-12,1.253487e-12,-9.870294e-13,5.163944e-12,1.886878e-12,7.353960e-13
9,5.987507e-12,4.007242e-12,2.440181e-12,-3.235943e-13,-6.891396e-13,1.994598e-12,2.112276e-12,9.379886e-13,2.164737e-12,-5.514042e-13


##### This function is not finished, the tests are defined in terms of the variables first_mass and scan_width, which are not global but local to the retrieve_metadata function
Will look into it, it's not too important right now

In [None]:
df.import_main_no_test()

In [428]:
df.read_start_datetimes()

['7/15/2019 03:43:27.334 ' '7/15/2019 03:43:32.260 '
 '7/15/2019 03:43:37.187 ' '7/15/2019 03:43:42.113 '
 '7/15/2019 03:43:47.043 ' '7/15/2019 03:43:51.976 '
 '7/15/2019 03:43:56.910 ' '7/15/2019 03:44:01.843 '
 '7/15/2019 03:44:06.775 ' '7/15/2019 03:44:11.708 '
 '7/15/2019 03:44:16.645 ']


array([datetime.datetime(2019, 7, 15, 3, 43, 27, 334000),
       datetime.datetime(2019, 7, 15, 3, 43, 32, 260000),
       datetime.datetime(2019, 7, 15, 3, 43, 37, 187000),
       datetime.datetime(2019, 7, 15, 3, 43, 42, 113000),
       datetime.datetime(2019, 7, 15, 3, 43, 47, 43000),
       datetime.datetime(2019, 7, 15, 3, 43, 51, 976000),
       datetime.datetime(2019, 7, 15, 3, 43, 56, 910000),
       datetime.datetime(2019, 7, 15, 3, 44, 1, 843000),
       datetime.datetime(2019, 7, 15, 3, 44, 6, 775000),
       datetime.datetime(2019, 7, 15, 3, 44, 11, 708000),
       datetime.datetime(2019, 7, 15, 3, 44, 16, 645000)], dtype=object)

### The main function calls all the rest: imports the columns, stitches them with datetime and writes them into a .txt file

In [429]:
df.main()

['7/15/2019 03:43:27.334 ' '7/15/2019 03:43:32.260 '
 '7/15/2019 03:43:37.187 ' '7/15/2019 03:43:42.113 '
 '7/15/2019 03:43:47.043 ' '7/15/2019 03:43:51.976 '
 '7/15/2019 03:43:56.910 ' '7/15/2019 03:44:01.843 '
 '7/15/2019 03:44:06.775 ' '7/15/2019 03:44:11.708 '
 '7/15/2019 03:44:16.645 ']


In [393]:
load_processed_rga_data('../rga_data/20190715_154154_Faraday_Background_20ms_200amu_scan.txt')

Unnamed: 0_level_0,2019-07-15 03:43:27.334000,2019-07-15 03:43:32.260000,2019-07-15 03:43:37.187000,2019-07-15 03:43:42.113000,2019-07-15 03:43:47.043000,2019-07-15 03:43:51.976000,2019-07-15 03:43:56.910000,2019-07-15 03:44:01.843000,2019-07-15 03:44:06.775000,2019-07-15 03:44:11.708000,2019-07-15 03:44:16.645000
mass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,9.594746e-14,7.923713e-14,2.544498e-13,1.793658e-13,8.758847e-14,6.256671e-14,4.176946e-15,-1.250791e-14,1.292925e-13,1.710246e-13,1.626623e-13
0.13,9.260449e-14,-1.175880e-13,1.526785e-13,2.027080e-13,1.426422e-13,1.025983e-13,-7.499679e-15,1.326354e-13,6.257831e-14,1.927081e-13,5.255818e-14
0.25,2.219874e-12,-2.494104e-12,3.636799e-12,4.645752e-12,3.258844e-12,2.297547e-12,-8.827667e-14,3.161140e-12,1.526423e-12,4.538053e-12,1.138463e-12
0.38,2.688014e-12,4.302731e-12,3.917170e-12,1.145778e-12,4.699342e-13,-7.091556e-13,1.934514e-12,3.261344e-12,2.380151e-12,3.879522e-12,-1.244860e-12
0.5,9.255827e-13,6.355421e-12,1.759161e-12,4.728354e-12,9.231649e-13,9.932337e-13,1.609012e-12,-2.521654e-12,5.676825e-14,5.006186e-12,2.497435e-13
0.63,1.806799e-12,4.660673e-12,4.054896e-12,2.374926e-12,-1.364971e-12,1.448852e-12,2.520277e-12,9.519860e-15,8.105476e-12,4.799746e-13,5.976551e-13
0.75,2.895810e-12,5.700633e-13,1.578990e-12,6.798658e-12,-2.256197e-12,-2.935535e-13,3.095999e-12,-3.573047e-12,2.532682e-12,-2.334930e-13,1.696687e-12
0.88,9.155655e-13,2.212402e-12,-2.134699e-13,1.461280e-12,4.675296e-13,1.143369e-12,4.778374e-12,-1.482303e-13,1.966968e-12,3.529201e-12,6.176763e-13
1.0,1.281121e-12,2.452603e-12,2.935860e-12,-4.713030e-13,1.201014e-12,4.390393e-12,1.253487e-12,-9.870294e-13,5.163944e-12,1.886878e-12,7.353960e-13
1.13,1.431247e-12,5.987507e-12,4.007242e-12,2.440181e-12,-3.235943e-13,-6.891396e-13,1.994598e-12,2.112276e-12,9.379886e-13,2.164737e-12,-5.514042e-13


In [368]:
dtstr = np.array([pd.read_csv(filename, sep='\t', skiprows= i * (Nrows + buffer_lines) + buffer_lines + 2, nrows=1, header=None).iloc[0,1] for i in range(n_spectra)])
try:
    dt = np.array([datetime.datetime.strptime(x, '%m/%d/%Y %H:%M:%S.%f ') for x in dtstr])
except ValueError:
    dt = np.array([datetime.datetime.strptime(x, '%m/%d/%Y %H:%M:%S.%f') for x in dtstr])
dt

array([datetime.datetime(2019, 7, 15, 3, 43, 27, 334000),
       datetime.datetime(2019, 7, 15, 3, 43, 32, 260000),
       datetime.datetime(2019, 7, 15, 3, 43, 37, 187000),
       datetime.datetime(2019, 7, 15, 3, 43, 42, 113000),
       datetime.datetime(2019, 7, 15, 3, 43, 47, 43000),
       datetime.datetime(2019, 7, 15, 3, 43, 51, 976000),
       datetime.datetime(2019, 7, 15, 3, 43, 56, 910000),
       datetime.datetime(2019, 7, 15, 3, 44, 1, 843000),
       datetime.datetime(2019, 7, 15, 3, 44, 6, 775000),
       datetime.datetime(2019, 7, 15, 3, 44, 11, 708000)], dtype=object)