In [10]:
import pandas as pd
import numpy as np
import re # Regular expression library

# StringIO - to read dataframe from string
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [11]:
# Raw text dump that the following cells parse.
file_name = "../data/AGGR-STATUS-V/AGGR-STATUS-V_NASKNX07A.txt"

# Read the whole file up front; the handle is only needed for the read itself.
with open(file_name, 'r') as open_file:
    txt = open_file.read()

# Individual logs are delimited by a blank line followed by '#'. A plain
# blank line (\n\n) is not enough as a separator here. Note that the '#'
# is consumed by the split, so every chunk after the first starts without it.
separate_logs = txt.split('\n\n#')

In [12]:
# Turn the raw log chunks in `separate_logs` into a tidy table with one row
# per (log, aggregate, volume).
#
# Produces:
#   table_data -- list of (date, type, hostname, serial_no, agg, vol) tuples
#   df         -- DataFrame of those rows, with `date` parsed to datetime

# Log header: a first line holding the date (ending in a 4-digit year) and
# "(<ASUP type>) INFO", then a "# hostname: <name> serialno: <digits>" line.
log_header_re = re.compile(
    r'(.* \d{4}).*\((.*)\) INFO\n# hostname: (.*) serialno: (\d+)\n.*', re.M)

# Aggregate section: an online "aggr*" name followed (lazily, across lines)
# by its "Volumes:" list, which runs up to the next blank line.
agg_volumes_re = re.compile(
    r'\s(aggr[A-Za-z0-9_]*)\sonline.*?\sVolumes: (.*?)\n\n+?', re.M|re.DOTALL)

table_data = []
for log in separate_logs:
    # Normalise before testing for emptiness so whitespace-only chunks are
    # skipped too. The split delimiter removes the leading '#' from every
    # chunk except possibly the first one; drop it there as well so it cannot
    # leak into the captured date and break the datetime parsing below.
    log = log.strip().lstrip('#').strip()
    if not log or "ERROR" in log:
        continue

    # Get log info. re.match returns None when the header is not recognised;
    # report and skip that chunk instead of failing with AttributeError.
    header = log_header_re.match(log)
    if header is None:
        print("Skipping log with unrecognised header: %r" % log[:60])
        continue
    log_info = header.groups()
    _date, _type, _hostname, _serial_no = log_info

    print(_date,_type,_hostname,_serial_no)

    # Get aggr name and volumes names
    log_agg_volumes = agg_volumes_re.findall(log)
    for agg_volumes in log_agg_volumes:
        _agg_name = agg_volumes[0]
        # The list is comma-separated and may wrap over several lines; strip
        # each name and drop empty fragments (e.g. from a trailing comma).
        volumes = [v.strip() for v in agg_volumes[1].split(',') if v.strip()]
        for _vol in volumes:
            table_data.append(
                (_date, _type, _hostname, _serial_no, _agg_name, _vol)
            )

df = pd.DataFrame(table_data, columns=['date', 'ASUP_Type','hostname','serial_no','agg','vol'])
# Dates look like "Sun Jul  2 00:34:30 2017".
df['date'] = pd.to_datetime(df['date'], format='%a %b %d %H:%M:%S %Y')

Sun Jul  2 00:34:30 2017 WEEKLY_LOG NASKNX07A 451416000026
Sun Jun 25 00:33:39 2017 WEEKLY_LOG NASKNX07A 451416000026
Sun Jun 18 00:22:49 2017 WEEKLY_LOG NASKNX07A 451416000026
Sun Jun 11 00:22:06 2017 WEEKLY_LOG NASKNX07A 451416000026


In [13]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,date,ASUP_Type,hostname,serial_no,agg,vol
0,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr0,vol0
1,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,vf_NAS_AURDIVIC_rootvol
2,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,wd_aurdivic_hobo_vol01
3,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,wd_aurdivic_branch_vol01
4,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,vf_NAS_AURDI02_rootvol
5,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,A3DOCKD0052_USER
6,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,A3DOCKD0052_PROFILE
7,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,A3DOCKD0052_SHARED
8,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,vf_NAS_AURINT_rootvol
9,2017-07-02 00:34:30,WEEKLY_LOG,NASKNX07A,451416000026,aggr01_nsas,vf_NAS_AURIACT_rootvol


In [15]:
# Persist the parsed table as CSV; index=False leaves the row index out.
# NOTE(review): the input was read from "../data/..." but this writes under
# "data/" relative to the working directory -- confirm the intended location.
output_csv = "data/df-agg-status.csv"
df.to_csv(output_csv, index=False)