In [1]:
import os
import sys
import pandas as pd
import datetime
import numpy as np
import time
import re
from tqdm import tqdm
import timeit
tqdm.pandas()
import omap


In [2]:
# Start the wall-clock timer; the elapsed time is printed once
# test_folder_path has returned (see the "Time Taken:" print further down).
starttime = timeit.default_timer()

def _read_omap_log(folder, file_name):
    """Read one tab-separated OMAP log and parse its 'Date' column.

    Args:
        folder: Directory that contains the log file.
        file_name: Name of the log file inside ``folder``.

    Returns:
        A DataFrame whose 'Date' column has been converted with
        ``pd.to_datetime`` using the '%m/%d/%Y %H:%M:%S' format.
    """
    frame = pd.read_csv(os.path.join(folder, file_name), sep="\t")
    frame['Date'] = pd.to_datetime(frame['Date'], format='%m/%d/%Y %H:%M:%S')
    return frame


def _rows_in_window(frame, start, end, include_end):
    """Return the rows of ``frame`` whose 'Date' is in the requested window.

    The window is ``[start, end]`` when ``include_end`` is true and
    ``[start, end)`` otherwise.
    """
    dates = frame['Date']
    before_end = (dates <= end) if include_end else (dates < end)
    return frame.loc[(dates >= start) & before_end]


def test_folder_path(input_folder_path, start_time, end_time):
    """Process the OMAP logs of every car folder found under a train folder.

    Expected layout: ``<input_folder_path>/<car>/<date>/OMAP_{ATO,ATP,COM,TDMS}``
    where each ATO log has companion logs whose names are obtained by replacing
    'ATO' with 'ATP', 'COM' and 'TDMS'.

    Args:
        input_folder_path: Train folder; the path must contain 'Train <number>'.
        start_time: Start of the window as a '%Y/%m/%d %H:%M:%S' string.
        end_time: End of the window as a '%Y/%m/%d %H:%M:%S' string.

    Returns:
        A list with one entry per processed car:
        ``[ATO_ATP_result, ATO_COM_result, ATO_TDMS_result, df_result]``.
        Cars without an 'OMAP_ATO' folder are reported and skipped.

    Raises:
        SystemExit: If an ATO log lacks a companion ATP/COM/TDMS log, or if
            no ATO row falls inside the requested time window.
    """
    companion_types = ('ATP', 'COM', 'TDMS')
    log_types = ('ATO',) + companion_types

    # Retrieve Train number
    train_number = re.findall(r'%s(\d+)' %'Train ', input_folder_path)[0]

    # Determine time difference between start and end time
    datetime_format = '%Y/%m/%d %H:%M:%S'
    start_time = datetime.datetime.strptime(start_time, datetime_format)
    end_time = datetime.datetime.strptime(end_time, datetime_format)
    hours = (end_time - start_time).total_seconds() / 3600

    result_list = []
    for car in os.listdir(input_folder_path):
        car_number = re.findall(r'%s(\d+)' %'Car ', car)[0]
        print("---Processing Train " + train_number + " Car " + car_number + "---")
        date = os.listdir(os.path.join(input_folder_path, car))[0]
        back_date_flag = False

        # Test for correct folder structure (the date folder should look like 200116)
        try:
            datetime.datetime.strptime(date, "%y%m%d")
        except ValueError:
            print("Inorrect date string format. Example 200116 %y%m%d. This might result in process error.")

        # OMAP Processing
        log_root = os.path.join(input_folder_path, car, date)
        folders = {kind: os.path.join(log_root, 'OMAP_' + kind) for kind in log_types}

        if not os.path.exists(folders['ATO']):
            print("ATO folder does not exist. Incorrect folder structure")
            continue

        # The listing must really be sorted: the one-hour branch below compares
        # a log with the next one in the list.
        ATO_file_list = sorted(os.listdir(folders['ATO']))

        # Companion folder listings do not change while the ATO logs are walked.
        available = {kind: set(os.listdir(folders[kind])) for kind in companion_types}

        # Window slices are collected per log type and concatenated once.
        pieces = {kind: [] for kind in log_types}

        for index, file in enumerate(ATO_file_list):
            names = {kind: file.replace('ATO', kind) for kind in log_types}

            # For each ATO file, find the corresponding ATP, COM and TDMS file
            process_flag = True
            for kind in companion_types:
                if names[kind] not in available[kind]:
                    print(file + " does not have a corresponding " + kind + " file. Hence will not be processed. ")
                    process_flag = False

            if not process_flag:
                sys.exit(car + " data folder is incorrect / missing corresponding log files. Cannot proceed with processing! Kindly check whether ATO, ATP COM and TDMS have their corresponding log files.")

            frames = {kind: _read_omap_log(folders[kind], names[kind]) for kind in log_types}

            if hours == 1:  # Process only 1 log file
                # Compare the current log with the next one and use whichever
                # has more rows stamped exactly at the start time. The original
                # author notes this mirrors ATLAS and is meant to be removed.
                if index + 1 < len(ATO_file_list):
                    next_file = ATO_file_list[index + 1]
                    next_ato = _read_omap_log(folders['ATO'], next_file)
                    next_length = int((next_ato['Date'] == start_time).sum())
                    current_length = int((frames['ATO']['Date'] == start_time).sum())

                    if next_length > current_length:
                        # Read logs from next index
                        frames = {'ATO': next_ato}
                        for kind in companion_types:
                            frames[kind] = _read_omap_log(folders[kind], next_file.replace('ATO', kind))

                for kind in log_types:
                    pieces[kind].append(_rows_in_window(frames[kind], start_time, end_time, include_end=True))
                # Only one log is used for a one-hour window.
                break

            # Multi-hour (or fractional) window: the end time is exclusive here.
            ato_window = _rows_in_window(frames['ATO'], start_time, end_time, include_end=False)
            pieces['ATO'].append(ato_window)
            for kind in companion_types:
                pieces[kind].append(_rows_in_window(frames[kind], start_time, end_time, include_end=False))

            # Check if log is '00'_00_00 time and within time period
            if re.findall(r'%s(\d+)' %'_', file)[0] == '00' and len(ato_window) > 0:
                back_date_flag = True

        ATO_dataframe, ATP_dataframe, COM_dataframe, TDMS_dataframe = (
            pd.concat(pieces[kind], ignore_index=True) if pieces[kind] else pd.DataFrame()
            for kind in log_types
        )

        # Checked before anything reads 'Date' or the last file name, so an empty
        # ATO folder or an empty window gives this message, not a NameError/KeyError.
        if ATO_dataframe.shape[0] == 0:
            sys.exit("No matching time window from start to end time! Kindly check if you have input the correct start and end time!")

        # Check if the last visited log is the '00'_00_00 one. Every collected row
        # already lies inside the window and the frame is non-empty at this point,
        # so only the file name needs testing.
        if re.findall(r'%s(\d+)' %'_', file)[0] == '00':
            back_date_flag = True
            print("back_date_flag")

        # Process ATO, ATP, COM, TDMS individually
        ATO_dataframe = omap.process_ato(ATO_dataframe, back_date_flag)
        ATP_dataframe = omap.process_others(ATP_dataframe, 'ATP')
        COM_dataframe = omap.process_others(COM_dataframe, 'COM')
        TDMS_dataframe = omap.process_others(TDMS_dataframe, 'TDMS')

        # Merge ATO-ATP, ATO-COM, ATO-TDMS
        ATO_ATP_result = omap.merge_ato_n_others(ATO_dataframe, ATP_dataframe)
        ATO_COM_result = omap.merge_ato_n_others(ATO_dataframe, COM_dataframe)
        ATO_TDMS_result = omap.merge_ato_n_others(ATO_dataframe, TDMS_dataframe)

        # Merge all results dataframes
        df_result = omap.merge_all(ATO_ATP_result, ATO_COM_result, ATO_TDMS_result, train_number, car_number, start_time, end_time)

        # Append results for Unit Test
        result_list.append([ATO_ATP_result, ATO_COM_result, ATO_TDMS_result, df_result])

    return result_list
                          
            
# --- Active configuration: Train 20, 16 Jan 2020, 05:00 to 08:00 ---
input_folder_path =  './T20 OMAP DATA/Train 20 CSV/'
#input_folder_path =  './test_folder/Train 19 CSV/'
# Machine time of interest, given as '%Y/%m/%d %H:%M:%S'
start_time = '2020/01/16 05:00:00' 
end_time = '2020/01/16 08:00:00'

# Machine Time of interest ('%m/%d/%Y %H:%M:%S')
# start_time = '01/16/2020 07:00:00' 
# end_time = '01/16/2020 08:00:00'
#sample_output_filename = ['./OMAP_Train_20_Car_39_20200116_0600_to_20200116_0700.csv']
#sample_output_filename = ['./OMAP_Train_20_Car_39_20200116_0700_to_20200116_0800.csv']
# One reference CSV per processed car, in the same order as the results.
sample_output_filename = ['./OMAP_Train_20_Car_39_20200116_0500_to_20200116_0800.csv']

# --- Alternative configuration: Train 23, 14 Nov 2019 ---
# input_folder_path = './Train 23 14NOV2019 Work Folder/raw logs/OMAP/Train 23 Copy'
# start_time = '2019/11/14 18:00:00' 
# end_time = '2019/11/14 19:00:00'
# # sample_output_filename = ['./Train 23 14NOV2019 Work Folder/preprocessing_output/OMAP_Train_23_Car_45_20191114_1700_to_20191114_1800.csv',
# #                           './Train 23 14NOV2019 Work Folder/preprocessing_output/OMAP_Train_23_Car_46_20191114_1700_to_20191114_1800.csv']
# sample_output_filename = ['./Train 23 14NOV2019 Work Folder/preprocessing_output/OMAP_Train_23_Car_46_20191114_1800_to_20191114_1900.csv']


# --- Alternative configuration: Train 10, 26 Feb 2018 ---
# input_folder_path = './Train 10 26FEB2018 Work Folder/raw_logs/OMAP/Train 10'
# start_time = '2018/02/26 01:00:00' 
# end_time = '2018/02/26 02:00:00' 
# sample_output_filename = ['./Train 10 26FEB2018 Work Folder/preprocessed_output/OMAP_Train_10_Car_19_20180226_0100_to_20180226_0200.csv']

# --- Alternative configuration: Train 10, 22 Feb 2018 ---
# input_folder_path = './Train 10 22FEB2018 Work Folder/raw_logs/OMAP/Train 10'
# start_time = '2018/02/22 01:00:00' 
# end_time = '2018/02/22 02:00:00' 
# sample_output_filename = ['./Train 10 22FEB2018 Work Folder/preprocessed_output/OMAP_Train_10_Car_19_20180222_0100_to_20180222_0200.csv']


# Run the processing and report the elapsed time since `starttime`.
result_list = test_folder_path(input_folder_path, start_time, end_time)
print("Time Taken:", timeit.default_timer() - starttime)

# Unit Test: compare each car's results with its reference CSV.
for index, (ato_atp_result, ato_com_result, ato_tdms_result, merged_result) in enumerate(result_list):
    reference_csv = sample_output_filename[index]
    omap.unit_test(reference_csv, 'ATP', ato_atp_result)
    omap.unit_test(reference_csv, 'COM', ato_com_result)
    omap.unit_test(reference_csv, 'TDMS', ato_tdms_result)
    omap.unit_test_all(reference_csv, merged_result)


---Processing Train 20 Car 39---
---Processing ATO Dataframe---
Number of null value entry =  0
Found non-monotonic sequence at index:  Int64Index([19474, 59862, 68624], dtype='int64')
[Timestamp('2020-01-16 05:20:10'), Timestamp('2020-01-16 05:20:11')]
[17, 16]
[17, 16]
19461 2020-01-16 05:20:10
19462 2020-01-16 05:20:10
19463 2020-01-16 05:20:10
19464 2020-01-16 05:20:10
19465 2020-01-16 05:20:10
19466 2020-01-16 05:20:10
19467 2020-01-16 05:20:10
19468 2020-01-16 05:20:10
19469 2020-01-16 05:20:10
19470 2020-01-16 05:20:10
19471 2020-01-16 05:20:10
19472 2020-01-16 05:20:10
19473 2020-01-16 05:20:10
19474 2020-01-16 05:20:10
19475 2020-01-16 05:20:10
19476 2020-01-16 05:20:10
19477 2020-01-16 05:20:10
19478 2020-01-16 05:20:11
19479 2020-01-16 05:20:11
19480 2020-01-16 05:20:11
19481 2020-01-16 05:20:11
19482 2020-01-16 05:20:11
19483 2020-01-16 05:20:11
19484 2020-01-16 05:20:11
19485 2020-01-16 05:20:11
19486 2020-01-16 05:20:11
19487 2020-01-16 05:20:11
19488 2020-01-16 05:20:11


100%|██████████████████████████████████████████████████████████████████████████| 80625/80625 [00:32<00:00, 2515.97it/s]
100%|█████████████████████████████████████████████████████████████████████████| 80625/80625 [00:04<00:00, 16752.62it/s]
100%|██████████████████████████████████████████████████████████████████████████| 80625/80625 [01:17<00:00, 1040.79it/s]


---Merging Dataframes---


100%|██████████████████████████████████████████████████████████████████████████| 80468/80468 [00:44<00:00, 1817.17it/s]
100%|█████████████████████████████████████████████████████████████████████████| 80468/80468 [00:03<00:00, 21569.21it/s]
100%|██████████████████████████████████████████████████████████████████████████| 80468/80468 [01:14<00:00, 1086.30it/s]


---Merging Dataframes---


100%|██████████████████████████████████████████████████████████████████████████| 87775/87775 [00:31<00:00, 2767.35it/s]
100%|█████████████████████████████████████████████████████████████████████████| 87775/87775 [00:04<00:00, 19682.04it/s]
100%|██████████████████████████████████████████████████████████████████████████| 87775/87775 [01:08<00:00, 1283.72it/s]


---Merging all Dataframes (ATO, ATP, COM, TDMS)---
Time Taken: 667.8381876000001
---Unit Test between merged ATO and ATP Dataframe---
(80625, 546) (80625, 546)
None
None
             ATO_1220_Energy_delta  ATO_2008_ATP_Energy_delta
0     self                   8.989                      8.989
      other                  8.989                      8.989
1     self                   8.989                      8.989
      other                  8.989                      8.989
2     self                   8.989                      8.989
...                            ...                        ...
80613 other                677.366                    677.366
80614 self                 677.366                    677.366
      other                677.366                    677.366
80616 self                 677.366                    677.366
      other                677.366                    677.366

[7122 rows x 2 columns]
Equality Between Sample Output and Self Processed:  True
---U

In [3]:
# Import the module, not the class: `from datetime import datetime` would rebind
# the name `datetime` and break the `datetime.datetime.strptime` calls in
# test_folder_path when that cell is run again.
import datetime

# Print sample Unix epoch timestamps (in seconds) as UTC date strings.
# If you encounter a "year is out of range" error the timestamp may be in
# milliseconds; try dividing it by 1000 in that case.
for ts in (1573751100, 1573751117, 1519603200):
    # fromtimestamp(..., tz=utc) replaces the deprecated utcfromtimestamp and
    # formats to the same text.
    print(datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S'))


2019-11-14 17:05:00
2019-11-14 17:05:17
2018-02-26 00:00:00


In [4]:
# Scratch check of the file-name pattern used in test_folder_path: the first
# run of digits that follows an underscore is extracted and compared with '00'.
sample_log_name = '200215_00_00_00_329_OMAP_ATO.txt'
first_field = re.findall(r'%s(\d+)' %'_', sample_log_name)[0]
print(first_field)
if first_field == '00':
    print('True')

00
True


In [5]:
import collections
import itertools 
# Sample interval lists; exactly one assignment is left active. Presumably these
# are trial inputs for distribute_equal (it is not called in this cell).
#interval_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]
interval_list = [20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]
#interval_list = [20, 20, 19, 19]
#interval_list = [20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]


def distribute_equal (interval_list):
    """Rearrange ``interval_list`` so its distinct values are spread evenly.

    The values are arranged into ``m`` identical groups, where ``m`` is the
    smallest number of occurrences of any distinct value. Each group holds
    every distinct value once (in first-seen order) followed by the extra
    copies of the more frequent values. Copies that do not divide evenly into
    the groups are kept as well: values greater than the least frequent value
    go to the front of the result, all others to the end.

    Args:
        interval_list: List of hashable, mutually comparable values.

    Returns:
        A new list containing exactly the elements of ``interval_list`` in the
        redistributed order. The list is also printed. An empty input gives
        an empty list.
    """
    if not interval_list:
        # min() below would raise on an empty input.
        master_list = []
        print(master_list)
        return master_list

    # Get count of each unique interval (keys keep first-seen order)
    unique_value_collection = collections.Counter(interval_list)
    unique_value = list(unique_value_collection)
    min_count = min(unique_value_collection.values())

    # First value (in first-seen order) that has the smallest count.
    min_value = next(v for v in unique_value if unique_value_collection[v] == min_count)
    remain_list = [v for v in unique_value if v != min_value]

    # One group: every distinct value once, then the extra copies of each
    # more frequent value that fit evenly into min_count groups.
    group = list(unique_value)
    for value in remain_list:
        group.extend([value] * (unique_value_collection[value] // min_count - 1))
    master_list = group * min_count

    # Copies that did not fit evenly. Previously at most one element (always of
    # the most frequent value) was added back, so longer remainders were lost.
    front, back = [], []
    for value in remain_list:
        leftover = unique_value_collection[value] % min_count
        if leftover:
            target = front if value > min_value else back
            target.extend([value] * leftover)
    master_list = front + master_list + back

    print(master_list)
    return master_list