In [1]:
import os
import pandas as pd
import datetime
import numpy as np
import time
from tqdm import tqdm
import timeit
tqdm.pandas()

In [2]:
def nan_equal(a,b):
    """Return True when *a* and *b* compare equal under numpy's test rules.

    The comparison is delegated to ``np.testing.assert_equal``; its
    AssertionError is translated into a plain ``False`` result.
    """
    try:
        np.testing.assert_equal(a, b)
    except AssertionError:
        matched = False
    else:
        matched = True
    return matched

def clean_dataframe(df):
    """Return a copy of *df* without the rows that contain a null value.

    The number of removed rows is printed. Duplicate rows are deliberately
    kept (whether they should be dropped is still an open question).
    """
    rows_before = len(df)
    # Clean - Drop every row that has at least one null entry
    cleaned = df.dropna(axis=0, how='any')
    # Clean - Report how many rows were removed
    print("Number of null value entry = ", rows_before - len(cleaned))
    return cleaned

def check_for_monotonic (df):
    """Return the dataframe with out-of-sequence 'Realtime' values rebalanced.

    If column 'Realtime' is not monotonically increasing, every row whose
    'Realtime' is earlier than the previous row's is treated as a fault
    location. For each fault, the rows carrying the earlier value and the
    rows carrying the later value (the value found just before the fault)
    are counted together and re-assigned as one contiguous run: the first
    half (rounded up) gets the earlier value, the remainder gets the later
    value. Every rewritten row is printed.

    *df* is modified in place and also returned.

    NOTE(review): rows are addressed with label arithmetic (``index-1``,
    ``start_index+count``), so this presumably expects a gap-free integer
    index - confirm against callers that run ``dropna`` first.
    """
    if not df['Realtime'].is_monotonic_increasing:
        # Get index of non-monotonic location - non increasing order
        # (rows whose Realtime is strictly earlier than the preceding row's)
        df_non_monotonic = df.loc[df['Realtime'].diff() < pd.to_timedelta('0 seconds')]
        
        # *** Need to determine if for loop forward or backwards (changing to which real time index) - Not Implemented
        
        print("Found non-monotonic sequence at index: ", (df.loc[df['Realtime'].diff() < pd.to_timedelta('0 seconds')].index))
        # The fault locations were captured before any rewrite below, so
        # later iterations see the frame as already modified by earlier ones.
        for index in df_non_monotonic.index:
            # The later time value that appears immediately before the fault
            non_monotonic_realtime = df['Realtime'][index-1]
            # Number of rows (anywhere in the frame) carrying that later value
            count_non_monotonic_realtime = len(df[df['Realtime'] == non_monotonic_realtime].index)
                       
            # Find start location of index with the same real time (index)
            start_index = min(df[df['Realtime'] == df['Realtime'][index]].index)
            # Number of rows carrying the earlier value found at the fault
            count_first_realtime = len(df[df['Realtime'] == df['Realtime'][index]].index)
            
            # Split the combined rows in two: the earlier value receives the
            # extra row when the total is odd.
            total_count = count_first_realtime + count_non_monotonic_realtime
            windows = total_count // 2
            first_realtime_windows = total_count - windows
            
            # Next time index information
            start_index_next_realtime = start_index + first_realtime_windows
                               
            # Update First Real Time
            for count in range (0,first_realtime_windows):
                df.loc[start_index+count,'Realtime'] = df['Realtime'][index]
                print(start_index+count, df.loc[start_index+count,'Realtime'])
                                
            # Update Real time for the next time index
            for count in range (0,windows):
                df.loc[start_index_next_realtime+count,'Realtime'] = non_monotonic_realtime
                print(start_index_next_realtime+count, df.loc[start_index_next_realtime+count,'Realtime'])
    return df

def preprocess_machine_time(df):
    """Return the dataframe with machine time expressed as Unix milliseconds.

    Column 'Date' (text formatted ``mm/dd/yyyy hh:mm:ss``) is parsed to
    datetime and two integer columns are added:

    * ``Timestamp``   - Unix time of 'Date' in milliseconds, at one-second
      resolution (always a multiple of 1000).
    * ``Timestampms`` - ``Timestamp`` plus a synthetic millisecond offset.
      Rows sharing the same second are spread evenly over it: the k-th row
      (k starting at 0, in row order) of a second holding n rows receives
      ``k * round(1000 / n)`` milliseconds.

    The columns are written to *df* in place and the same object is returned.
    """
    # Convert Date text (mm/dd/yyyy hh:mm:ss) to datetime type
    df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %H:%M:%S')

    # Whole seconds since the Unix epoch, scaled to milliseconds. Dividing by
    # a Timedelta instead of a hard-coded 10 ** 9 keeps the result correct
    # whatever resolution the datetime64 column uses internally.
    seconds_since_epoch = (df['Date'] - pd.Timestamp(0)) // pd.Timedelta(seconds=1)
    df['Timestamp'] = seconds_since_epoch.astype(np.int64) * 1000

    # Spread the rows of each second evenly across that second. This is done
    # with group-wise vector operations rather than one df.loc write per row.
    same_second = df.groupby('Timestamp')['Timestamp']
    rows_in_second = same_second.transform('size')
    position_in_second = same_second.cumcount()
    # np.rint rounds halves to even, exactly like the built-in round()
    interval_ms = np.rint(1000 / rows_in_second).astype(np.int64)
    df['Timestampms'] = df['Timestamp'] + position_in_second * interval_ms

    return df

            
def generate_real_time(x):
    """Return the real-time datetime for one row.

    The row's 'Date' supplies the calendar day and 'Hour'/'Minute'/'Second'
    the time of day. When that time of day is earlier than the module-level
    ``real_time_start``, the clock has wrapped past midnight, so one day is
    added (and a notice is printed).
    """
    row_time_of_day = datetime.timedelta(hours=x.Hour, minutes=x.Minute, seconds=x.Second)
    start_time_of_day = datetime.timedelta(
        hours=real_time_start.hour,
        minutes=real_time_start.minute,
        seconds=real_time_start.second,
    )
    stamp = datetime.datetime.combine(x.Date.date(), datetime.time(x.Hour, x.Minute, x.Second))

    # Earlier on the clock than the first sample => belongs to the next day
    if row_time_of_day < start_time_of_day:
        print("Increase by 1 day")
        stamp += datetime.timedelta(days=1)

    x['Realtime'] = stamp
    return x['Realtime']
                                                   
    
def preprocess_real_time(df):
    """Return the dataframe with real time expressed as Unix milliseconds.

    Builds column 'Realtime' from 'Date' and the 'Hour'/'Minute'/'Second'
    columns (via ``generate_real_time``), repairs out-of-sequence values
    (via ``check_for_monotonic``) and adds two integer columns:

    * ``Real_Timestamp``   - Unix time of 'Realtime' in milliseconds, at
      one-second resolution.
    * ``Real_Timestampms`` - ``Real_Timestamp`` plus a synthetic millisecond
      offset: the k-th row (k starting at 0, in row order) of a second
      holding n rows receives ``k * round(1000 / n)`` milliseconds.

    Side effect: sets the module-level ``real_time_start`` to the time of
    day of the first row; ``generate_real_time`` reads it.
    """
    # Placeholder so every row handed to generate_real_time already carries
    # a 'Realtime' entry.
    df['Realtime'] = 0

    global real_time_start
    # Use the first row by position: after dropna the label 0 may be gone,
    # in which case df.Hour[0] would raise KeyError.
    real_time_start = datetime.time(
        int(df['Hour'].iloc[0]),
        int(df['Minute'].iloc[0]),
        int(df['Second'].iloc[0]),
    )
    df['Realtime'] = df.apply(generate_real_time, axis=1)

    # Check for out-of-sequence Real Time (Non-Monotonic)
    df = check_for_monotonic(df)

    # Make sure the column is a datetime dtype before doing arithmetic on it
    df['Realtime'] = pd.to_datetime(df['Realtime'])

    # Whole seconds since the Unix epoch, scaled to milliseconds. Dividing by
    # a Timedelta instead of a hard-coded 10 ** 9 keeps the result correct
    # whatever resolution the datetime64 column uses internally.
    seconds_since_epoch = (df['Realtime'] - pd.Timestamp(0)) // pd.Timedelta(seconds=1)
    df['Real_Timestamp'] = seconds_since_epoch.astype(np.int64) * 1000

    # Spread the rows of each second evenly across that second. This is done
    # with group-wise vector operations rather than one df.loc write per row.
    same_second = df.groupby('Real_Timestamp')['Real_Timestamp']
    rows_in_second = same_second.transform('size')
    position_in_second = same_second.cumcount()
    # np.rint rounds halves to even, exactly like the built-in round()
    interval_ms = np.rint(1000 / rows_in_second).astype(np.int64)
    df['Real_Timestampms'] = df['Real_Timestamp'] + position_in_second * interval_ms

    return df

In [3]:
def nearest(lst, K):
    """Return the position of the element of *lst* closest to *K*.

    On a tie the earliest position wins. An empty *lst* raises ValueError.
    """
    best_position, _ = min(enumerate(lst), key=lambda pair: abs(pair[1] - K))
    return best_position

def calculate_ATO_real_timestampms(x):
    """Return ATO Real Time and nearest index for one row of ``df_result``.

    Rows that already have a 'Real_Timestampms' are passed through
    unchanged. For a row without one (an unmapped machine-time row), the
    neighbouring row - directly before or directly after - whose
    'Timestampms' is closest is chosen, and the real time becomes that
    neighbour's 'Real_Timestampms' plus the absolute machine-time gap, which
    preserves the row's own time interval.

    'nearest_index' records the choice: 0 = row before (a tie also picks the
    row before), 1 = row after. The first row can only use the row after it;
    the last row can only use the row before it (previously this raised
    KeyError). A frame with a single row is left untouched.

    Reads the module-level ``df_result``.
    NOTE(review): neighbours are addressed as ``x.name - 1`` / ``x.name + 1``
    labels, so ``df_result`` presumably carries a default 0..n-1 index.

    Returns a two-element Series: [Real_Timestampms, nearest_index].
    """
    if pd.isnull(x['Real_Timestampms']):
        position = x.name
        has_previous = position > 0
        has_next = position < len(df_result) - 1
        current_ts = df_result.loc[position, 'Timestampms']

        if has_previous and has_next:
            gap_before = abs(df_result.loc[position - 1, 'Timestampms'] - current_ts)
            gap_after = abs(df_result.loc[position + 1, 'Timestampms'] - current_ts)
            # Strict comparison: on a tie the row before wins
            nearest_index = 1 if gap_after < gap_before else 0
        elif has_next:
            # First row: the closest is the next index
            nearest_index = 1
        elif has_previous:
            # Last row: there is no row after it, fall back to the one before
            nearest_index = 0
        else:
            # Single-row frame: nothing to map from
            nearest_index = None

        if nearest_index is not None:
            neighbour = position + 1 if nearest_index == 1 else position - 1
            machine_gap = abs(current_ts - df_result.loc[neighbour, 'Timestampms'])
            x['nearest_index'] = nearest_index
            x['Real_Timestampms'] = machine_gap + df_result.loc[neighbour, 'Real_Timestampms']

    return pd.Series([x['Real_Timestampms'], x['nearest_index']])

def indicate_switch(x):
    """Return the nearest_index marker for one row of ``df_result``.

    When the previous row is marked 1 (it must be swapped with the row after
    it, i.e. this row), this row is marked 2 so that it is swapped back the
    other way. Otherwise the row's own marker is returned unchanged.
    """
    previous_takes_next = x.name > 0 and df_result.loc[x.name - 1, 'nearest_index'] == 1
    if previous_takes_next:
        x['nearest_index'] = 2

    return x['nearest_index']

def switch_row_position(x):
    """Return the row of ``df_result`` that should occupy this row's position.

    nearest_index 1 -> the row after, 2 -> the row before, anything else ->
    the row itself (order unchanged).
    """
    marker = x['nearest_index']
    if marker == 1:
        # nearest is the index after: take the following row
        shift = 1
    elif marker == 2:
        # counterpart of a row marked 1: take the preceding row
        shift = -1
    else:
        # nearest is the index before: keep the order
        shift = 0
    return df_result.iloc[x.name + shift]

In [4]:
# Start the wall-clock timer; it is read back when "Time Taken" is printed
# after the final merge.
starttime = timeit.default_timer()

# Position, within each source directory listing, of the log file to process
index_file = 0
# Reference export that the unit-test cells load and compare against
sample_output_filename = './OMAP_Train_20_Car_39_20200116_0600_to_20200116_0700.csv'
#sample_output_filename = './OMAP_Train_20_Car_39_20200116_0700_to_20200116_0800.csv'

In [5]:
# Read files from ATO Directory
path = './T20 OMAP DATA/Train 20 CSV/Car 39/200116/OMAP_ATO/'
# os.listdir() returns names in arbitrary order and sorted() returns a NEW
# list, so the sorted result has to be kept; otherwise index_file may select
# a different file depending on the platform / file system.
ATO_file_list = sorted(os.listdir(path))
print(ATO_file_list)
# Read & Clean the selected ATO .txt (tab separated)
df = pd.read_csv(path + ATO_file_list[index_file], sep="\t")
df = clean_dataframe(df)

# Preprocess Machine Time
df = preprocess_machine_time(df)
# Preprocess Real Time
df = preprocess_real_time(df)

# Show any rows whose real time still goes backwards after preprocessing
print(df.loc[df['Realtime'].diff() < pd.to_timedelta('0 seconds')].index)
# Drop Date, Timestamp & Real Time Timestamp Columns
df.drop(['Date', 'Timestamp', 'Realtime','Real_Timestamp'], inplace=True, axis=1)

# Add Prefix to Columns name (ATO_***)
df = df.add_prefix('ATO_')
# The two time columns are used as merge keys later on, so they keep their
# unprefixed names.
df.rename(columns = {"ATO_Timestampms": "Timestampms", "ATO_Real_Timestampms": "Real_Timestampms"}, inplace = True)
# Shift Timestampms and Real_Timestampms to the first two columns
df = df[ ['Timestampms', 'Real_Timestampms'] + [ col for col in df.columns if col not in ('Timestampms', 'Real_Timestampms') ] ]
#df.head(20)
# Note that there are 280 columns belonging to ATO

['200116_05_00_00_333_OMAP_ATO.txt', '200116_06_00_00_333_OMAP_ATO.txt', '200116_07_00_00_333_OMAP_ATO.txt']
Number of null value entry =  0
Found non-monotonic sequence at index:  Int64Index([26005, 34767], dtype='int64')
25991 2020-01-16 06:29:07
25992 2020-01-16 06:29:07
25993 2020-01-16 06:29:07
25994 2020-01-16 06:29:07
25995 2020-01-16 06:29:07
25996 2020-01-16 06:29:07
25997 2020-01-16 06:29:07
25998 2020-01-16 06:29:07
25999 2020-01-16 06:29:07
26000 2020-01-16 06:29:07
26001 2020-01-16 06:29:07
26002 2020-01-16 06:29:07
26003 2020-01-16 06:29:07
26004 2020-01-16 06:29:07
26005 2020-01-16 06:29:07
26006 2020-01-16 06:29:07
26007 2020-01-16 06:29:07
26008 2020-01-16 06:29:08
26009 2020-01-16 06:29:08
26010 2020-01-16 06:29:08
26011 2020-01-16 06:29:08
26012 2020-01-16 06:29:08
26013 2020-01-16 06:29:08
26014 2020-01-16 06:29:08
26015 2020-01-16 06:29:08
26016 2020-01-16 06:29:08
26017 2020-01-16 06:29:08
26018 2020-01-16 06:29:08
26019 2020-01-16 06:29:08
26020 2020-01-16 06:29:

In [6]:
# Read files from TDMS Directory
path = './T20 OMAP DATA/Train 20 CSV/Car 39/200116/OMAP_TDMS/'
# os.listdir() returns names in arbitrary order and sorted() returns a NEW
# list, so the sorted result has to be kept; otherwise index_file may select
# a different file depending on the platform / file system.
TDMS_file_list = sorted(os.listdir(path))
print(TDMS_file_list)

# Read & Clean the selected TDMS .txt (tab separated)
df_TDMS = pd.read_csv(path + TDMS_file_list[index_file], sep="\t")
df_TDMS = clean_dataframe(df_TDMS)

# Preprocess Machine Time
df_TDMS = preprocess_machine_time(df_TDMS)

# Add Prefix to Columns name (TDMS_***)
df_TDMS = df_TDMS.add_prefix('TDMS_')
# Timestampms is the merge key, so it keeps its unprefixed name
df_TDMS.rename(columns = {"TDMS_Timestampms": "Timestampms"},  inplace = True)

#df_TDMS.head(35)

['200116_05_00_00_333_OMAP_TDMS.txt', '200116_06_00_00_333_OMAP_TDMS.txt', '200116_07_00_00_333_OMAP_TDMS.txt']
Number of null value entry =  0


In [7]:
# Merge TDMS Dataframe to result Dataframe with same Timestampms
# (ordered outer merge: rows found in only one source are kept)
# NOTE: the three row-wise helpers below read the module-level name
# `df_result`, so this variable must keep that name while they run.
df_result = pd.merge_ordered(df, df_TDMS, how='outer', on='Timestampms')
#Output to CSV
#df_result.to_csv('./result_4.csv', index=False, header=True)

# Nearest_index to indicate switch condition 0: Remain, 1:Take index after, 2: Take index before
# (initialised to an empty string for every row)
df_result['nearest_index'] = ""

# Compute TDMS real time and map it onto ATO while maintaining the TDMS's time interval
df_result[['Real_Timestampms', 'nearest_index']] = df_result.progress_apply(calculate_ATO_real_timestampms, axis=1)
# Indicate which row needs to be switched
df_result['nearest_index'] = df_result.progress_apply(indicate_switch, axis=1)
# Switch row position 
df_result = df_result.progress_apply(switch_row_position, axis=1)
# Sort Real Time
#df_result['Timestampms'] = df_result['Timestampms'].sort_values().values
df_result = df_result.sort_values(by='Real_Timestampms',ascending=True).reset_index(drop=True)
# Drop the helper columns that are not part of the final output
df_result = df_result.drop(['Timestampms', 'TDMS_Date', 'TDMS_Timestamp', 'nearest_index'], axis=1)
# Keep an independent copy under its own name, then release the working name
df_result_ATO_TDMS = df_result.copy(deep=True)
del df_result

#Output to CSV
#df_result_ATO_TDMS.to_csv('./result_5.csv', index=False, header=True)

100%|██████████████████████████████████████████████████████████████████████████| 44292/44292 [00:09<00:00, 4636.83it/s]
100%|█████████████████████████████████████████████████████████████████████████| 44292/44292 [00:01<00:00, 28460.98it/s]
100%|██████████████████████████████████████████████████████████████████████████| 44292/44292 [00:26<00:00, 1690.68it/s]


Time Taken: 40.2465418


In [8]:
# Unit Test for ATO to TDMS: compare our ATO+TDMS result with the reference export
df_output = pd.read_csv(sample_output_filename)

# Reference rows that carry neither ATO nor TDMS data cannot appear in our result
rows_without_ato_or_tdms = df_output['ATO_0101__General'].isnull() & df_output['TDMS_002_General_Data'].isnull()
df_output_ATO_TDMS = df_output.drop(df_output[rows_without_ato_or_tdms].index)
# Remove the ATP_* and then the COM_* columns from the reference
for unwanted_prefix in ('ATP', 'COM'):
    columns_to_keep = ~df_output_ATO_TDMS.columns.str.startswith(unwanted_prefix)
    df_output_ATO_TDMS = df_output_ATO_TDMS.loc[:, columns_to_keep]
df_output_ATO_TDMS['epoch'] = df_output_ATO_TDMS['epoch'].values.astype(np.float64)
df_output_ATO_TDMS = df_output_ATO_TDMS.reset_index(drop=True)
print(df_output_ATO_TDMS.shape)

# Round-trip our result through CSV so both frames went through the same parser
df_result_ATO_TDMS.to_csv('./ATO_TDMS_Test.csv', index=False, header=True)
df_drop_test = pd.read_csv('./ATO_TDMS_Test.csv')
# Assert whether sample output and self processed timestamps are equal
assert_equal = nan_equal(df_drop_test['Real_Timestampms'].values, df_output_ATO_TDMS['epoch'].values)

# Align the column labels positionally, then compare the full frames
df_drop_test.columns = df_output_ATO_TDMS.columns
print(np.testing.assert_allclose(df_drop_test.values, df_output_ATO_TDMS.values, rtol=1e-10, atol=0))
print(pd.testing.assert_frame_equal(df_drop_test, df_output_ATO_TDMS, check_dtype=False))
print(df_drop_test.compare(df_output_ATO_TDMS, align_axis=0))

print("Equality Between Sample Output and Self Processed: ", assert_equal)
#print(np.testing.assert_equal(df_drop_test.values, df_output_ATO_TDMS.values))

(44292, 438)
None
None
             ATO_1220_Energy_delta  ATO_2008_ATP_Energy_delta
29000 self                 223.434                    223.434
      other                223.434                    223.434
29001 self                 223.434                    223.434
      other                223.434                    223.434
29002 self                 223.434                    223.434
...                            ...                        ...
44282 other                677.366                    677.366
44283 self                 677.366                    677.366
      other                677.366                    677.366
44284 self                 677.366                    677.366
      other                677.366                    677.366

[1550 rows x 2 columns]
Equality Between Sample Output and Self Processed:  True


In [9]:
# Read files from ATP Directory
path = './T20 OMAP DATA/Train 20 CSV/Car 39/200116/OMAP_ATP/'
# os.listdir() returns names in arbitrary order and sorted() returns a NEW
# list, so the sorted result has to be kept; otherwise index_file may select
# a different file depending on the platform / file system.
ATP_file_list = sorted(os.listdir(path))
print(ATP_file_list)

# Read & Clean the selected ATP .txt (tab separated)
df_ATP = pd.read_csv(path + ATP_file_list[index_file], sep="\t")
df_ATP = clean_dataframe(df_ATP)

# Preprocess Machine Time
df_ATP = preprocess_machine_time(df_ATP)
# Drop Date & Timestamp Columns
df_ATP.drop(['Date', 'Timestamp'], inplace=True, axis=1)

# Add Prefix to Columns name (ATP_***)
df_ATP = df_ATP.add_prefix('ATP_')
# Timestampms is the merge key, so it keeps its unprefixed name
df_ATP.rename(columns = {"ATP_Timestampms": "Timestampms"},  inplace = True)

['200116_05_00_00_333_OMAP_ATP.txt', '200116_06_00_00_333_OMAP_ATP.txt', '200116_07_00_00_333_OMAP_ATP.txt']
Number of null value entry =  0


In [10]:
# Merge ATP Dataframe to ATO Dataframe with same Timestampms
# (ordered outer merge: rows found in only one source are kept)
# NOTE: the three row-wise helpers below read the module-level name
# `df_result`, so this variable must keep that name while they run.
df_result = pd.merge_ordered(df, df_ATP, how='outer', on='Timestampms')
# Switch marker, initialised to an empty string for every row
# (0: remain, 1: take index after, 2: take index before)
df_result['nearest_index'] = ""
# Compute ATP real time and map it onto ATO while maintaining the ATP's time interval
df_result[['Real_Timestampms', 'nearest_index']] = df_result.progress_apply(calculate_ATO_real_timestampms, axis=1)
# Indicate which row needs to be switched
df_result['nearest_index'] = df_result.progress_apply(indicate_switch, axis=1)
# Switch row position 
df_result = df_result.progress_apply(switch_row_position, axis=1)
# Drop the Timestampms & nearest_index helper columns
df_result.drop(['Timestampms', 'nearest_index'],inplace=True, axis=1)

#print(df_result.columns)
# Keep an independent copy under its own name, then release the working name
df_result_ATO_ATP = df_result.copy(deep=True)
del df_result


100%|██████████████████████████████████████████████████████████████████████████| 41089/41089 [00:09<00:00, 4543.02it/s]
100%|█████████████████████████████████████████████████████████████████████████| 41089/41089 [00:01<00:00, 28212.20it/s]
100%|██████████████████████████████████████████████████████████████████████████| 41089/41089 [00:24<00:00, 1674.40it/s]


In [11]:
# Unit Test for ATO to ATP: compare our ATO+ATP result with the reference export
df_output = pd.read_csv(sample_output_filename)

# Reference rows that carry neither ATO nor ATP data cannot appear in our result
rows_without_ato_or_atp = df_output['ATO_0101__General'].isnull() & df_output['ATP_002_Loc_fault'].isnull()
df_output_ATO_ATP = df_output.drop(df_output[rows_without_ato_or_atp].index)
# Remove the TDMS_* and then the COM_* columns from the reference
for unwanted_prefix in ('TDMS', 'COM'):
    columns_to_keep = ~df_output_ATO_ATP.columns.str.startswith(unwanted_prefix)
    df_output_ATO_ATP = df_output_ATO_ATP.loc[:, columns_to_keep]
df_output_ATO_ATP['epoch'] = df_output_ATO_ATP['epoch'].values.astype(np.float64)
df_output_ATO_ATP = df_output_ATO_ATP.reset_index(drop=True)
print(df_output_ATO_ATP.shape)

# Round-trip our result through CSV so both frames went through the same parser
df_result_ATO_ATP.to_csv('./ATO_ATP_Test.csv', index=False, header=True)
df_drop_test = pd.read_csv('./ATO_ATP_Test.csv')
# The ATO+ATP result has not been ordered by real time yet; do it here
df_drop_test = df_drop_test.sort_values(by='Real_Timestampms', ascending=True)
df_drop_test = df_drop_test.reset_index(drop=True)
# Assert whether sample output and self processed timestamps are equal
assert_equal = nan_equal(df_drop_test['Real_Timestampms'].values, df_output_ATO_ATP['epoch'].values)

# Align the column labels positionally, then compare the full frames
df_drop_test.columns = df_output_ATO_ATP.columns
print(np.testing.assert_allclose(df_drop_test.values, df_output_ATO_ATP.values, rtol=1e-10, atol=0))
print(pd.testing.assert_frame_equal(df_drop_test, df_output_ATO_ATP, check_dtype=False))
print(df_drop_test.compare(df_output_ATO_ATP, align_axis=0))

print("Equality Between Sample Output and Self Processed: ", assert_equal)

(41089, 546)
None
None
             ATO_1220_Energy_delta  ATO_2008_ATP_Energy_delta
26363 self                 223.434                    223.434
      other                223.434                    223.434
26364 self                 223.434                    223.434
      other                223.434                    223.434
26365 self                 223.434                    223.434
...                            ...                        ...
41077 other                677.366                    677.366
41078 self                 677.366                    677.366
      other                677.366                    677.366
41080 self                 677.366                    677.366
      other                677.366                    677.366

[1550 rows x 2 columns]
Equality Between Sample Output and Self Processed:  True


In [12]:
# Read files from COM Directory
path = './T20 OMAP DATA/Train 20 CSV/Car 39/200116/OMAP_COM/'
# os.listdir() returns names in arbitrary order and sorted() returns a NEW
# list, so the sorted result has to be kept; otherwise index_file may select
# a different file depending on the platform / file system.
COM_file_list = sorted(os.listdir(path))
print(COM_file_list)

# Read & Clean the selected COM .txt (tab separated)
df_COM = pd.read_csv(path + COM_file_list[index_file], sep="\t")
df_COM = clean_dataframe(df_COM)

# Preprocess Machine Time
df_COM = preprocess_machine_time(df_COM)
# Drop Date & Timestamp Columns
df_COM.drop(['Date', 'Timestamp'], inplace=True, axis=1)

# Add Prefix to Columns name (COM_***)
df_COM = df_COM.add_prefix('COM_')
# Timestampms is the merge key, so it keeps its unprefixed name
df_COM.rename(columns = {"COM_Timestampms": "Timestampms"},  inplace = True)


['200116_05_00_00_333_OMAP_COM.txt', '200116_06_00_00_333_OMAP_COM.txt', '200116_07_00_00_333_OMAP_COM.txt']
Number of null value entry =  0


In [13]:
# Merge COM Dataframe to ATO Dataframe with same Timestampms
# (ordered outer merge: rows found in only one source are kept)
# NOTE: the three row-wise helpers below read the module-level name
# `df_result`, so this variable must keep that name while they run.
df_result = pd.merge_ordered(df, df_COM, how='outer', on='Timestampms')
# Switch marker, initialised to an empty string for every row
# (0: remain, 1: take index after, 2: take index before)
df_result['nearest_index'] = ""
# Compute COM real time and map it onto ATO while maintaining the COM's time interval
df_result[['Real_Timestampms', 'nearest_index']] = df_result.progress_apply(calculate_ATO_real_timestampms, axis=1)
# Indicate which row needs to be switched
df_result['nearest_index'] = df_result.progress_apply(indicate_switch, axis=1)
# Switch row position 
df_result = df_result.progress_apply(switch_row_position, axis=1)
# Drop the Timestampms & nearest_index helper columns
df_result.drop(['Timestampms', 'nearest_index'],inplace=True, axis=1)
# Keep an independent copy under its own name
df_result_ATO_COM = df_result.copy(deep=True)
# del df_result

100%|██████████████████████████████████████████████████████████████████████████| 41086/41086 [00:09<00:00, 4483.33it/s]
100%|█████████████████████████████████████████████████████████████████████████| 41086/41086 [00:01<00:00, 35433.24it/s]
100%|██████████████████████████████████████████████████████████████████████████| 41086/41086 [00:17<00:00, 2406.38it/s]


In [14]:
# Unit Test for ATO to COM: compare our ATO+COM result with the reference export
df_output = pd.read_csv(sample_output_filename)

# Reference rows that carry neither ATO nor COM data cannot appear in our result
rows_without_ato_or_com = df_output['ATO_0101__General'].isnull() & df_output['COM_002_SAFE_INPUTS'].isnull()
df_output_ATO_COM = df_output.drop(df_output[rows_without_ato_or_com].index)
# Remove the TDMS_* and then the ATP_* columns from the reference
for unwanted_prefix in ('TDMS', 'ATP'):
    columns_to_keep = ~df_output_ATO_COM.columns.str.startswith(unwanted_prefix)
    df_output_ATO_COM = df_output_ATO_COM.loc[:, columns_to_keep]
df_output_ATO_COM['epoch'] = df_output_ATO_COM['epoch'].values.astype(np.float64)
df_output_ATO_COM = df_output_ATO_COM.reset_index(drop=True)
print(df_output_ATO_COM.shape)

# Round-trip our result through CSV so both frames went through the same parser
df_result_ATO_COM.to_csv('./ATO_COM_Test.csv', index=False, header=True)
df_drop_test = pd.read_csv('./ATO_COM_Test.csv')
# The ATO+COM result has not been ordered by real time yet; do it here
df_drop_test = df_drop_test.sort_values(by='Real_Timestampms', ascending=True)
df_drop_test = df_drop_test.reset_index(drop=True)
# Assert whether sample output and self processed timestamps are equal
assert_equal = nan_equal(df_drop_test['Real_Timestampms'].values, df_output_ATO_COM['epoch'].values)

# Align the column labels positionally, then compare the full frames
df_drop_test.columns = df_output_ATO_COM.columns
print(np.testing.assert_allclose(df_drop_test.values, df_output_ATO_COM.values, rtol=1e-10, atol=0))
print(pd.testing.assert_frame_equal(df_drop_test, df_output_ATO_COM, check_dtype=False))
print(df_drop_test.compare(df_output_ATO_COM, align_axis=0))

print("Equality Between Sample Output and Self Processed: ", assert_equal)

(41086, 327)
None
None
             ATO_1220_Energy_delta  ATO_2008_ATP_Energy_delta
26327 self                 223.434                    223.434
      other                223.434                    223.434
26328 self                 223.434                    223.434
      other                223.434                    223.434
26329 self                 223.434                    223.434
...                            ...                        ...
41074 other                677.366                    677.366
41075 self                 677.366                    677.366
      other                677.366                    677.366
41077 self                 677.366                    677.366
      other                677.366                    677.366

[1550 rows x 2 columns]
Equality Between Sample Output and Self Processed:  True


In [15]:
# Combine the three pairwise results (ATO+ATP, ATO+COM, ATO+TDMS) on Real_Timestampms.
# The ATO_* columns are taken from the ATO+ATP frame, so they are removed
# from the ATO+COM frame before merging.
non_ato_columns_in_com = ~df_result_ATO_COM.columns.str.startswith('ATO')
df_result_drop_ATO_COM = df_result_ATO_COM.loc[:, non_ato_columns_in_com]

print(df_result_ATO_ATP.shape, df_result_drop_ATO_COM.shape)
# Merge COM onto result ATO_ATP
df_temp_result = pd.merge_ordered(
    df_result_ATO_ATP,
    df_result_drop_ATO_COM,
    how='outer',
    on='Real_Timestampms',
)

# Likewise remove the ATO_* columns from the ATO+TDMS frame
non_ato_columns_in_tdms = ~df_result_ATO_TDMS.columns.str.startswith('ATO')
df_result_drop_ATO_TDMS = df_result_ATO_TDMS.loc[:, non_ato_columns_in_tdms]
# Merge TDMS onto the result dataframe
df_temp_result = pd.merge_ordered(
    df_temp_result,
    df_result_drop_ATO_TDMS,
    how='outer',
    on='Real_Timestampms',
)

df_temp_result.to_csv('./temp_result.csv', index=False, header=True)
print("Time Taken:", timeit.default_timer() - starttime)

(41089, 546) (41086, 48)


In [16]:
# Unit Test for End Result: compare the fully merged frame with the reference export
df_output = pd.read_csv(sample_output_filename)
print(df_output.shape)

assert_equal = nan_equal(df_temp_result['ATO_* General'].values, df_output['ATO_0101__General'].values)
print("Equality Between Sample Output and Self Processed: ", assert_equal)

# Spot-check one column per source: (our column name, reference column name)
spot_check_columns = [
    ('Real_Timestampms', 'epoch'),
    ('ATO_* General', 'ATO_0101__General'),
    ('ATP_Loc fault', 'ATP_002_Loc_fault'),
    ('COM_SAFE INPUTS', 'COM_002_SAFE_INPUTS'),
    ('TDMS_General Data', 'TDMS_002_General_Data'),
    ('TDMS_Sec', 'TDMS_016_Sec'),
]
for result_column, reference_column in spot_check_columns:
    print(np.testing.assert_equal(df_temp_result[result_column].values, df_output[reference_column].values))

# Energy delta has some error (shown in the powerpoint slides), so
# 'ATO_Energy delta' is not part of the exact spot checks above.

# Compare the complete frames after aligning the column labels positionally
df_temp_result_2 = df_temp_result.copy(deep=True)
df_temp_result_2.columns = df_output.columns
print(pd.testing.assert_frame_equal(df_temp_result_2, df_output, check_dtype=False, check_column_type=False))
df_temp_result_2.compare(df_output, align_axis=0)

(50886, 751)
Equality Between Sample Output and Self Processed:  True
None
None
None
None
None
None
None


Unnamed: 0,Unnamed: 1,ATO_1220_Energy_delta,ATO_2008_ATP_Energy_delta
32927,self,223.434,223.434
32927,other,223.434,223.434
32928,self,223.434,223.434
32928,other,223.434,223.434
32929,self,223.434,223.434
...,...,...,...
50874,other,677.366,677.366
50875,self,677.366,677.366
50875,other,677.366,677.366
50877,self,677.366,677.366


In [17]:
# # Unit Test for ATP & COM
# # Import Output File
# df_output = pd.read_csv('./OMAP_Train_20_Car_39_20200116_0600_to_20200116_0700.csv')

# df_output_ATP_COM = df_output.drop(df_output[(df_output['ATP_002_Loc_fault'].isnull()) & (df_output['COM_002_SAFE_INPUTS'].isnull())].index)
# # Drop column with prefix ATO and TDMS
# df_output_ATP_COM = df_output_ATP_COM.loc[:, ~df_output_ATP_COM.columns.str.startswith('ATO')]
# df_output_ATP_COM = df_output_ATP_COM.loc[:, ~df_output_ATP_COM.columns.str.startswith('TDMS')]
# df_output_ATP_COM = df_output_ATP_COM.reset_index(drop=True)
# # Drop epoch as we're not comparing the timestamp
# df_output_ATP_COM = df_output_ATP_COM.drop(['epoch'], axis=1)
# df_output_ATP_COM = df_output_ATP_COM.reset_index(drop=True)
# print(df_output_ATP_COM.shape)


# df_result_ATP_COM_test = pd.read_csv('./ATP_COM_Test.csv')
# #df_result_ATP_COM_test = df_result_ATP_COM_test.drop(['Timestampms'], axis=1)
# print(df_result_ATP_COM_test.shape)


# df_result_ATP_COM_test['result_3'] = df_result_ATP_COM_test['ATP_Invariants Elapsed'].fillna('-').eq(df_output_ATP_COM['ATP_043_Invariants_Elapsed'].fillna('-'))
# print(df_result_ATP_COM_test['result_3'].value_counts())
# df_result_ATP_COM_test.to_csv('./ATP_COM_Test_1.csv', index=False, header=True)


# assert_equal = nan_equal(df_result_ATP_COM_test['ATP_Invariants Elapsed'].values, df_output_ATP_COM['ATP_043_Invariants_Elapsed'].values)
# print("Equality Between Sample Output and Self Processed: ", assert_equal)
# #print(np.testing.assert_equal(df_result_ATP_COM_test['ATP_Invariants Elapsed'].values, df_output_ATP_COM['ATP_043_Invariants_Elapsed'].values))

# # ***** Can't do a unit test here as the swap of real time hasn't occurred.

# # df_result_ATP_COM_test.to_csv('./ATP_COM_Test_1.csv', index=False, header=True)

In [18]:
# def check_for_monotonic (df):
#     """ Return a dataframe
    
#     Detect and clean out-of-sequence real time
    
#     Edge cases might not be covered(e.g. 10 10 10 11 11 11 10 10 10, 11 11 11 10 10 10 11 11 11)
#     """
#     if not df['Realtime'].is_monotonic_increasing:
#         # Get index of non-monotonic location - non increasing order
#         df_non_monotonic = df.loc[df['Realtime'].diff() < pd.to_timedelta('0 seconds')]
        
#         # *** Need to determine if for loop forward or backwards (changing to which real time index) - Not Implemented
        
#         print("Found non-monotonic sequence at index: ", (df.loc[df['Realtime'].diff() < pd.to_timedelta('0 seconds')].index))
#         for index in df_non_monotonic.index:
#             non_monotonic_realtime = df['Realtime'][index-1]
#             counter = 0
#             # Loop backwards to find how many non_monotonic_realtime count 
#             while(True and (index-1-counter >0)):
#                 if non_monotonic_realtime == df['Realtime'][index-1-counter]:
#                     counter += 1
#                 else:
#                     break
#             #print(counter)
            
#             # Find start location of index with the same real time (index)
#             start_index = min(df[df['Realtime'] == df['Realtime'][index]].index)
#             total_no_interval = len(df[df['Realtime'] == df['Realtime'][index]].index) - counter
            
#             # Next time index information
#             start_index_next_realtime = start_index + total_no_interval
#             total_no_interval_next_realtime = len(df[df['Realtime'] == df['Realtime'][index-1]]) + counter
#             next_realtime = df['Realtime'][index-1]
            
#             # Update Real Time
#             for count in range (0,counter):
#                 df.loc[index-1-count,'Realtime'] = df['Realtime'][index]
#                 #print(index-1-count, df.loc[index-1-count,'Realtime'])
                                
#             # Update Real time for the next time index
#             for count in range (0,total_no_interval_next_realtime):
#                 df.loc[start_index_next_realtime+count,'Realtime'] = next_realtime
#                 #print(start_index_next_realtime+count, df.loc[start_index_next_realtime+count,'Realtime'])
#     return df