### Testing: parsing some KPIs from the log file.

In [56]:
import pandas as pd
import re
import datetime

def extract_box_name(line):
    """Return the first ``LETTERS_DIGITS`` token found in *line*, or None.

    *line* is passed through ``str()`` first, so non-string values are
    accepted. The token is one or more uppercase ASCII letters, an
    underscore, then one or more digits (for example ``ABC_12``).
    """
    found = re.search(r'([A-Z]+_\d+)', str(line))
    if found is None:
        return None
    return found.group(1)

def extract_line(line):
    """Return the text following the first ``LETTERS_DIGITS_`` prefix, or None.

    *line* is passed through ``str()`` first. The returned value is
    everything after the underscore that follows the box token, up to the
    end of that line of text; None is returned when there is no such suffix.
    """
    found = re.search(r'([A-Z]+_\d+)_(.+)', str(line))
    if found is None:
        return None
    return found.group(2)

# Load the 'Survey' sheet of the workbook, skipping its first two rows.
df = pd.read_excel('Onlinelog.xlsm', sheet_name='Survey', skiprows=2)

# Keep only the part of the time text before the first '.', then join it
# with the date to form a single timestamp column.
time_text = df['Time'].astype(str)
df['Time'] = time_text.str.split('.').str[0]
date_text = df['Date'].dt.strftime('%Y-%m-%d')
df['Date_Time'] = pd.to_datetime(date_text + ' ' + df['Time'])

# Split "Run Line/Target" into its box token and the remainder (the line).
targets = df['Run Line/Target']
df['box_name'] = targets.apply(extract_box_name)
df['line'] = targets.apply(extract_line)

# Only log on / log off events take part in the duration computation.
df = df[df['Event'].isin(['Log on', 'Log off', 'Log on DNP'])]

# Chronological order inside every (box, line) pair.
df = df.sort_values(['box_name', 'line', 'Date_Time'])

# One summary dict per (box, line) pair with a positive logged duration.
rows = []

# groupby skips rows whose box_name or line is missing; sort=False keeps the
# order established by sort_values above.
for (box_name, line), line_df in df.groupby(['box_name', 'line'], sort=False):
    total_duration_seconds = 0
    start_time = None
    multiple_runs = 0

    for event, stamp in zip(line_df['Event'], line_df['Date_Time']):
        if event in ['Log on', 'Log on DNP']:
            # A new log on (re)starts the clock and counts as a run.
            start_time = stamp
            multiple_runs += 1
        elif event == 'Log off' and start_time is not None:
            # Close the open run; a log off without a log on is ignored.
            total_duration_seconds += int((stamp - start_time).total_seconds())
            start_time = None

    if total_duration_seconds <= 0:
        continue

    first_event = line_df.iloc[0]
    last_event = line_df.iloc[-1]
    rows.append({
        'box_name': box_name,
        'line': line,
        'operator': first_event['Op.'],
        'start_time': first_event['Date_Time'].strftime('%Y-%m-%d %H:%M:%S'),
        'end_time': last_event['Date_Time'].strftime('%Y-%m-%d %H:%M:%S'),
        'duration': str(datetime.timedelta(seconds=total_duration_seconds)),
        'multiple_runs': f"Multiple runs: {multiple_runs}" if multiple_runs > 1 else "",
    })

# Collect the summaries into a table and write it out.
result = pd.DataFrame(rows)
result.to_excel('output.xlsx', index=False)


## Adding Directional Column

In [86]:
import pandas as pd
import re
import datetime
from geopy.distance import geodesic

def extract_box_name(line):
    """Return the first box token (``[A-Z]+_<digits>``) in *line*, else None.

    The argument is stringified before matching, so non-string values
    are accepted.
    """
    hit = re.search(r'([A-Z]+_\d+)', str(line))
    return None if hit is None else hit.group(1)

def extract_line(line):
    """Return what follows the first ``[A-Z]+_<digits>_`` prefix, else None.

    The argument is stringified before matching. None is returned when the
    box token is absent or is not followed by an underscore and at least
    one more character.
    """
    hit = re.search(r'([A-Z]+_\d+)_(.+)', str(line))
    return None if hit is None else hit.group(2)

def parse_coords(coord_str):
    """Convert a degrees/minutes/seconds coordinate string to decimal degrees.

    The first three numbers found in *coord_str* are taken as degrees,
    minutes and seconds, whatever separates them (e.g. ``'56;02;11.56N'``).
    The result is negative when the string carries an ``S`` or ``W``
    hemisphere letter (either case) or starts with a minus sign.

    Args:
        coord_str: Coordinate text, or a missing value (None / NaN).

    Returns:
        The coordinate as a float in decimal degrees, or None when
        *coord_str* is missing.

    Raises:
        ValueError: If fewer than three numeric fields are present.
    """
    if pd.isnull(coord_str):
        return None

    text = str(coord_str).strip()

    numbers = re.findall(r'\d+(?:\.\d*)?|\.\d+', text)
    if len(numbers) < 3:
        raise ValueError(f"expected degrees, minutes and seconds in {coord_str!r}")
    deg, minutes, seconds = (float(value) for value in numbers[:3])
    decimal_deg = deg + minutes / 60 + seconds / (60 * 60)

    # The hemisphere letter must be searched for explicitly: splitting the
    # text on non-numeric characters consumes the letter as a separator, so
    # southern/western coordinates would never be negated.
    hemisphere = re.search(r'[NSEW]', text.upper())
    if text.startswith('-') or (hemisphere and hemisphere.group(0) in ('S', 'W')):
        decimal_deg *= -1
    return decimal_deg

def calculate_distance(start_coords, end_coords):
    """Return the geopy geodesic distance between two points, in meters.

    Both arguments are handed to ``geodesic`` unchanged; callers in this
    cell pass ``(lat, long)`` tuples in decimal degrees.
    """
    separation = geodesic(start_coords, end_coords)
    return separation.meters

# Load the 'Survey' sheet of the workbook, skipping its first two rows.
df = pd.read_excel('Onlinelog.xlsm', sheet_name='Survey', skiprows=2)

# Keep only the part of the time text before the first '.', then join it
# with the date to form a single timestamp column.
df['Time'] = df['Time'].astype(str).str.split('.').str[0]
df['Date_Time'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Time'])

# Split "Run Line/Target" into its box token and the remainder (the line).
df['box_name'] = df['Run Line/Target'].apply(extract_box_name)
df['line'] = df['Run Line/Target'].apply(extract_line)

# Keep events whose text starts with "log on"/"log off" (case-insensitive).
# na=False treats rows with a missing Event as non-matching; without it the
# mask contains NaN and boolean indexing raises.
df = df[df['Event'].str.lower().str.startswith(('log on', 'log off'), na=False)]

# Chronological order inside every (box, line) pair.
df = df.sort_values(['box_name', 'line', 'Date_Time'])

# One summary dict per (box, line) pair with a positive logged duration.
rows = []

# groupby skips rows whose box_name or line is missing; sort=False keeps the
# order established by sort_values above.
for (box_name, line), line_df in df.groupby(['box_name', 'line'], sort=False):
    total_duration_seconds = 0
    total_distance = 0
    start_time = None
    start_coords = None
    multiple_runs = 0

    for _, row in line_df.iterrows():
        event_lower = row['Event'].lower()
        if event_lower.startswith('log on'):
            start_time = row['Date_Time']
            lat = parse_coords(row['SE Lat'])
            lon = parse_coords(row['SE Long'])
            # Reset to None when this log on has no position, so a stale
            # position from an earlier unclosed log on is never reused.
            if lat is not None and lon is not None:
                start_coords = (lat, lon)
            else:
                start_coords = None
            multiple_runs += 1
        elif event_lower.startswith('log off') and start_time is not None:
            # Duration only needs the two timestamps; it must not be lost
            # just because a position is missing.
            total_duration_seconds += int((row['Date_Time'] - start_time).total_seconds())
            start_time = None

            # Distance is added only when both ends of the run have a position.
            if start_coords is not None:
                lat = parse_coords(row['SE Lat'])
                lon = parse_coords(row['SE Long'])
                if lat is not None and lon is not None:
                    total_distance += calculate_distance(start_coords, (lat, lon))
            start_coords = None

    if total_duration_seconds > 0:
        duration = str(datetime.timedelta(seconds=total_duration_seconds))
        # calculate_distance returns meters; report kilometres.
        distance = f"{total_distance / 1000:.2f} km"
        rows.append({
            'box_name': box_name,
            'line': line,
            'operator': line_df.iloc[0]['Op.'],
            'from': line_df['Date_Time'].min().strftime("%Y-%m-%d %H:%M:%S"),
            'to': line_df['Date_Time'].max().strftime("%Y-%m-%d %H:%M:%S"),
            'duration': duration,
            'distance': distance,
            'runs': multiple_runs,
        })

# Collect the summaries into a table and write it out.
df_output = pd.DataFrame(rows)
df_output.to_excel('output.xlsx', index=False)


In [66]:
import re
from geopy import distance

def dms_to_dd(dms):
    """Convert a ``'deg;min;secH'`` string to signed decimal degrees.

    The string is split on runs of ``;``. The third field may contain a
    hemisphere letter (N, S, E or W, checked in that order); the letter is
    stripped from both ends of that field, and S or W makes the result
    negative.
    """
    fields = re.split('[;]+', dms)

    hemisphere = ''
    for letter in ('N', 'S', 'E', 'W'):
        if letter in fields[2]:
            hemisphere = letter
            fields[2] = fields[2].strip(letter)
            break  # only the first matching letter is honoured

    degrees = float(fields[0])
    minutes = float(fields[1])
    seconds = float(fields[2])
    dd = degrees + minutes/60 + seconds/(60*60)
    return dd * -1 if hemisphere in ('S', 'W') else dd

# Convert DMS to DD and calculate distance
def calculate_distance(start_lat_dms, start_lon_dms, end_lat_dms, end_lon_dms):
    """Return the geopy distance in kilometres between two DMS positions.

    Each argument is a degrees/minutes/seconds string accepted by
    ``dms_to_dd``; the converted (lat, lon) pairs are passed to
    ``distance.distance``.
    """
    origin = (dms_to_dd(start_lat_dms), dms_to_dd(start_lon_dms))
    destination = (dms_to_dd(end_lat_dms), dms_to_dd(end_lon_dms))
    return distance.distance(origin, destination).km

# Sample start and end positions as 'deg;min;secH' strings.
start_lat_dms, start_lon_dms = '56;02;11.56374N', '7;44;53.21619E'
end_lat_dms, end_lon_dms = '56;02;17.06824N', '7;44;42.96696E'

# Distance between the two sample positions, printed in kilometres.
dist = calculate_distance(start_lat_dms, start_lon_dms, end_lat_dms, end_lon_dms)
print(dist)


0.24591916825679608
