In [25]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_aleco_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)
    
    date_regex = re.compile("DATE: .*\, 202.")
    location_regex = re.compile("SUBSTATION: .* REASON")
    location_regex2 = re.compile("SUBSTATION: .*  REASON")
    location_regex3 = re.compile("SUBSTATION: .* DATE")
    location_regex4 = re.compile("SUBSTATION: .*  DATE")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)
                if 'POWER INTERRUPTION' not in normalized: continue
                    
                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE:.* 202."), normalized, lambda x: x[5:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date:.* 202."), normalized, lambda x: x[5:])
                    
                    print(date_str)
                    date = datetime.strptime(date_str, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                    
                try:
                    location_str = try_regex(location_regex, normalized, lambda x: x[12:-6])
                    if not location_str:
                        location_str = try_regex(location_regex2, normalized, lambda x: x[12:-6])
                        
                    if not location_str:
                        location_str = try_regex(location_regex3, normalized, lambda x: x[12:-5])
                    
                    if not location_str:
                        location_str = try_regex(location_regex4, normalized, lambda x: x[12:-5])
                        
                    if not location_str:
                        location_str = try_regex("Substation: .*  DATE", normalized, lambda x: x[12:-5])
                    
                    location = location_str
                    
                    type = 'ALECO'
                    row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                    df = pd.concat([row_df, df], ignore_index=True)
                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")
                    print("---", location_str)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        
    return df
        
                

In [26]:
process_aleco_file('ALECO_public_5.txt')

MARCH 12, 2024
March 10, 2024
March 09, 2024
March 08, 2024
March 7, 2024
March 8, 2024
March 05, 2024
March 05, 2024
March 4, 2024
March 4, 2024
March 04, 2024
March 03, 2024
March 03, 2024
March 02, 2024
March 02, 2024
March 02, 2024
March 6, 2024
FEBRUARY 29, 2024
NGCP POWER INTERRUPTION  Power service interruption scheduled February 29, 2024 (0700H - 1900H) and March 2, 2024 (1630H - 1700H) will be shortened to February 29, 2024 (0830H - 0900H).  Details:  Substation: TABACO & MALINAO SUBSTATION Date: February 29, 2024
Error getting date from UPDATE: NGCP POWER INTERRUPTION  Power service interruption scheduled February 29, 2024 (0700H - 1900H) and March 2, 2024 (1630H - 1700H) will be shortened to February 29, 2024 (0830H - 0900H).  Details:  Substation: TABACO & MALINAO SUBSTATION Date: February 29, 2024 Time: 8:30 AM - 9:00 AM Reason: • Momentary shutdown of Ligao-Pawa 69 kV Line to isolate Cale ADS.: time data 'NGCP POWER INTERRUPTION  Power service interruption scheduled Febru

Unnamed: 0,date,location,type
0,2023-08-25,LIGAO FEEDER 2 AFFECTED AREAS: (Portion) Doña...,ALECO
1,2023-08-28,WASHINGTON FEEDER 3 (TAGUNTONG RECLOSER),ALECO
2,2023-08-30,BITANO FEEDER 2 AFFECTED AREAS: (12:00 MN -...,ALECO
3,2023-08-30,WASHINGTON FEEDER 2 TIME DURATION: 8:00 AM - 5...,ALECO
4,2023-08-31,WASHINGTON FEEDER 1 AFFECTED AREAS: (Portion) ...,ALECO
...,...,...,...
268,2024-03-07,SANTA MISERICORDIA FEEDER 1 & 2,ALECO
269,2024-03-08,TABACO FEEDER 3 (SAN JOSE RECLOSER),ALECO
270,2024-03-09,WASHINGTON FEEDER 4,ALECO
271,2024-03-10,LIGAO FEEDER 1,ALECO
