In [1]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_aleco_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)
    
    date_regex = re.compile("DATE: .*\, 202.")
    location_regex = re.compile("SUBSTATION: .* REASON")
    location_regex2 = re.compile("SUBSTATION: .*  REASON")
    location_regex3 = re.compile("SUBSTATION: .* DATE")
    location_regex4 = re.compile("SUBSTATION: .*  DATE")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)
                if 'POWER INTERRUPTION' not in normalized: continue
                    
                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE:.* 202."), normalized, lambda x: x[5:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date:.* 202."), normalized, lambda x: x[5:])
                    
                    print(date_str)
                    date = datetime.strptime(date_str, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                    
                try:
                    location_str = try_regex(location_regex, normalized, lambda x: x[12:-6])
                    if not location_str:
                        location_str = try_regex(location_regex2, normalized, lambda x: x[12:-6])
                        
                    if not location_str:
                        location_str = try_regex(location_regex3, normalized, lambda x: x[12:-5])
                    
                    if not location_str:
                        location_str = try_regex(location_regex4, normalized, lambda x: x[12:-5])
                        
                    if not location_str:
                        location_str = try_regex("Substation: .*  DATE", normalized, lambda x: x[12:-5])
                    
                    location = location_str
                    
                    type = 'ALECO'
                    row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                    df = pd.concat([row_df, df], ignore_index=True)
                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")
                    print("---", location_str)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        
    return df
        
                

In [2]:
res = process_aleco_file('ALECO_public_5.txt')
res.to_csv('data.csv', index=False, header=False, mode='a')

MARCH 12, 2024
March 10, 2024
March 09, 2024
March 08, 2024
March 7, 2024
March 8, 2024
Error getting location from NGCP POWER INTERRUPTION ​ Substation: WASHINGTON SUBSTATION, BITANO SUBSTATION AND STA. MISERICORDIA SUBSTATION Date: March 8, 2024 Time: 5:00 AM - 5:00 PM Reason: • To facilitate erection and installation of new steel poles SP-02 and SP-05 under SLSUP Stage 2 Project - Daraga SS.: 'str' object has no attribute 'findall'
--- False
March 05, 2024
March 05, 2024
March 4, 2024
March 4, 2024
March 04, 2024
March 03, 2024
March 03, 2024
March 02, 2024
March 02, 2024
March 02, 2024
March 6, 2024
Error getting location from NGCP POWER INTERRUPTION ​ Substation: WASHINGTON SUBSTATION, BITANO SUBSTATION AND STA. MISERICORDIA SUBSTATION Date: March 6, 2024 Time: 5:00 AM - 5:00 PM Reason: • To facilitate erection and installation of new steel poles SP-02 and SP-05 under SLSUP Stage 2 Project - Daraga SS.: 'str' object has no attribute 'findall'
--- False
FEBRUARY 29, 2024
NGCP POWER

NOVEMBER 19, 2023
NOVEMBER 20, 2023
NOVEMBER 19, 2023
NOVEMBER 20, 2023
NOVEMBER 19, 2023
NOVEMBER 19, 2023
NOVEMBER 18, 2023
NOVEMBER 18, 2023
NOVEMBER 18, 2023
NOVEMBER 17, 2023
NOVEMBER 16, 2023
November 18, 2023
November 27, 2023
Error getting location from NGCP POWER INTERRUPTION ​ Substation: Polangui- All Feeders Date: November 27, 2023 (MONDAY) Time: ​6:00 AM - 6:00 PM Reason: Preventive maintenance of Naga 69kV Bus 1; Replacement of Naga-Iriga 69kV line's phase A current transformer.: 'str' object has no attribute 'findall'
--- False
NOVEMBER 16, 2023
NOVEMBER 15, 2023
NOVEMBER 14,  2023
NOVEMBER 14, 2023
NOVEMBER 13, 2023
NOVEMBER 11, 2023
November 11, 2023
NOVEMBER 09, 2023
NOVEMBER 08, 2023
NOVEMBER 08, 2023
NOVEMBER 06, 2023
NOVEMBER 06, 2023
NOVEMBER 05, 2023
NOVEMBER 05, 2023
NOVEMBER 05, 2023
NOVEMBER 04, 2023
NOVEMBER 04, 2023
NOVEMBER 02, 2023
November 02, 2023
NOVEMBER 1, 2023
NOVEMBER 1, 2023
NOVEMBER 1, 2023
November 4, 2023
October 31, 2023
October 30, 2023
Octobe