In [47]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_oedc_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)

    date_regex = re.compile("Date: .*\.? (?:Estimated )?Time")
    date_regex2 = re.compile("Date: .*\.? (?:Estimated )?time")
    location_regex = re.compile("Affected areas:.* Reason")
    location_regex2 = re.compile("Affected Areas:.* Reason")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)

                if 'NOTICE OF UNSCHEDULED/EMERGENCY POWER INTERRUPTION TO CUSTOMERS' not in normalized:
                    continue
                
                print(normalized)
                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x.split(':', maxsplit=1)[1].strip())
                    if not date_str:
                        date_str = try_regex(date_regex2, normalized, lambda x: x.split(':', maxsplit=1)[1].strip())
                    date_str = " ".join(date_str.split(" ")[:3])
                    if date_str[-1] == '.':
                        date_str = date_str[:-1]
                    
                    date_str = date_str.replace("Febuary", "February")
                    try:
                        date = datetime.strptime(date_str, "%d %B %Y")
                    except Exception as e:
                        date = datetime.strptime(date_str, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                
                try:
                    location_str = try_regex(location_regex, normalized, lambda x: x)
                    if not location_str:
                        location_str = try_regex(location_regex2, normalized, lambda x: x)
                    if not location_str:
                        location_str = try_regex(re.compile("Affected areas: .* Line crew"), normalized, lambda x: x)
                    location_str = location_str.split(':', maxsplit=1)[1]
                    
                    if ',' in location_str:
                        locations = location_str.split(',')
                    elif '•' in location_str:
                        locations = location_str.split('•')
                    elif '-' in location_str:
                        locations = location_str.split('-')
                    
                    type = 'OEDC'

                    for location in locations:
                        if not location.strip():
                            continue
                        row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                        df = pd.concat([row_df, df], ignore_index=True)
                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")
                    print("---", location_str)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
    return df

In [48]:
process_oedc_file('OEDCOfficial.txt')

NOTICE OF UNSCHEDULED/EMERGENCY POWER INTERRUPTION TO CUSTOMERS:  Please be informed of our unscheduled power interruption today, Date: April 1, 2024  Time Interrupted: 12:00 Noon Time Estimated: 1:00 PM  Affected areas:   • Kalaklan Ridge • Bgry. Kalaklan - CBMU St. • Portion of Brgy. West Tapinac - Portion of Esteban St. - 14th St. - Portion of Corpuz St. • Brgy. East Tapinac • Brgy. New Kalalake • Brgy. Pag-Asa • Brgy. New Asinan  Reason: To facilitate the repair of cut primary loop @14 Street Cor. Hansen Street, Brgy. East Tapinac.  Our linemen are currently working in the area to restore power service immediately.  Thank you for understanding.  “In case you will operate standby generator set during shutdown, please make sure to trip-off/open your respective breaker/isolator going to our line to ensure safety of our maintenance personnel.”
NOTICE OF UNSCHEDULED/EMERGENCY POWER INTERRUPTION TO CUSTOMERS: Please be informed of our unscheduled power interruption today, Date: 27 March 

Unnamed: 0,date,location,type
0,2021-12-04,portion of West and East Bajac-Bajac. Reason,OEDC
1,2021-12-04,Portion of Kalaklan,OEDC
2,2021-12-13,Brgy. Kalaklan (Globe Tower) Reason,OEDC
3,2021-12-13,Upper Nagbaculao,OEDC
4,2022-01-09,Brgy. Kalaklan (Globe Tower) Reason,OEDC
...,...,...,...
751,2024-04-01,Brgy. New Kalalake,OEDC
752,2024-04-01,Brgy. East Tapinac,OEDC
753,2024-04-01,Portion of Brgy. West Tapinac - Portion of Es...,OEDC
754,2024-04-01,Bgry. Kalaklan - CBMU St.,OEDC
