In [3]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_iec_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)

    date_regex = re.compile("WHEN:.*\, 202.,")
    location_regex = re.compile("AFFECTED AREA(?:\(s\))?: .* REASON")
    location_regex2 = re.compile("AFFECTED AREA(?:\(s\))?: .* Power ")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)

                if 'IEC Scheduled Power Interruption' not in normalized:
                    continue
                
                # print(normalized)
                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x[6:-1])
                    if not date_str:
                        date_str = try_regex(re.compile("WHEN: .* 202. -"), normalized, lambda x: x[6:-2])
                    if not date_str:
                        date_str = try_regex(re.compile("November 9, 2022"), normalized, lambda x: x)
                    if not date_str:
                        date_str = try_regex(re.compile("April 26, 2022"), normalized, lambda x: x)
                    if not date_str:
                        date_str = try_regex(re.compile("August 10, 2021"), normalized, lambda x: x)
                    date = datetime.strptime(date_str, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                
                try:
                    location_str = try_regex(location_regex, normalized, lambda x: x)
                    if not location_str:
                        location_str = try_regex(location_regex2, normalized, lambda x: x)
                    if not location_str:
                        location_str = try_regex(re.compile("Salaban 1, Salaban 2, Salaban San Jose"), normalized, lambda x: f'AFFECTED AREA: {x} Power ')

                    if '(s)' in location_str:
                        locations = [location_str[18:-6].strip()]
                    else:
                        locations = [location_str[15:-6].strip()]
                    
                    type = 'IEC'
                    for location in locations:
                        if not location.strip():
                            continue
                        row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                        df = pd.concat([row_df, df], ignore_index=True)
                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")
                    print("---", location_str)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
    return df

In [6]:
res = process_iec_file('ibaanelectric.txt')
res.to_csv('data.csv', index=False, header=False, mode='a')

Error getting location from ANNOUNCEMENT  WHAT: IEC Scheduled Power Interruption WHEN:  April 26, 2022 – TUESDAY (Brgy. Bungahan) April 27, 2022 – WEDNESDAY (Brgy. Salaban SJ) April 28, 2022 – THURSDAY (Brgy. Balanga to TMC Farm) April 29, 2022 – FRIDAY (Brgy. Bago-KABILA Patalay Compound) TIME: 8:00 AM - 5:00 PM  April 30, 2022 – SATURDAY (Boundary ng BAGO at SABANG) TIME: 8:00 AM -12:00 NN  REASON: Vegetation Clearing  Power will be restored on or before the scheduled time without prior notice. Hoping for your understanding for the inconvenience that may cause you.: argument of type 'bool' is not iterable
--- False
