In [3]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_batanelco_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)
    
    date_regex = re.compile("DATE: .*\, 202.")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)
                if 'power interruption' not in normalized: continue
                    
                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile(", .*, 202.,"), normalized, lambda x: x[2:-1])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("DATE:.* 202."), normalized, lambda x: x[5:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date: .* 202."), normalized, lambda x: x[6:])
                    if not date_str:
                        date_str = try_regex(re.compile("Date:.* 202."), normalized, lambda x: x[5:])
                    
                    print(date_str)
                    date = datetime.strptime(date_str, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                    
                try:
                    location_str = try_regex(re.compile("Area: .* Time:"), normalized, lambda x: x[6:-5])
                    if not location_str:
                        location_str = try_regex(re.compile("\) .*:"), normalized, lambda x: x[2:-1])
                    if not location_str:
                        location_str = try_regex(re.compile("\)  .*:"), normalized, lambda x: x[3:-1])
                    if not location_str:
                        location_str = try_regex(re.compile("\)  .* Time:"), normalized, lambda x: x[3:-5])
                    
                    location = location_str
                    
                    print(date, location)
                    
                    type = 'BATANELCO'
                    row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                    df = pd.concat([row_df, df], ignore_index=True)
                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")
                    print("---", location_str)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        
    return df
        
res = process_batanelco_file('BATANELCO_public_2.txt')
res.to_csv('data.csv', index=False, header=False, mode='a')

March 15, 2024
2024-03-15 00:00:00  Vahangsion Time: 8:00 AM - 12
March 15, 2024
2024-03-15 00:00:00 Mun. Of Itbayat Time: 5:00 AM - 8
False
Error getting date from POWER INTERRUPTION ADVISORY  A 20-minute power interruption affecting South Feeder (Chanarian-Tukon, Mahatao, Ivana, and Uyugan) will occur anytime from now to repair / replace primary jumper wire in Tukon.  Sorry for the inconvenience. Thank you for bearing with us.: strptime() argument 1 must be str, not bool
--- False
2024-03-15 00:00:00 False
The scheduled power interruption in Itbayat on Friday, March 8, 2024
Error getting date from UPDATE: The scheduled power interruption in Itbayat on Friday, March 8, 2024, is now POSTPONED until further notice. We'll keep you posted on the new schedule. Thank you!: time data 'The scheduled power interruption in Itbayat on Friday, March 8, 2024' does not match format '%B %d, %Y'
--- The scheduled power interruption in Itbayat on Friday, March 8, 2024
2024-03-15 00:00:00 False
March 8

False
Error getting date from To our Valued Member-Consumer Owners:  There will be a scheduled power interruption in the following Barangay:  FEBRUARY 23, 2022 (WEDNESDAY) Area: Manaraw, Kayvaluganan Time: 8:00 AM - 5:00 PM  FEBRUARY 26, 2022 (SATURDAY) Area: BATAN ISLAND Time: 7:30 AM - 5:00 PM  Thank you for bearing with us.  Stay healthy. Stay home. Stay safe.: strptime() argument 1 must be str, not bool
--- False
2022-03-23 00:00:00 Manaraw, Kayvaluganan Time: 8:00 AM - 5:00 PM  FEBRUARY 26, 2022 (SATURDAY) Area: BATAN ISLAND 
False
Error getting date from EMERGENCY POWER INTERRUPTION Date/Time off: 02/13/2022, 9:00AM Affected Area(s): Brgy. San Antonio Cause: Replacement of Blacer Transformer (in regard to the power interruption earlier) Date/Time Restored: 02/13/2022, 10:10AM  -------------------------------------------  POWER INTERRUPTION Date/Time Off: 02/13/2022, 8:20AM Affected Area(s): Parts of Brgy. San Antonio Cause: Burnt bushing of Blacer Transformer near DOLE Office  Ou