In [6]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    """Turn one raw line of the dump into clean, single-line text.

    The line is evaluated as a Python literal and ``.decode()`` is called on
    the result, so it is expected to hold the repr of a bytes object
    (e.g. ``b'...'``). Newlines inside the decoded text become spaces, the
    text is NFKC-normalized, and emoji are removed.

    Args:
        line: One line of text as read from the dump file.

    Returns:
        The cleaned text, or ``''`` when the line cannot be evaluated and
        decoded.
    """
    try:
        decoded = ast.literal_eval(line).decode()
    except Exception:
        # Best effort: anything that is not a decodable literal is treated
        # as an empty line rather than aborting the whole run.
        decoded = ''

    # Collapse embedded line breaks so each post stays on a single line.
    single_line = ' '.join(decoded.split('\n'))
    # NFKC folds compatibility characters (stylised letters, full-width
    # forms) into their plain equivalents before emoji are stripped.
    return emoji.replace_emoji(unicodedata.normalize('NFKC', single_line), replace='')

# Non-greedy so the capture stops at the first " TIME" after "DATE: ".
_PANELCO3_DATE_RE = re.compile(r'DATE: (.*?) TIME')
# Greedy on purpose: the area list runs up to the last "All works" on the line.
_PANELCO3_AREAS_RE = re.compile(r'AFFECTED AREAS?: (.*)All works')
_PANELCO3_COLUMNS = ['date', 'location', 'type']
_PANELCO3_TYPE = 'PANELCO3'


def _parse_notice_date(notice):
    """Return the notice's date as a ``datetime``; raise ``ValueError`` if missing or malformed."""
    match = _PANELCO3_DATE_RE.search(notice)
    if match is None:
        raise ValueError("no 'DATE: ... TIME' field found")
    # Drop a trailing "(Weekday)" annotation, if any, before parsing.
    date_text = match.group(1).split('(', maxsplit=1)[0].strip()
    return datetime.strptime(date_text, "%B %d, %Y")


def _parse_notice_locations(notice):
    """Return the stripped, non-empty location names of a notice; raise ``ValueError`` if the field is missing."""
    match = _PANELCO3_AREAS_RE.search(notice)
    if match is None:
        raise ValueError("no 'AFFECTED AREA(S): ... All works' field found")
    # Both ';' and ',' separate locations.
    fragments = (fragment.strip() for fragment in re.split(r'[;,]', match.group(1)))
    return [fragment for fragment in fragments if fragment]


def process_panelco3_file(file_path):
    """Extract scheduled power-interruption rows from a PANELCO3 post dump.

    Every line is passed through ``normalize_line``; lines that contain
    'NOTICE OF SCHEDULED POWER INTERRUPTION' are parsed for a date
    ("DATE: <Month> <day>, <year> ... TIME") and a location list
    ("AFFECTED AREA(S): ... All works").

    A notice whose date or location list cannot be parsed is reported on
    stdout and skipped entirely, so a row is never emitted with a date left
    over from a previous notice.

    Args:
        file_path: Path of the text file to read, one post per line.

    Returns:
        A DataFrame with columns ``date``, ``location`` and ``type`` (always
        'PANELCO3'), one row per affected location. Rows are in reverse order
        of discovery (the last location parsed comes first). If the file
        cannot be read, the error is printed and whatever was collected so
        far (possibly nothing) is returned.
    """
    records = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)

                if 'NOTICE OF SCHEDULED POWER INTERRUPTION' not in normalized:
                    continue

                try:
                    date = _parse_notice_date(normalized)
                except ValueError as e:
                    print(f"Error getting date from {normalized}: {e}")
                    # Without a date the rows would be wrong, not just
                    # incomplete, so skip this notice.
                    continue

                try:
                    locations = _parse_notice_locations(normalized)
                except ValueError as e:
                    print(f"Error getting location from {normalized}: {e}")
                    continue

                records.extend((date, location, _PANELCO3_TYPE) for location in locations)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

    # Newest-parsed first, matching the order callers already receive;
    # the frame is built once instead of being concatenated row by row.
    records.reverse()
    return pd.DataFrame(records, columns=_PANELCO3_COLUMNS)

In [7]:
# Parse the PANELCO3 dump into one row per (date, location).
res = process_panelco3_file('panelco3official.txt')
# NOTE(review): this bare expression is not the cell's last statement, so under
# IPython's default display settings it renders nothing.
res
# mode='a' appends to data.csv without a header row, so re-running this cell
# appends the same rows again (assuming the input file is unchanged).
res.to_csv('data.csv', header=False, index=False, mode='a')

Error getting location from NOTICE OF SCHEDULED POWER INTERRUPTION BY THE NATIONAL GRID CORPORATION OF THE PHILIPPINES (NGCP)  DATE: April 6, 2024 (Saturday)  TIME: 6:00AM - 6:00PM  PURPOSE: National Grid Corporation of the Philippines (NGCP) to implement the following activities: 1. Upgrading projects in San Manuel Substation; 2. Preventive Maintenance and correction of defects along the line  AFFECTED AREAS:  URDANETA CITY - Brgys. Anonas, Cabaruan, Camantiles, Catablan, Cayambanan, Dilan-Paurido, Labit West, Labit Proper, Mabanogbog, Nancalobasaan, Nancamaliran East, Nancamaliran West, Oltama, Pinmaludpod, Poblacion, San Jose, San Vicente, Sugcong, Tulong and part of Brgy. Nancayasan (Gracia Village);  ASINGAN - Brgys. Bobonan, Calepaan, Coldit, Palaris, Sobol, Toboy;  BINALONAN - Entire Municipality; LAOAC - Entire Municipality; MANAOAG - Brgy. Inamotan; MAPANDAN - Entire Municipality; POZORRUBIO - Entire Municipality; SAN MANUEL - Entire Municipality except Brgys. San Juan, San Vi