In [129]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    try:
        string = ast.literal_eval(line).decode()
    except Exception:
        string = ''
    no_endline = string.replace('\n', ' ')
    no_unicode_letters = unicodedata.normalize('NFKC', no_endline)
    no_emoji = emoji.replace_emoji(no_unicode_letters, replace='')

    return no_emoji

def try_regex(reg, string, todo):
    res = reg.findall(string)
    if len(res) > 0:
        try:
            return todo(res[0])
        except Exception as e:
            print(res, e)
            return False
    return False

def process_iselcoii_file(file_path):
    data = { 'date': [], 'location': [], 'type': [] }
    df = pd.DataFrame(data)

    date_regex = re.compile("DATE(?: and TIME)?:.* REASON")
    date_regex2 = re.compile("DATE(?: AND TIME)?:.* REASON")
    location_regex = re.compile("AFFECTED AREAS?: .*All works")
    location_regex2 = re.compile("AFFECTED AREAS?: .*Power wil")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)

                if '!!NOTICE OF ISELCO II UNSCHEDULED POWER INTERRUPTION!!' not in normalized:
                    continue

                try:
                    date_str = try_regex(date_regex, normalized, lambda x: x[:-7].split(':', maxsplit=1)[1].strip())
                    if not date_str:
                        date_str = try_regex(date_regex2, normalized, lambda x: x[:-7].split(':', maxsplit=1)[1].strip())
                    if not date_str:
                        date_str = try_regex(re.compile("DATE and TIME:.* Reason"), normalized, lambda x: x[:-7].split(':', maxsplit=1)[1].strip())
                    x=None
                    if '-' in date_str:
                        date = date_str.split('-')[0].strip()
                    elif '|' in date_str:
                        date = date_str.split('|')[0].strip()
                    elif ' at ' in date_str:
                        date = date_str.split(' at ')[0].strip()
                    elif 'am' in date_str.lower() or 'pm' in date_str.lower():
                        x = date 
                        date = " ".join(date_str.split(" ")[:3]).strip()
                    date = datetime.strptime(date, "%B %d, %Y")
                except Exception as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str, x)
                
                try:
                    try:
                        location_str = location_regex.findall(normalized)[0][15:-9].strip()
                    except Exception:
                        location_str = location_regex2.findall(normalized)[0][15:-9].strip()
                    if ';' in location_str:
                        locations = location_str.split(';')
                    elif '•' in location_str:
                        locations = location_str.split('•')
                    elif ',' in location_str:
                        locations = location_str.split(',')
                    else:
                        locations = location_str.split('  ')
                    
                    type = 'ISELCOII'

                    for location in locations:
                        if not location:
                            continue
                        row_df = pd.DataFrame([[date, location, type]], columns=df.columns)
                        df = pd.concat([row_df, df], ignore_index=True)

                except Exception as e:
                    print(f"Error getting location from {normalized}: {e}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
    return df

In [130]:
process_iselcoii_file('ISELCOII.txt')

Error getting location from !!NOTICE OF ISELCO II UNSCHEDULED POWER INTERRUPTION!!   DATE AND TIME: March 14, 2024 at 8:40 am   REASON: Feeder 3 line Tripped at NGCP ILAGAN  AFFECTED AREAS: Feeder 3  Parts of the Municipality of Ilagan  Portion of Baligatan Alibagu Namnama Salindingan Gayong gayong norte Bigao Siffu  Municipality of Gamu  Parts of the Municipality of Naguilian Magsaysay Portion of San Manuel (Near Guibang)  Parts of the Municipality of Burgos  Raniag  San Bonifacio San Miguel Sitio Duco San Roque  Sitio Singson  *For your information and guidance*  *Power service may be restored upon completion of maintenance. For your safety, always consider our lines energized.  **Please do not copy or download photos containing information about public interest like power interruption,billing,etc.then posting it on your timeline.That information may be changed or updated without notice . So Pleases just click the SHARE button .Thank you.***: list index out of range
Error getting loc

Unnamed: 0,date,location,type
0,2022-07-13 00:00:00,Parts of City of Ilagan *Brgy. Osmeña *Brgy. ...,ISELCOII
1,2022-09-08 00:00:00,"Parts of Municipality of Roxas, Mallig, and Q...",ISELCOII
2,2022-09-17 00:00:00,Isabela,ISELCOII
3,2022-09-17 00:00:00,Isabela Municipality of Quezon,ISELCOII
4,2022-09-17 00:00:00,Municipality of Mallig,ISELCOII
...,...,...,...
2137,2024-03-28 00:00:00,Aguinaldo,ISELCOII
2138,2024-03-28 00:00:00,Sunlife,ISELCOII
2139,2024-03-28 00:00:00,Portion of San Manuel (Going to San Mariano),ISELCOII
2140,2024-03-28 00:00:00,Parts of the Municipality of Naguilian,ISELCOII
