In [27]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    """Turn one raw line of the dump into a flat, emoji-free string.

    The line is evaluated with ``ast.literal_eval`` and the resulting object is
    ``.decode()``-d (so it is expected to hold something like a ``b'...'``
    literal). If evaluation or decoding fails for any reason, the line is
    treated as empty text.

    The decoded text then has newlines replaced by single spaces, is
    NFKC-normalized, and has all emoji removed.

    Returns:
        The cleaned string ('' when the line could not be decoded).
    """
    try:
        literal_value = ast.literal_eval(line)
        text = literal_value.decode()
    except Exception:
        # Best effort: an unparseable line contributes empty text.
        text = ''

    single_line = text.replace('\n', ' ')
    canonical = unicodedata.normalize('NFKC', single_line)
    return emoji.replace_emoji(canonical, replace='')

def try_regex(reg, string, todo):
    """Run ``todo`` on the first ``reg.findall(string)`` hit.

    Args:
        reg: Compiled pattern (anything exposing ``findall``).
        string: Text to search.
        todo: Callable applied to the first element returned by ``findall``.

    Returns:
        Whatever ``todo`` returns for the first hit, or ``False`` when there is
        no hit or when ``todo`` raises (the hits and the error are printed in
        that case).
    """
    matches = reg.findall(string)
    if not matches:
        return False

    try:
        outcome = todo(matches[0])
    except Exception as err:
        print(matches, err)
        return False
    return outcome

# Column layout of the frame returned by process_iselcoii_file.
_ISELCOII_COLUMNS = ['date', 'location', 'type']

# Value written to the 'type' column for every row produced here.
_ISELCOII_TYPE = 'ISELCOII'


def _strip_date_label(match):
    """Reduce a 'DATE...: <text> REASON' match to just '<text>'."""
    # The last 7 characters are the ' REASON' / ' Reason' terminator; the label
    # is everything up to (and including) the first colon.
    return match[:-7].split(':', maxsplit=1)[1].strip()


# (pattern, extractor) pairs tried in this order to find a notice's date text.
# Patterns are compiled once here instead of on every line of the file.
_ISELCOII_DATE_PATTERNS = [
    (re.compile("December 17, 2022"), lambda x: x),
    (re.compile("DATE(?: and TIME)?:.* REASON"), _strip_date_label),
    (re.compile("DATE(?: AND TIME)?:.* REASON"), _strip_date_label),
    (re.compile("DATE and TIME:.* Reason"), _strip_date_label),
    (re.compile("October 23, 2023"), lambda x: x),
    (re.compile("August 27, 2023"), lambda x: x),
]

# (pattern, extractor) pairs tried in this order to find the affected areas.
# Slice offsets: 15 == len("AFFECTED AREA: ") (the extra character of the
# plural form is a space removed by strip()), 9 == len("All works") ==
# len("Power wil"), 20 == len("For your information").
_ISELCOII_LOCATION_PATTERNS = [
    (re.compile("AFFECTED AREAS?: .*All works"), lambda x: x[15:-9].strip()),
    (re.compile("AFFECTED AREAS?: .*Power wil"), lambda x: x[15:-9].strip()),
    (re.compile("AFFECTED AREAS: .+ *For your information"), lambda x: x[15:-20].strip()),
    (re.compile("Parts of Municipality of Tumauini • Arcon  • Balug  • Bayabo East  • Lalauanan  • Lapogan  • Bantug  • Maligaya  • Minanga  • Paragu  • Sisim Alto   • Sisim Abajo  • Liwanag  • Pilitan   Parts of City of Ilagan • Alinguigan 1st • Alinguigan 2nd  • Alinguigan 3rd  • Ballacong  • Bangag  • Marana 1st  • Marana 2nd  • Marana 3rd  • Minabang  • Morado  • Nanaguan  • Sta. Victoria  • Tangkul  • Binatacan  • Cabisera 10  • Cabisera 14 -16  • Cabisera 25  • Cabisera 27  • Pasa  • Quimalabasa  • Rangayan  • Rugao  • San Andress  • Sindun Bayabo  • Sindun Maride  • Villa Imelda  • Cadu  • Capellan  • Capo  • Fuyo  • Manaring  • San Isidro  • San Juan  • San Lorenzo  • San Pablo  • San Rodrigo  • Santa Catalina  • Sipay"), lambda x: x),
]


def _first_extracted(patterns, text):
    """Return the first truthy ``try_regex`` result over ``patterns``, else False."""
    for pattern, extract in patterns:
        found = try_regex(pattern, text, extract)
        if found:
            return found
    return False


def _parse_notice_date(date_str):
    """Parse the first calendar date contained in a notice's date text.

    Raises:
        ValueError: if the isolated text matches neither '%B %d, %Y' nor
            '%B %d,%Y'.
    """
    # Known typo in the source data.
    date_str = date_str.replace("Octoberber", "October")
    lowered = date_str.lower()

    # Keep only the leading date when the text carries a range or a time.
    if '-' in date_str:
        first = date_str.split('-')[0].strip()
    elif '|' in date_str:
        first = date_str.split('|')[0].strip()
    elif ' at ' in date_str and 'pm' not in date_str:
        first = date_str.split(' at ')[0].strip()
    elif 'am' in lowered or 'pm' in lowered:
        # "<Month> <day>, <year> <time...>" -> first three space-separated tokens
        first = " ".join(date_str.strip().split(" ")[:3]).strip()
    else:
        # Bare date (e.g. the hard-coded fallback dates): use it as is rather
        # than leaving the value undefined or inherited from a previous notice.
        first = date_str.strip()

    try:
        return datetime.strptime(first, "%B %d, %Y")
    except ValueError:
        return datetime.strptime(first, "%B %d,%Y")


def _split_locations(location_str):
    """Split the affected-areas text on the first separator style it contains."""
    for separator in (';', ' *', '•', ','):
        if separator in location_str:
            return location_str.split(separator)
    return location_str.split('  ')


def process_iselcoii_file(file_path):
    """Extract (date, location, type) rows from an ISELCO II notice dump.

    Each line of ``file_path`` is passed through ``normalize_line``; only lines
    containing the unscheduled-interruption notice banner are processed. For
    each such notice the date and the affected-areas text are located with the
    module's pattern lists, and one row is produced per non-blank location
    fragment.

    A notice whose date or affected areas cannot be extracted is reported on
    stdout and skipped, so it can never be recorded under another notice's
    date. Any error while opening or reading the file is also printed, and the
    rows gathered up to that point are still returned.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with columns 'date', 'location', 'type'; rows appear
        last-parsed-first.
    """
    records = []

    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)

                if '!!NOTICE OF ISELCO II UNSCHEDULED POWER INTERRUPTION!!' not in normalized:
                    continue

                date_str = _first_extracted(_ISELCOII_DATE_PATTERNS, normalized)
                try:
                    if not date_str:
                        raise ValueError("no date pattern matched")
                    date = _parse_notice_date(date_str)
                except ValueError as e:
                    print(f"Error getting date from {normalized}: {e}")
                    print("---", date_str)
                    # Without a date the notice cannot be recorded correctly.
                    continue

                location_str = _first_extracted(_ISELCOII_LOCATION_PATTERNS, normalized)
                if not location_str:
                    print(f"Error getting location from {normalized}: no location pattern matched")
                    print("---", location_str)
                    continue

                for location in _split_locations(location_str):
                    # Skip empty / whitespace-only fragments left by splitting;
                    # the location text itself is stored unmodified.
                    if not location.strip():
                        continue
                    records.append((date, location, _ISELCOII_TYPE))

    except Exception as e:
        # Best effort: report the failure and return whatever was parsed.
        print(f"Error processing {file_path}: {e}")

    # Build the frame once (instead of concatenating per row); reversing keeps
    # the last-parsed-first ordering of the output.
    return pd.DataFrame(records[::-1], columns=_ISELCOII_COLUMNS)

In [29]:
# Parse the ISELCO II notices and append the resulting rows to data.csv.
# NOTE: mode='a' with header=False means re-running this cell appends the same
# rows to data.csv again.
res = process_iselcoii_file('ISELCOII.txt')
res.to_csv('data.csv', index=False, header=False, mode='a')
# The preview has to be the cell's final expression for the notebook to render it.
res.head()