In [2]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    """Turn one raw line of the input file into cleaned, single-line text.

    The line is evaluated as a Python literal and ``.decode()`` is called on
    the result (so each line is presumably the repr of a ``bytes`` object --
    confirm against the file that is being read). Anything that cannot be
    evaluated or decoded is treated as an empty string rather than raising.

    The decoded text then has its newlines replaced by spaces, is normalized
    to Unicode NFKC form, and has emoji removed.

    Args:
        line: One line of text read from the input file.

    Returns:
        The cleaned text, or ``''`` when the line could not be decoded.
    """
    try:
        payload = ast.literal_eval(line)
        text = payload.decode()
    except Exception:
        # Best effort: an unparsable line simply contributes no text.
        text = ''

    flattened = ' '.join(text.split('\n'))
    compat_form = unicodedata.normalize('NFKC', flattened)
    return emoji.replace_emoji(compat_form, replace='')

def try_regex(reg, string, todo):
    """Apply ``todo`` to the first match of ``reg`` in ``string``.

    Args:
        reg: A compiled regular expression (anything with ``findall``).
        string: The text to search.
        todo: Callable invoked with the first element returned by
            ``reg.findall(string)``.

    Returns:
        Whatever ``todo`` returns for the first match, or ``False`` when
        there is no match or ``todo`` raises.
    """
    matches = reg.findall(string)
    if not matches:
        return False

    try:
        return todo(matches[0])
    except Exception:
        # A match that cannot be converted counts as "no result".
        return False

# Value stored in the 'type' column of every row produced from this file.
_MASELCO_TYPE = 'MASELCO'

# Ordered date rules: (compiled regex, strptime format, optional (old, new) fix-up
# applied to the matched text before parsing). Rules are tried top to bottom and
# the first one that parses wins, so the order is significant.
# NOTE(review): %B / %b are locale-dependent; this presumably expects an English
# locale -- confirm in the environment where the notebook runs.
_MASELCO_DATE_RULES = [
    (re.compile(r"[a-zA-Z]+ [1-3]?[0-9], 202[0-9]"), "%B %d, %Y", None),
    (re.compile(r"[1-3]?[0-9]-[a-zA-Z]+-202[0-9]"), "%d-%b-%Y", None),
    (re.compile(r"[1-3]?[0-9]-[a-zA-Z]+-202[0-9]"), "%d-%b-%Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+\. [1-3]?[0-9], 202[0-9]"), "%b. %d, %Y", None),
    (re.compile(r"[a-zA-Z]+\.[1-3]?[0-9],202[0-9]"), "%b.%d,%Y", None),
    (re.compile(r"[a-zA-Z]+\. [1-3]?[0-9],202[0-9]"), "%b. %d,%Y", None),
    (re.compile(r"[a-zA-Z]+\.[1-3]?[0-9], 202[0-9]"), "%b.%d, %Y", None),
    (re.compile(r"[a-zA-Z]+\. [0-3][0-9], 202[0-9]"), "%b. %d, %Y", None),
    (re.compile(r"[a-zA-Z]+\. [0-3]?[0-9], 202[0-9]"), "%b. %d, %Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+\.[0-3]?[0-9],202[0-9]"), "%b.%d,%Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+\. [0-3][0-9],202[0-9]"), "%b. %d,%Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9],202[0-9]"), "%B %d,%Y", None),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9], 202[0-9]"), "%B %d, %Y", None),
    (re.compile(r"[a-zA-Z]+\. [0-3]?[0-9], 202[0-9]"), "%B. %d, %Y", None),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9],202[0-9]"), "%b %d,%Y", None),
    (re.compile(r"[a-zA-Z]+\. [0-3]?[0-9] ,202[0-9]"), "%b. %d ,%Y", None),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9],202[0-9]"), "%b %d,%Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+\.[0-3]?[0-9], 202[0-9]"), "%b.%d, %Y", ('Sept', 'Sep')),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9], 202[0-9]"), "%b %d, %Y", ('Augus', 'Aug')),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9] ,202[0-9]"), "%B %d ,%Y", None),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9],202[0-9]"), "%b %d,%Y", ('Hunyo', 'Jun')),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9], 202[0-9]"), "%b %d, %Y", ('Enero', 'Jan')),
    (re.compile(r"[a-zA-Z]+\.[0-3]?[0-9],202[0-9]"), "%b.%d,%Y", ('Nob', 'Nov')),
    (re.compile(r"[a-zA-Z]+ [0-3]?[0-9], 202[0-9]"), "%b %d, %Y", ('Nobyembre', 'Nov')),
]


def _areas_before_we(match):
    """Drop the 16-character 'Areas affected: ' prefix and keep the text before the first 'We'."""
    return match[16:].split('We')[0].strip()


def _whole_match(match):
    """Use the matched text itself as the location string."""
    return match


# Ordered location rules: (compiled regex, function turning the matched text into
# a comma-separated location string). The first rule yielding a non-empty string
# wins, so the order is significant.
_MASELCO_LOCATION_RULES = [
    (re.compile(r"at [a-zA-Z- ,]*\. We"), lambda m: m[3:-4]),
    (re.compile(r"Areas affected: [a-zA-Z- ,/]*\. +We"), lambda m: m[16:].split('.')[0].strip()),
    (re.compile(r"at [a-zA-Z- ,]*\. Our"), lambda m: m[3:-5]),
    (re.compile(r"Areas affected: [a-zA-Z- ,\/\][ ]+We"), _areas_before_we),
    # NOTE(review): the next two patterns end in 'Th' but the text is still cut at
    # 'We', exactly as before; a match without 'We' keeps its trailing 'Th'.
    # Left unchanged because the rules were fitted to real posts -- confirm intent.
    (re.compile(r"Areas affected: [a-zA-Z- ,\/\][ ]+Th"), _areas_before_we),
    (re.compile(r"Areas affected: [a-zA-Z- .,\/\][ ]+Th"), _areas_before_we),
    (re.compile(r"at [a-zA-Z- ,.]+: dam"), lambda m: m[3:-5]),
    (re.compile(r"at [a-zA-Z- ,.]*\. Our"), lambda m: m[3:-5]),
    (re.compile(r"Areas affected: [0-9a-zA-Z- .,\/\][ ]+We"), _areas_before_we),
    (re.compile(r"AREAS AFFECTED: All Feeders"), lambda m: m[16:].strip()),
    (re.compile(r"at [ña-zA-Z- ,.]*:"), lambda m: m[3:-1]),
    (re.compile(r"at [0-9a-zA-Z- &,.]*\."), lambda m: m[3:-1]),
    (re.compile(r"Apektadong Lugar: [a-zA-Z- ,.]*\."), lambda m: m[18:-1]),
    (re.compile(r"affecting the municipalities of [a-zA-Z- ,.]+ and [a-zA-Z]+"), lambda m: m[32:]),
    (re.compile(r"Miaga, Uson to Gahit, Cataingan"), _whole_match),
    (re.compile(r"Gahit, Cataingan to Miaga, Uson"), _whole_match),
    (re.compile(r"Mobo to Miaga, Uson"), _whole_match),
    (re.compile(r"affecting all feeders"), _whole_match),
    (re.compile(r"Malinta to Milagros. Bañadero, Mobo to Uson, Dimasalang, Palanas, Gahit,Cataingan, Esperanza, Placer and Pio V Corpus"), _whole_match),
    (re.compile(r"Kinamaligan to Milagros and Baleno. Bañadero, Mobo to Uson, Dimasalang, Palanas, Gahit,Cataingan, Esperanza and Pio V Corpus"), _whole_match),
    (re.compile(r"Malinta to Milagros, and Baleno"), _whole_match),
    (re.compile(r"Brgy. Kinamaligan"), _whole_match),
]


def _to_datetime(matched_text, fmt, fixup):
    """Parse ``matched_text`` with ``fmt`` after applying the optional (old, new) replacement."""
    if fixup is not None:
        matched_text = matched_text.replace(*fixup)
    return datetime.strptime(matched_text, fmt)


def _parse_maselco_date(text):
    """Return the datetime from the first date rule that parses ``text``, else ``False``."""
    for regex, fmt, fixup in _MASELCO_DATE_RULES:
        parsed = try_regex(regex, text, lambda m: _to_datetime(m, fmt, fixup))
        if parsed:
            return parsed
    return False


def _extract_maselco_location(text):
    """Return the first non-empty location string produced by the location rules, else ``False``."""
    for regex, transform in _MASELCO_LOCATION_RULES:
        found = try_regex(regex, text, transform)
        if found:
            return found
    return False


def process_maselco_file(file_path):
    """Extract (date, location, type) rows from a file of MASELCO announcements.

    Each line is cleaned with ``normalize_line`` and kept only if it mentions
    'power interruption' (case-insensitive), does not start with
    'Cause of power interruption', contains at least one ASCII digit and does
    not mention 'cancelled' (case-insensitive). For every kept line the date
    and the affected locations are extracted with the ordered rule tables
    above; the location string is split on ', ' and one row is emitted per
    non-blank location.

    Lines whose date cannot be parsed are printed and still produce rows, with
    ``False`` in the 'date' column. Lines with no recognisable location are
    reported and produce no rows. Any error while reading the file is printed
    and the rows collected so far are returned.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with columns 'date', 'location', 'type'. Rows
        are in reverse order of discovery (the most recently found row first).
    """
    records = []

    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)
                lowered = normalized.lower()

                if 'power interruption' not in lowered:
                    continue
                if normalized.startswith('Cause of power interruption'):
                    continue
                if not any(digit in normalized for digit in '0123456789'):
                    continue
                if 'cancelled' in lowered:
                    continue

                date = _parse_maselco_date(normalized)
                if date is False:
                    # Surface unparsed lines so a new date rule can be added.
                    print(normalized)

                location_str = _extract_maselco_location(normalized)
                if not location_str:
                    print(f"Error getting location from {normalized}: no location pattern matched")
                    continue

                for location in location_str.split(', '):
                    if not location.strip():
                        continue
                    records.append((date, location, _MASELCO_TYPE))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

    # Build the frame once instead of concatenating per row; reversing keeps the
    # newest-first row order that prepending each row used to produce.
    return pd.DataFrame(records[::-1], columns=['date', 'location', 'type'])

In [3]:
# Parse the MASELCO announcements into a (date, location, type) frame.
res = process_maselco_file('OfficialMASELCO.txt')
# NOTE(review): a bare expression that is not the last statement of a cell is
# not displayed, so this line currently has no visible effect.
res
# Append the rows to data.csv without an index or header line. Because the file
# is opened in append mode, re-running this cell adds the same rows again.
res.to_csv('data.csv', index=False, header=False, mode='a')