In [23]:
from datetime import datetime
import unicodedata
import ast
import emoji
import pandas as pd
import re

def normalize_line(line: str):
    """Turn one raw input line into plain, single-line text.

    The line is evaluated as a Python literal and ``.decode()`` is called on
    the result, so it is expected to hold a bytes literal such as ``b'...'``.
    Any failure while evaluating or decoding yields an empty string instead
    of raising.

    The decoded text then has newlines replaced by spaces, is normalised to
    Unicode form NFKC, and has emoji removed.
    """
    try:
        raw = ast.literal_eval(line)
        text = raw.decode()
    except Exception:
        # Best effort: an unparsable line is treated as empty text.
        text = ''

    flattened = ' '.join(text.split('\n'))
    compat = unicodedata.normalize('NFKC', flattened)
    return emoji.replace_emoji(compat, replace='')

def try_regex(reg, string, todo):
    """Apply ``todo`` to the first ``reg.findall`` hit in ``string``.

    Args:
        reg: compiled pattern (anything exposing ``findall``).
        string: text to search.
        todo: callable invoked with the first match.

    Returns:
        Whatever ``todo`` returns for the first match. ``False`` when nothing
        matches, or when ``todo`` raises (in which case the full match list
        and the exception are printed).
    """
    matches = reg.findall(string)
    if not matches:
        return False

    try:
        outcome = todo(matches[0])
    except Exception as err:
        print(matches, err)
        return False
    return outcome

def _text_after_colon(match):
    """Return the stripped text following the first ':' in ``match``."""
    return match.split(':', maxsplit=1)[1].strip()


def _strip_sorry_wrapper(match):
    """Drop the leading 'Date: ' and trailing 'We are sorry' from ``match``."""
    return match[6:-12].strip()


def _strip_reason(match):
    """Drop the trailing ' Reason' terminator from ``match``."""
    return match[:-len(" Reason")]


def _strip_line_crew(match):
    """Drop the trailing ' Line crew' terminator from ``match``."""
    return match[:-len(" Line crew")]


def _strip_bullet_and_reason(match):
    """Drop the leading '• ' and the trailing ' Reason' from ``match``."""
    return match[2:-len(" Reason")]


def _identity(match):
    """Return ``match`` unchanged."""
    return match


# Only lines containing this banner are treated as outage notices.
_NOTICE_MARKER = 'NOTICE OF UNSCHEDULED/EMERGENCY POWER INTERRUPTION TO CUSTOMERS'
_OEDC_COLUMNS = ['date', 'location', 'type']
_OEDC_TYPE = 'OEDC'

# Accepted date layouts, tried in order: "03 July 2023" then "July 03, 2023".
_DATE_FORMATS = ("%d %B %Y", "%B %d, %Y")

# Each rule is (compiled pattern, extractor applied to the first match).
# Rules are tried in order and the first truthy result wins.
_HEADER_DATE_RULES = (
    (re.compile(r"Date: .*\.? (?:Estimated )?Time"), _text_after_colon),
    (re.compile(r"Date: .*\.? (?:Estimated )?time"), _text_after_colon),
)
_FALLBACK_DATE_RULES = (
    (re.compile(r"Date: .+ We are sorry"), _strip_sorry_wrapper),
    # Literal dates kept from the original implementation as last resorts.
    (re.compile(r"July 03, 2023"), _identity),
    (re.compile(r"December 19, 2021"), _identity),
)
_LABELLED_AREA_RULES = (
    (re.compile(r"Affected areas:.* Reason"), _strip_reason),
    (re.compile(r"Affected Areas:.* Reason"), _strip_reason),
    (re.compile(r"Affected areas: .* Line crew"), _strip_line_crew),
    (re.compile(r"Affected area: .+ Reason"), _strip_reason),
    (re.compile(r"AFFECTED AREAS: .+ Reason"), _strip_reason),
)
_BULLET_AREA_RULES = (
    (re.compile(r"• .+ Reason"), _strip_bullet_and_reason),
)
# Separators tried in priority order; only the first one present is used.
_AREA_SEPARATORS = (',', '•', '-')


def _first_hit(rules, text):
    """Return the first truthy ``try_regex`` result over ``rules``, else ``False``."""
    for pattern, extract in rules:
        hit = try_regex(pattern, text, extract)
        if hit:
            return hit
    return False


def _extract_notice_date(notice):
    """Return the notice date as a ``datetime``, or ``None`` if it cannot be found/parsed."""
    date_text = _first_hit(_HEADER_DATE_RULES, notice)
    if date_text:
        # Keep only the first three space-separated tokens (day, month, year
        # in either order) and drop a sentence-ending period.
        date_text = " ".join(date_text.split(" ")[:3])
        if date_text.endswith('.'):
            date_text = date_text[:-1]
    if not date_text:
        date_text = _first_hit(_FALLBACK_DATE_RULES, notice)
    if not date_text:
        print(f"Error getting date from {notice}: no date text found")
        print("---", date_text)
        return None

    # Tolerate a misspelling that occurs in the source notices.
    date_text = date_text.replace("Febuary", "February")
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_text, fmt)
        except ValueError:
            continue
    print(f"Error getting date from {notice}: unrecognised date format")
    print("---", date_text)
    return None


def _extract_notice_locations(notice):
    """Return the raw affected-area strings of a notice (empty list if none found)."""
    area_text = _first_hit(_LABELLED_AREA_RULES, notice)
    if area_text:
        # Drop the "Affected areas:" style label, keeping what follows it.
        area_text = area_text.split(':', maxsplit=1)[1]
    if not area_text:
        area_text = _first_hit(_BULLET_AREA_RULES, notice)
    if not area_text:
        print(f"Error getting location from {notice}: no affected-area text found")
        print("---", area_text)
        return []

    for separator in _AREA_SEPARATORS:
        if separator in area_text:
            return area_text.split(separator)
    # No separator at all: the notice names a single area.
    return [area_text]


def process_oedc_file(file_path):
    """Parse OEDC outage notices from ``file_path`` into a DataFrame.

    Each line of the file is passed through ``normalize_line``; lines that do
    not contain the emergency-interruption banner are ignored. For every
    remaining notice the date and the affected areas are extracted, and one
    row per non-blank area is produced.

    Args:
        file_path: path of the text file to read.

    Returns:
        A DataFrame with columns ``date``, ``location`` and ``type`` (always
        ``'OEDC'``). Rows are ordered last-found first, matching the previous
        behaviour of prepending each new row. Location strings are stored as
        extracted (surrounding whitespace is not trimmed).

    Notes:
        A notice whose date cannot be extracted or parsed is reported and
        skipped; it is never emitted with another notice's date. A notice
        without any recognisable area text is reported and contributes no
        rows. Errors while reading the file are printed, and the rows
        collected so far are still returned.
    """
    rows = []
    try:
        with open(file_path, 'r') as f:
            for line in f:
                normalized = normalize_line(line)
                if _NOTICE_MARKER not in normalized:
                    continue

                date = _extract_notice_date(normalized)
                if date is None:
                    continue

                rows.extend(
                    [date, location, _OEDC_TYPE]
                    for location in _extract_notice_locations(normalized)
                    if location.strip()
                )
    except Exception as e:
        # Best-effort boundary: report the failure and return what was parsed.
        print(f"Error processing {file_path}: {e}")

    # Build the frame once instead of concatenating per row; reverse to keep
    # the newest-found-first order callers have been receiving.
    rows.reverse()
    return pd.DataFrame(rows, columns=_OEDC_COLUMNS)

In [24]:
# Parse the outage notices found in 'OEDCOfficial.txt' into a DataFrame.
res = process_oedc_file('OEDCOfficial.txt')
# Append the rows to 'data.csv' with no header and no index column.
# mode='a' appends, so re-running this cell adds the same rows again.
res.to_csv('data.csv', header=False, index=False, mode='a')