In [34]:
import pandas as pd
import sys
import os
# from os import path
import re
from itertools import groupby
from operator import itemgetter
import logging
import logging.handlers
log_formatter = logging.Formatter('[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')
logging.basicConfig(stream=sys.stdout, format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')
log = logging.getLogger()
log.setLevel(logging.INFO)

LIMIT = 100
PATH = "./input"
DIR_LIST = [i for i in os.listdir(PATH)]
print(DIR_LIST)


['confirmed-cases-since-120465.csv', 'confirmed-cases-since-120465.xlsx', 'DESCRIBE_LOG_EVENTS_2021_01_24_20210725_170200.csv', 'DESCRIBE_LOG_EVENTS_2023 0625_164902.txt', 'DESCRIBE_LOG_EVENTS_20230625_154002.txt', 'DESCRIBE_LOG_EVENTS_20230625_160411.txt', 'DESCRIBE_LOG_EVENTS_20230625_160416.txt', 'DESCRIBE_LOG_EVENTS_20230625_160558.txt', 'DESCRIBE_LoG_EVENTS_20230625_165129.txt', 'DESCRIBE_LOG_EVENTS_20230625_254726.txt', 'ha_aod_003_dataenv_2.csv']


In [35]:
accumulated_log = []
has_added_header = False
for file_name in [i for i in DIR_LIST if re.match("^DESCRIBE_LOG_EVENTS_[0-9]{8}_[0-1]{1}[0-9]{1}[0-9]{4}.txt$", i)]:
    file_path = f"{PATH}/{file_name}"
    
    is_file = os.path.isfile(file_path)

    if is_file:
        with open(file_path, 'r') as file_log:
            log.info(f"Processed file : {file_path}")
            
            is_header = True
            line_count = 0
            for line in file_log:
                if (is_header):
                    if not (has_added_header):
                        accumulated_log.append(line.split('|'))
                        has_added_header = True
                    is_header = False
                else:
                    if not (re.match("^\d{4}.+$", line)):
                        continue
                    accumulated_log.append(line.rstrip().split("|"))
                    line_count += 1


[2023-07-04 21:46:58,377] {2948869347.py:10} INFO - Processed file : ./input/DESCRIBE_LOG_EVENTS_20230625_154002.txt
[2023-07-04 21:46:58,869] {2948869347.py:10} INFO - Processed file : ./input/DESCRIBE_LOG_EVENTS_20230625_160416.txt
[2023-07-04 21:46:59,395] {2948869347.py:10} INFO - Processed file : ./input/DESCRIBE_LOG_EVENTS_20230625_160558.txt


In [36]:
log.info(f"Total processed record(s) : {len(accumulated_log)-1}")


[2023-07-04 21:47:01,187] {2395206342.py:1} INFO - Total processed record(s) : 450000


In [37]:
list(enumerate(accumulated_log[0]))

[(0, 'DATE_TIME'),
 (1, 'NAME'),
 (2, 'CITY'),
 (3, 'ZIPCODE'),
 (4, 'BBAN'),
 (5, 'LOCALE'),
 (6, 'BANK_COUNTRY'),
 (7, 'IBAN'),
 (8, 'COUNTRY_CALLING_CODE'),
 (9, 'MSISDN'),
 (10, 'PHONE_NUMBER'),
 (11, 'STATUS'),
 (12, 'GENDER\n')]

In [38]:
def filter_active(record):
    if (record[11] == 'active'):
        return True
    else:
        return False
        
i = 0
active_logs = list(filter(filter_active, accumulated_log))
log.info(f"Total active record(s) : {len(active_logs)}")

[2023-07-04 21:47:01,351] {1994419476.py:9} INFO - Total active record(s) : 224553


In [39]:
group_by_gender_output = []
for _, value in groupby(sorted(active_logs, key=itemgetter(12)), key=itemgetter(12)):
    value = list(value)
    value_count = len(value)
    log.info(f"Total active record(s) in Gender [{value[0][12]}] : {value_count}")
    group_by_gender_output.append(value)

[2023-07-04 21:47:01,468] {2756953441.py:5} INFO - Total active record(s) in Gender [f] : 112215
[2023-07-04 21:47:01,496] {2756953441.py:5} INFO - Total active record(s) in Gender [m] : 112338


In [40]:
sorted_by_zipcode = sorted(accumulated_log[1:], key=itemgetter(3))
log.info(f"Minimum number of zipcode : {sorted_by_zipcode[0][3]}")
log.info(f"Maximum number of zipcode : {sorted_by_zipcode[-1][3]}")

[2023-07-04 21:47:02,309] {150575055.py:2} INFO - Minimum number of zipcode : 00501
[2023-07-04 21:47:02,310] {150575055.py:3} INFO - Maximum number of zipcode : 99950


In [41]:
def filter_phonenumber_format(record):
    if (re.match("^\(\d{3}\)\d{3}-\d{4}$", record[10])):
        return True
    else:
        return False

phonenumber_output = []
phonnumber_formats = list(filter(filter_phonenumber_format, accumulated_log))
log.info(f"Total record(s) of special phone number format : {len(phonnumber_formats)}")
with open("./special_phonenumber.txt", "w") as file:
    for i in phonnumber_formats:
        file.write(f'{i[10]}\n')

[2023-07-04 21:47:02,817] {2500381286.py:9} INFO - Total record(s) of special phone number format : 36164
