### Testing on raw-data

##### Create custom dataset

In [1]:
import pandas as pd

# Read the raw notification export; fields are double-quoted and use a
# backslash as the escape character.
notifications_df = pd.read_csv('raw-data.txt', quotechar='"', escapechar='\\')

# Disabled alternative: keep only an explicit allowlist of apps instead of
# dropping a blocklist.
# app_labels = ['Gojek', 'GoPay', 'Grab', 'Jenius', 'Amazon Shopping']
# filtered_df = notifications_df[notifications_df['APP LABEL'].isin(app_labels)]
# filtered_df.to_csv('custom_notifications.csv', index=False)
# print(filtered_df.head())

# App labels whose notifications are discarded.
unwanted_labels = ['Stay Focused', 'Android System']

# Raw export header -> snake_case column name written to the custom CSV.
column_mapping = dict([
    ('ID', 'id'),
    ('PACKAGE NAME', 'package_name'),
    ('APP LABEL', 'app_label'),
    ('MESSAGE', 'message'),
    ('DATE', 'date'),
    ('CONTENTS', 'contents'),
    ('TIMESTAMP', 'timestamp'),
])

# Drop the unwanted apps and rename the columns in one pass.
is_unwanted = notifications_df['APP LABEL'].isin(unwanted_labels)
filtered_df = notifications_df.loc[~is_unwanted].rename(columns=column_mapping)

filtered_df.to_csv('custom_notifications.csv', index=False)

# Per-app notification counts, most frequent first.
app_counts = filtered_df['app_label'].value_counts()
print("\nCount of notifications by app label:", app_counts, sep="\n")

# Disabled companion to the allowlist variant above: report requested labels
# that never occur in the data.
# missing_labels = set(app_labels) - set(app_counts.index)
# if missing_labels:
#     print("\nRequested labels not found in the data:", missing_labels)


Count of notifications by app label:
app_label
Tokopedia                                                                                                              6777
FT                                                                                                                     3896
Messages                                                                                                               3672
MSN                                                                                                                    2667
Duolingo                                                                                                               2467
                                                                                                                       ... 
by.U                                                                                                                      1
My HKG                                                                              

##### Notification parser

In [2]:
import pandas as pd
from notification_reader import process_notification_data

def main(
    notifications_file='custom_notifications.csv',
    dictionary_file='dictionary.json',
    patterns_file='regex_patterns.json',
    output_file='processed_transactions.csv',
):
    """Parse a notifications CSV into transactions and save the valid ones.

    Loads ``notifications_file`` with pandas, passes the frame together with
    the dictionary and pattern file paths to ``process_notification_data``,
    drops rows whose ``transaction_type`` is ``'unknown'`` or whose ``amount``
    is missing, writes what is left to ``output_file`` and prints a summary.

    Args:
        notifications_file: CSV of notifications to process.
        dictionary_file: Path handed to the parser as its dictionary file.
        patterns_file: Path handed to the parser as its patterns file.
        output_file: Destination CSV for the valid transactions.

    Returns:
        None. Any ``Exception`` raised along the way is caught and reported
        on stdout rather than propagated.
    """
    try:
        # Load the dataset
        print("Loading notifications data...")
        notifications_df = pd.read_csv(notifications_file)
        print(f"Loaded {len(notifications_df)} notifications")

        # Process the data with dictionary and patterns files
        print("Processing notifications...")
        results_df = process_notification_data(
            notifications_df,
            dictionary_file,
            patterns_file,
        )

        # Keep only rows with a known transaction type and a non-null amount.
        valid_results_df = results_df[results_df['transaction_type'] != 'unknown']
        valid_results_df = valid_results_df.dropna(subset=['amount'])

        print(f"Found {len(valid_results_df)} valid transactions out of {len(results_df)} processed")

        # Save to CSV
        valid_results_df.to_csv(output_file, index=False)

        print(f"\nSuccessfully processed and saved {len(valid_results_df)} transactions to {output_file}")

        # Display summary statistics (skipped when nothing survived filtering)
        if len(valid_results_df) > 0:
            print("\nTransaction Summary:")
            print("Transaction Types:")
            print(valid_results_df['transaction_type'].value_counts())
            print("\nTop Categories:")
            print(valid_results_df['category'].value_counts().head(10))

    except FileNotFoundError as e:
        # The exception names the missing path; list every input for convenience.
        print(f"File not found: {e}")
        print("Please ensure the following files exist:")
        print(f"- {notifications_file}")
        print(f"- {dictionary_file}")
        print(f"- {patterns_file}")
    except KeyError as e:
        print(f"Missing required data in dictionary file: {e}")
    except ValueError as e:
        print(f"Invalid file format: {e}")
    except Exception as e:
        # Best-effort notebook cell: report instead of raising.
        print(f"Error processing data: {e}")

if __name__ == "__main__":
    main()

Loading notifications data...
Loaded 42369 notifications
Processing notifications...


Processed 942 transactions
Skipped 30082 apps not in allowlist
Skipped 0 blacklisted apps
Filtered out 7074 promotional messages
Found 911 valid transactions out of 942 processed

Successfully processed and saved 911 transactions to processed_transactions.csv

Transaction Summary:
Transaction Types:
transaction_type
expense     726
income      111
transfer     74
Name: count, dtype: int64

Top Categories:
category
miscellaneous    496
savings          242
shopping         117
housing           19
entertainment     17
food               7
refund             7
transport          4
business           2
Name: count, dtype: int64


### Testing on raw-data-1

In [3]:
import pandas as pd
from notification_reader import process_notification_data

def main(
    notifications_file='raw-data-1.csv',
    dictionary_file='dictionary.json',
    patterns_file='regex_patterns.json',
    output_file='processed_transactions_1.csv',
    filtered_file='custom_notifications_1.csv',
):
    """Filter, normalise and parse a raw notifications CSV into transactions.

    Steps: load ``notifications_file``; drop rows whose ``app_label`` is one
    of the unwanted system labels; rename the raw columns (``_id``, ``title``,
    ``text``, ``post_time``) to ``id``, ``message``, ``contents`` and
    ``timestamp``; add a constant ``user_id`` column when absent; save the
    filtered frame to ``filtered_file``; run ``process_notification_data``;
    keep rows with a known ``transaction_type`` and a non-null ``amount``;
    write them to ``output_file`` and print summary statistics.

    Args:
        notifications_file: Raw notifications CSV (double-quoted fields,
            backslash escapes). Must contain an ``app_label`` column.
        dictionary_file: Path handed to the parser as its dictionary file.
        patterns_file: Path handed to the parser as its patterns file.
        output_file: Destination CSV for the valid transactions.
        filtered_file: Destination CSV for the filtered/renamed notifications.

    Returns:
        None. Any ``Exception`` raised along the way is caught and reported
        on stdout rather than propagated.
    """
    try:
        # Load the dataset
        print("Loading notifications data...")
        notifications_df = pd.read_csv(notifications_file, quotechar='"', escapechar='\\')
        print(f"Loaded {len(notifications_df)} notifications")

        # Fail with an accurate message when the input lacks the column we
        # filter on; otherwise the KeyError handler below would blame the
        # dictionary file for what is really a problem with the CSV.
        if 'app_label' not in notifications_df.columns:
            raise ValueError(
                f"{notifications_file} has no 'app_label' column "
                f"(found columns: {list(notifications_df.columns)})"
            )

        # Filter out unwanted system notifications
        unwanted_labels = ['Stay Focused', 'Android System', 'android systemui']
        is_unwanted = notifications_df['app_label'].isin(unwanted_labels)

        # Raw column name -> name used downstream (identity entries kept for
        # documentation of the full expected schema).
        column_mapping = {
            '_id': 'id',
            'package_name': 'package_name',
            'app_label': 'app_label',
            'title': 'message',
            'text': 'contents',
            'post_time': 'timestamp',
        }

        # rename() returns a new frame, so the column assignment below does
        # not write into a view of notifications_df.
        filtered_df = notifications_df.loc[~is_unwanted].rename(columns=column_mapping)

        # Add user_id column if it doesn't exist (using a default value)
        if 'user_id' not in filtered_df.columns:
            filtered_df['user_id'] = 'default_user'

        # Save filtered data for inspection
        filtered_df.to_csv(filtered_file, index=False)
        print(f"Filtered to {len(filtered_df)} notifications after removing system apps")

        # Show app distribution (top 10)
        app_counts = filtered_df['app_label'].value_counts()
        print("\nCount of notifications by app label:")
        print(app_counts.head(10))

        # Process the data with dictionary and patterns files
        print("\nProcessing notifications...")
        results_df = process_notification_data(
            filtered_df,
            dictionary_file,
            patterns_file,
        )

        # Keep only rows with a known transaction type and a non-null amount.
        valid_results_df = results_df[results_df['transaction_type'] != 'unknown']
        valid_results_df = valid_results_df.dropna(subset=['amount'])

        print(f"Found {len(valid_results_df)} valid transactions out of {len(results_df)} processed")

        # Save to CSV
        valid_results_df.to_csv(output_file, index=False)

        print(f"\nSuccessfully processed and saved {len(valid_results_df)} transactions to {output_file}")

        # Display summary statistics (skipped when nothing survived filtering)
        if len(valid_results_df) > 0:
            print("\nTransaction Summary:")
            print("Transaction Types:")
            print(valid_results_df['transaction_type'].value_counts())
            print("\nTop Categories:")
            print(valid_results_df['category'].value_counts().head(10))
            print("\nAmount Statistics:")
            print(f"Total Amount: {valid_results_df['amount'].sum():.2f}")
            print(f"Average Amount: {valid_results_df['amount'].mean():.2f}")
            print(f"Min Amount: {valid_results_df['amount'].min():.2f}")
            print(f"Max Amount: {valid_results_df['amount'].max():.2f}")

    except FileNotFoundError as e:
        # The exception names the missing path; list every input for convenience.
        print(f"File not found: {e}")
        print("Please ensure the following files exist:")
        print(f"- {notifications_file}")
        print(f"- {dictionary_file}")
        print(f"- {patterns_file}")
    except KeyError as e:
        print(f"Missing required data in dictionary file: {e}")
    except ValueError as e:
        print(f"Invalid file format: {e}")
    except Exception as e:
        # Best-effort notebook cell: report instead of raising.
        print(f"Error processing data: {e}")

if __name__ == "__main__":
    main()

Loading notifications data...
Loaded 1000 notifications
Filtered to 539 notifications after removing system apps

Count of notifications by app label:
app_label
whatsapp                                 415
microsoft windowsintune companyportal     35
gojek app                                 18
bca mybca omni android                    15
bitsmedia android muslimpro               13
huawei health                             12
samsung android calendar                   8
samsung android bixby wakeup               6
bumble app                                 3
grabtaxi passenger                         3
Name: count, dtype: int64

Processing notifications...
Filtered out 4 promotional messages
Found 15 valid transactions out of 15 processed

Successfully processed and saved 15 transactions to processed_transactions_1.csv

Transaction Summary:
Transaction Types:
transaction_type
expense    15
Name: count, dtype: int64

Top Categories:
category
transport        5
food             4
shoppi

In [None]:
import pandas as pd
from notification_reader import process_notification_data

# NOTE(review): this cell defines main() again for the raw-data-1 input. The
# stored output under this cell (a config-file debug banner and a "keeping
# only financial apps" message) is not printed by this code, so it appears to
# come from a different revision -- re-run the cell to refresh it.
def main(
    notifications_file='raw-data-1.csv',
    dictionary_file='dictionary.json',
    patterns_file='regex_patterns.json',
    output_file='processed_transactions_1.csv',
    filtered_file='custom_notifications_1.csv',
):
    """Filter, normalise and parse a raw notifications CSV into transactions.

    Steps: load ``notifications_file``; drop rows whose ``app_label`` is one
    of the unwanted system labels; rename the raw columns (``_id``, ``title``,
    ``text``, ``post_time``) to ``id``, ``message``, ``contents`` and
    ``timestamp``; add a constant ``user_id`` column when absent; save the
    filtered frame to ``filtered_file``; run ``process_notification_data``;
    keep rows with a known ``transaction_type`` and a non-null ``amount``;
    write them to ``output_file`` and print summary statistics.

    Args:
        notifications_file: Raw notifications CSV (double-quoted fields,
            backslash escapes). Must contain an ``app_label`` column.
        dictionary_file: Path handed to the parser as its dictionary file.
        patterns_file: Path handed to the parser as its patterns file.
        output_file: Destination CSV for the valid transactions.
        filtered_file: Destination CSV for the filtered/renamed notifications.

    Returns:
        None. Any ``Exception`` raised along the way is caught and reported
        on stdout rather than propagated.
    """
    try:
        # Load the dataset
        print("Loading notifications data...")
        notifications_df = pd.read_csv(notifications_file, quotechar='"', escapechar='\\')
        print(f"Loaded {len(notifications_df)} notifications")

        # Fail with an accurate message when the input lacks the column we
        # filter on; otherwise the KeyError handler below would blame the
        # dictionary file for what is really a problem with the CSV.
        if 'app_label' not in notifications_df.columns:
            raise ValueError(
                f"{notifications_file} has no 'app_label' column "
                f"(found columns: {list(notifications_df.columns)})"
            )

        # Filter out unwanted system notifications
        unwanted_labels = ['Stay Focused', 'Android System', 'android systemui']
        is_unwanted = notifications_df['app_label'].isin(unwanted_labels)

        # Raw column name -> name used downstream (identity entries kept for
        # documentation of the full expected schema).
        column_mapping = {
            '_id': 'id',
            'package_name': 'package_name',
            'app_label': 'app_label',
            'title': 'message',
            'text': 'contents',
            'post_time': 'timestamp',
        }

        # rename() returns a new frame, so the column assignment below does
        # not write into a view of notifications_df.
        filtered_df = notifications_df.loc[~is_unwanted].rename(columns=column_mapping)

        # Add user_id column if it doesn't exist (using a default value)
        if 'user_id' not in filtered_df.columns:
            filtered_df['user_id'] = 'default_user'

        # Save filtered data for inspection
        filtered_df.to_csv(filtered_file, index=False)
        print(f"Filtered to {len(filtered_df)} notifications after removing system apps")

        # Show app distribution (top 10)
        app_counts = filtered_df['app_label'].value_counts()
        print("\nCount of notifications by app label:")
        print(app_counts.head(10))

        # Process the data with dictionary and patterns files
        print("\nProcessing notifications...")
        results_df = process_notification_data(
            filtered_df,
            dictionary_file,
            patterns_file,
        )

        # Keep only rows with a known transaction type and a non-null amount.
        valid_results_df = results_df[results_df['transaction_type'] != 'unknown']
        valid_results_df = valid_results_df.dropna(subset=['amount'])

        print(f"Found {len(valid_results_df)} valid transactions out of {len(results_df)} processed")

        # Save to CSV
        valid_results_df.to_csv(output_file, index=False)

        print(f"\nSuccessfully processed and saved {len(valid_results_df)} transactions to {output_file}")

        # Display summary statistics (skipped when nothing survived filtering)
        if len(valid_results_df) > 0:
            print("\nTransaction Summary:")
            print("Transaction Types:")
            print(valid_results_df['transaction_type'].value_counts())
            print("\nTop Categories:")
            print(valid_results_df['category'].value_counts().head(10))
            print("\nAmount Statistics:")
            print(f"Total Amount: {valid_results_df['amount'].sum():.2f}")
            print(f"Average Amount: {valid_results_df['amount'].mean():.2f}")
            print(f"Min Amount: {valid_results_df['amount'].min():.2f}")
            print(f"Max Amount: {valid_results_df['amount'].max():.2f}")

    except FileNotFoundError as e:
        # The exception names the missing path; list every input for convenience.
        print(f"File not found: {e}")
        print("Please ensure the following files exist:")
        print(f"- {notifications_file}")
        print(f"- {dictionary_file}")
        print(f"- {patterns_file}")
    except KeyError as e:
        print(f"Missing required data in dictionary file: {e}")
    except ValueError as e:
        print(f"Invalid file format: {e}")
    except Exception as e:
        # Best-effort notebook cell: report instead of raising.
        print(f"Error processing data: {e}")

if __name__ == "__main__":
    main()

=== CONFIG FILE DEBUG ===
✓ dictionary.json exists
✓ dictionary.json is valid JSON
✓ Found 'categories' in dictionary
✓ Found 'merchants' in dictionary
✓ Found 'transaction_types' in dictionary
  - transaction_types has keys: ['income', 'expense', 'transfer']
✓ Found 'blacklist' in dictionary
✓ regex_patterns.json exists
✓ regex_patterns.json is valid JSON
✓ Found 'amount_patterns' in patterns
✓ Found 'account_patterns' in patterns
=== END CONFIG DEBUG ===

Loading notifications data...
Loaded 1000 notifications
Filtered to 36 notifications after keeping only financial apps

Sample notifications:
App: gojek app
Title: Rp 53.000-nya belum kami tarik untuk pembelian GoFood-mu
Text: Pembayaran cuma akan ditarik kalau pembelianmu udah selesai.
---
App: gojek app
Title: nan
Text: nan
---
App: bca mybca omni android
Title: Catatan Finansial
Text: Pengeluaran sebesar IDR 53,000.00 di kategori Transportasi.
---

Count of financial notifications by app label:
app_label
gojek app                