### Create custom dataset

In [11]:
import pandas as pd

notifications_df = pd.read_csv('Push Notif.txt', quotechar='"', escapechar='\\')

app_labels = ['Gojek', 'GoPay', 'Grab', 'Jenius', 'Amazon Shopping']
filtered_df = notifications_df[notifications_df['APP LABEL'].isin(app_labels)]
filtered_df.to_csv('custom_notifications.csv', index=False)
print(filtered_df.head())

app_counts = filtered_df['APP LABEL'].value_counts()
print("\nCount of notifications by app label:")
print(app_counts)

# To see if any of the requested labels are missing from the data
missing_labels = set(app_labels) - set(app_counts.index)
if missing_labels:
    print("\nRequested labels not found in the data:", missing_labels)

      ID            PACKAGE NAME APP LABEL  \
0    437         com.gojek.gopay     GoPay   
1    439  com.grabtaxi.passenger      Grab   
128  942         com.gojek.gopay     GoPay   
129  943           com.gojek.app     Gojek   
130  944         com.gojek.gopay     GoPay   

                                     MESSAGE DATE  \
0          Pembayaran ke TOKOPEDIA berhasil.  NaN   
1            Gercep biar kebagian diskonnya.  NaN   
128         Ka-ching! You got GoPay Coins 💰.  NaN   
129  Payment successfully made to TOKOPEDIA.  NaN   
130  Payment successfully made to TOKOPEDIA.  NaN   

                                              CONTENTS            TIMESTAMP  
0    Dana sebesar Rp1.407 telah dipotong dari GoPay...  2023-07-09 03:13:14  
1                 Diskon s.d. 30% tiap Minggu-Selasa 👉  2023-07-09 03:44:43  
128  Yay, you just earned 997 GoPay Coins! Tap here...  2023-07-09 10:21:29  
129  An amount of Rp997 has been made from your GoPay.  2023-07-09 10:37:59  
130  An amount

### Notification parser

In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import warnings
from enum import Enum
import spacy
from spacy.tokens import DocBin
from spacy.lang.en import English

# Suppress warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)

class TransactionType(Enum):
    INCOME = "income"
    EXPENSE = "expense"
    TRANSFER = "transfer"
    TOP_UP = "top_up"
    UNKNOWN = "unknown"

class Transaction:
    def __init__(self):
        self.user = None
        self.timestamp = datetime.now().isoformat()
        self.transaction_type = TransactionType.UNKNOWN
        self.amount = None
        self.account_number = None
        self.from_account = None
        self.to_account = None
        self.balance = None
        self.category = None
    
    def to_dict(self):
        # Get transaction type as string
        transaction_type_str = self.transaction_type.value if isinstance(self.transaction_type, TransactionType) else str(self.transaction_type)
        
        # Determine the final category based on transaction_type if category is not set
        final_category = self.category
        if not final_category and self.transaction_type != TransactionType.UNKNOWN:
            if self.transaction_type == TransactionType.INCOME:
                final_category = "other"
            elif self.transaction_type == TransactionType.EXPENSE:
                final_category = "other"
            elif self.transaction_type == TransactionType.TRANSFER:
                final_category = "general"
            elif self.transaction_type == TransactionType.TOP_UP:
                final_category = "finance"
        
        return {
            "user": self.user,
            "timestamp": self.timestamp,
            "transaction_type": transaction_type_str,
            "amount": self.amount,
            "account_number": self.account_number,
            "from_account": self.from_account,
            "to_account": self.to_account,
            "balance": self.balance,
            "category": final_category,
        }

class NotificationParser:
    def __init__(self):
        # Define categories and their related keywords - focused on Indonesian mobile payments
        self.categories = {
            'food': ['food', 'meal', 'restaurant', 'order', 'menu', 'eat', 'lunch', 'dinner', 'breakfast', 'gofood'],
            'transport': ['ride', 'trip', 'gocar', 'grab', 'transport', 'travel', 'driver', 'gojek ride', 'blue bird', 'taxi'],
            'shopping': ['purchase', 'buy', 'shopping', 'shop', 'amazon', 'store', 'item', 'product'],
            'entertainment': ['movie', 'ticket', 'entertainment', 'game', 'music', 'streaming'],
            'bills': ['bill', 'utility', 'electricity', 'water', 'internet', 'phone', 'subscription'],
            'transfer': ['transfer', 'send money', 'receive money'],
            'finance': ['gopay', 'payment', 'wallet', 'jenius', 'bank', 'credit', 'debit', 'card', 'saving'],
            'education': ['course', 'class', 'learning', 'tuition', 'school', 'university'],
            'health': ['medicine', 'doctor', 'hospital', 'health', 'medical', 'pharmacy']
        }
        
        # Define well-known merchants and their categories
        self.merchants = {
            "blue bird": "transport",
            "gojek ride": "transport",
            "grab": "transport",
            "taxi": "transport",
            "uber": "transport",
            "gofood": "food",
            "grabfood": "food",
            "food delivery": "food",
            "tokopedia": "shopping",
            "shopee": "shopping",
            "lazada": "shopping",
            "amazon": "shopping"
        }
        
        # Configure regex patterns for transaction extraction
        self.patterns = [
            # Income patterns
            {
                "regex": r"(?:received|receive|payment from|transfer from)\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)",
                "type": TransactionType.INCOME,
                "extract": lambda m: {
                    "amount": m.group(1)
                }
            },
            # Expense patterns
            {
                "regex": r"(?:paid|payment to|sent|pay|purchase|bought|deducted)\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)",
                "type": TransactionType.EXPENSE,
                "extract": lambda m: {
                    "amount": m.group(1)
                }
            },
            # Transfer patterns
            {
                "regex": r"(?:transfer|send to|sent to)\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)",
                "type": TransactionType.TRANSFER,
                "extract": lambda m: {
                    "amount": m.group(1)
                }
            },
            # Top-up patterns
            {
                "regex": r"(?:top-up|top up|topup|topped up)\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)",
                "type": TransactionType.TOP_UP,
                "extract": lambda m: {
                    "amount": m.group(1)
                }
            },
            # Balance patterns
            {
                "regex": r"(?:balance|saldo)(?:\s+is|\s+now|\s+remaining|\:)?\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)",
                "type": None,  # This pattern doesn't determine transaction type
                "extract": lambda m: {
                    "balance": m.group(1)
                }
            },
            # GoPay Coins patterns
            {
                "regex": r"(\d+)\s+(?:GoPay Coins|Coins)",
                "type": TransactionType.INCOME,
                "extract": lambda m: {
                    "amount": m.group(1),
                    "category": "cashback"
                }
            },
            # From account patterns
            {
                "regex": r"from\s+(?:account)?\s*(?:number)?\s*[:\.]?\s*([\w\s]+?)(?:\.|\s*$)",
                "type": None,
                "extract": lambda m: {
                    "from_account": m.group(1).strip()
                }
            },
            # To account patterns
            {
                "regex": r"to\s+(?:account)?\s*(?:number)?\s*[:\.]?\s*([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)",
                "type": None,
                "extract": lambda m: {
                    "to_account": m.group(1).strip().rstrip('.')
                }
            },
            # Account number patterns
            {
                "regex": r"account\s+(?:number|#)?\s*[:\.]?\s*(\d+)",
                "type": None,
                "extract": lambda m: {
                    "account_number": m.group(1)
                }
            }
        ]
    
    def parse_date(self, date_str):
        """Parse date in various formats"""
        if not date_str:
            return None
            
        try:
            # Handle formats like 07Apr25
            if re.match(r"\d{2}[A-Za-z]{3}\d{2}", date_str):
                return datetime.strptime(date_str, "%d%b%y").date()
            # Handle formats like 07-04-2023
            elif re.match(r"\d{1,2}-\d{1,2}-\d{2,4}", date_str):
                # Try different formats
                for fmt in ["%d-%m-%Y", "%d-%m-%y", "%m-%d-%Y", "%m-%d-%y"]:
                    try:
                        return datetime.strptime(date_str, fmt).date()
                    except ValueError:
                        continue
            return None
        except:
            return None
    
    def extract_amount(self, text):
        """Extract monetary amounts from text."""
        # Ensure text is a string
        if not isinstance(text, str):
            return None
            
        # Pattern for currency amounts with various formats
        patterns = [
            r'(?:Rp|IDR)\s*(\d+(?:[.,]\d+)*)',  # Rp or IDR followed by digits
            r'(\d+(?:[.,]\d+)*)\s*(?:rupiah|rupi)',  # Digits followed by rupiah
            r'(?:received|sent|paid|payment|transfer|top.up|topup|refund of|refund|cashback of|cashback|balance)\s+(?:of\s+)?(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)',  # Transaction verbs followed by amount
            r'(?:\d+)\s+(?:GoPay Coins|Coins)',  # Coins amount
            r'(\d+(?:[.,]\d+)*)' # Just digits as fallback
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                # Clean up the amount and convert to float
                amount_str = matches[0]
                if isinstance(amount_str, tuple) and len(amount_str) > 0:
                    amount_str = amount_str[0]
                amount_str = amount_str.replace('.', '').replace(',', '.')
                try:
                    return float(amount_str)
                except ValueError:
                    continue
        
        # Special case for GoPay Coins
        coin_match = re.search(r'(\d+)\s+(?:GoPay Coins|Coins)', text)
        if coin_match:
            return float(coin_match.group(1))
        
        return None
    
    def extract_transaction_type(self, text):
        """Determine transaction type based on text content."""
        # Ensure text is a string
        if not isinstance(text, str):
            return TransactionType.UNKNOWN
            
        text_lower = text.lower()
        
        # Income patterns
        if any(word in text_lower for word in ['received', 'receive', 'refund', 'cashback', 'payment from', 'transfer from']):
            return TransactionType.INCOME
        
        # Expense patterns
        elif any(word in text_lower for word in ['paid', 'payment to', 'sent', 'pay', 'purchase', 'bought', 'deducted']):
            return TransactionType.EXPENSE
        
        # Transfer patterns
        elif any(word in text_lower for word in ['transfer', 'send to', 'sent to']):
            return TransactionType.TRANSFER
        
        # Top-up patterns
        elif any(word in text_lower for word in ['top-up', 'top up', 'topup', 'topped up']):
            return TransactionType.TOP_UP
        
        return TransactionType.UNKNOWN
    
    def extract_account_number(self, text):
        """Extract account numbers from text."""
        # Ensure text is a string
        if not isinstance(text, str):
            return None
            
        # Pattern for account numbers
        patterns = [
            r'account\s+(?:number|#)?\s*[:\.]?\s*(\d+)',  # Account number: digits
            r'card\s+(?:number|#)?\s*[:\.]?\s*[*xX]+(\d{4})',  # Card number ending with 4 digits
            r'to\s+account\s+(?:number|#)?\s*[:\.]?\s*(\d+)',  # To account number
            r'from\s+account\s+(?:number|#)?\s*[:\.]?\s*(\d+)',  # From account number
            r'(?:account|card)\s+ending\s+(?:in|with)\s+(\d{4})'  # Account ending with digits
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        
        return None
    
    def extract_balance(self, text):
        """Extract balance information from text."""
        # Ensure text is a string
        if not isinstance(text, str):
            return None
        
        # Patterns for balance amounts
        balance_patterns = [
            r'(?:balance|saldo)(?:\s+is|\s+now|\s+remaining|\:)?\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)',  # Balance: amount
            r'(?:available|remaining)\s+(?:balance|saldo)(?:\s+is|\:)?\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)',  # Available balance
            r'(?:you\s+have|your)\s+(?:balance|saldo)(?:\s+is|\:)?\s+(?:Rp|IDR)?\s*(\d+(?:[.,]\d+)*)'  # You have balance
        ]
        
        for pattern in balance_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                balance_str = match.group(1).replace('.', '').replace(',', '.')
                try:
                    return float(balance_str)
                except ValueError:
                    continue
        
        return None
    
    def extract_category(self, text):
        """Categorize transactions based on content."""
        # Ensure text is a string
        if not isinstance(text, str):
            return "other"
            
        text_lower = text.lower()
        
        # Check for merchant-based categories first
        for merchant, category in self.merchants.items():
            if merchant in text_lower:
                return category
        
        # Then check keyword-based categories
        for category, keywords in self.categories.items():
            if any(keyword in text_lower for keyword in keywords):
                return category
        
        return "other"
    
    def extract_from_account(self, text):
        """Extract sender account information."""
        # Ensure text is a string
        if not isinstance(text, str):
            return None
            
        patterns = [
            # Generalized patterns to capture multi-word sender names
            r'from\s+(?:account)?\s*(?:number)?\s*[:\.]?\s*([\w\s]+?)(?:\.|\s*$)',
            r'([\w\s]+?)\s+sent\s+you',
            r'received\s+from\s+([\w\s]+?)(?:\.|\s*$)',
            r'from\s+your\s+([\w\s]+?)(?:\.|\s*$)'  # To catch "from your GoPay"
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Clean up the extracted sender name (remove trailing whitespace)
                return match.group(1).strip()
        
        return None
    
    def extract_to_account(self, text):
        """Extract recipient account information."""
        # Ensure text is a string
        if not isinstance(text, str):
            return None
            
        patterns = [
            # Improved patterns to capture multi-word merchant names with better boundary detection
            r'to\s+(?:account)?\s*(?:number)?\s*[:\.]?\s*([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)',
            r'to\s+([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)',  # "to Blue Bird." or "to Blue Bird"
            r'sent\s+to\s+([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)',  # "sent to Blue Bird." or "sent to Blue Bird"
            r'paid\s+to\s+([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)',  # "paid to Blue Bird." or "paid to Blue Bird"
            r'payment\s+(?:successfully\s+)?made\s+to\s+([\w\s\-&\']+?)(?:\.|\s*$|\s+for|\s+on|\s+at|\s+with)'  # "payment made to Blue Bird."
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Clean up the extracted merchant name (remove trailing whitespace and handle periods)
                merchant_name = match.group(1).strip()
                # Remove trailing period if it exists
                if merchant_name.endswith('.'):
                    merchant_name = merchant_name[:-1]
                return merchant_name
        
        return None

    def parse_notification(self, message, contents, user_id, timestamp):
        """Parse notification and extract transaction details"""
        transaction = Transaction()
        
        # Set basic information
        transaction.user = user_id
        if timestamp:
            transaction.timestamp = timestamp
        
        # Combine message and contents for better entity extraction
        full_text = f"{message} {contents}"
        
        # First try pattern-based extraction with the comprehensive patterns
        for pattern in self.patterns:
            match = re.search(pattern["regex"], full_text, re.IGNORECASE)
            if match:
                # Set transaction type if the pattern defines one
                if pattern["type"]:
                    transaction.transaction_type = pattern["type"]
                
                # Extract data based on the pattern's extraction function
                data = pattern["extract"](match)
                
                for key, value in data.items():
                    if key == "amount" and value:
                        try:
                            transaction.amount = float(value.replace(',', '').replace('.', ''))
                        except (ValueError, AttributeError):
                            pass
                    elif key == "balance" and value:
                        try:
                            transaction.balance = float(value.replace(',', '').replace('.', ''))
                        except (ValueError, AttributeError):
                            pass
                    elif key == "account_number":
                        transaction.account_number = value
                    elif key == "from_account":
                        transaction.from_account = value
                    elif key == "to_account":
                        transaction.to_account = value
                    elif key == "category":
                        transaction.category = value
        
        # If transaction type is still unknown, try to determine it
        if transaction.transaction_type == TransactionType.UNKNOWN:
            transaction.transaction_type = self.extract_transaction_type(full_text)
        
        # Use the specialized extraction functions for any missing fields
        if transaction.amount is None:
            transaction.amount = self.extract_amount(full_text)
        
        if transaction.account_number is None:
            transaction.account_number = self.extract_account_number(full_text)
        
        if transaction.from_account is None:
            transaction.from_account = self.extract_from_account(full_text)
        
        if transaction.to_account is None:
            transaction.to_account = self.extract_to_account(full_text)
        
        if transaction.balance is None:
            transaction.balance = self.extract_balance(full_text)
        
        # Apply default values for from_account and to_account based on transaction type
        if transaction.transaction_type == TransactionType.INCOME and not transaction.to_account:
            transaction.to_account = 'GoPay'
        elif transaction.transaction_type == TransactionType.EXPENSE and not transaction.from_account:
            transaction.from_account = 'GoPay'
        
        # Determine category last to take advantage of all extracted information
        if not transaction.category:
            # Try to categorize based on to_account first (for merchants)
            if transaction.to_account:
                to_account_lower = transaction.to_account.lower()
                for merchant, category in self.merchants.items():
                    if merchant in to_account_lower:
                        transaction.category = category
                        break
            
            # If still no category, try generic text categorization
            if not transaction.category:
                transaction.category = self.extract_category(full_text)
        
        return transaction

def process_notification_data(df):
    """
    Process the notifications dataframe to extract structured transaction data.
    """
    parser = NotificationParser()
    results = []
    
    # Process each notification
    for _, row in df.iterrows():
        try:
            # Handle null or non-string values in MESSAGE and CONTENTS
            message = str(row['MESSAGE']) if pd.notna(row['MESSAGE']) else ""
            contents = str(row['CONTENTS']) if isinstance(row['CONTENTS'], str) and pd.notna(row['CONTENTS']) else ""
            
            # Extract user ID
            user = str(row['ID']) if pd.notna(row['ID']) else "unknown_id"
            
            # Extract timestamp
            timestamp = row['TIMESTAMP'] if pd.notna(row['TIMESTAMP']) else datetime.now().isoformat()
            
            # Parse the notification
            transaction = parser.parse_notification(message, contents, user, timestamp)
            
            # Add to results
            results.append(transaction.to_dict())
            
        except Exception as e:
            print(f"Error processing row: {e}")
            # Continue with next row if there's an error in processing current row
            continue
    
    return pd.DataFrame(results)

In [15]:
try:
    # Load the dataset
    print("Loading notifications data...")
    notifications_df = pd.read_csv('custom_notifications.csv')
    
    print(f"Loaded {len(notifications_df)} notifications")
    
    # Process the data
    print("Processing notifications...")
    results_df = process_notification_data(notifications_df)
    
    # Remove entries with "unknown" transaction type
    valid_results_df = results_df[results_df['transaction_type'] != 'unknown']
    
    # Count transactions by type
    transaction_counts = valid_results_df['transaction_type'].value_counts()
    
    # Print transaction counts
    print("\nTransaction Type Summary:")
    print(f"Income: {transaction_counts.get('income', 0)}")
    print(f"Expense: {transaction_counts.get('expense', 0)}")
    print(f"Transfer: {transaction_counts.get('transfer', 0)}")
    print(f"Top-up: {transaction_counts.get('top_up', 0)}")
    print(f"Total valid transactions: {len(valid_results_df)}")
    
    if len(results_df) - len(valid_results_df) > 0:
        print(f"Removed {len(results_df) - len(valid_results_df)} unknown transaction entries")
    
    # Category distribution
    category_counts = valid_results_df['category'].value_counts()
    print("\nCategory Distribution:")
    for category, count in category_counts.items():
        print(f"{category}: {count}")
    
    # Calculate financial summary
    if 'amount' in valid_results_df.columns:
        income_amount = valid_results_df[valid_results_df['transaction_type'] == 'income']['amount'].sum()
        expense_amount = valid_results_df[valid_results_df['transaction_type'] == 'expense']['amount'].sum()
        
        print("\nFinancial Summary:")
        print(f"Total Income: {income_amount:,.2f}")
        print(f"Total Expenses: {expense_amount:,.2f}")
        print(f"Net Balance: {income_amount - expense_amount:,.2f}")
    
    # Save to CSV
    valid_results_df.to_csv('processed_transactions.csv', index=False)
    
    print(f"\nSuccessfully processed and saved {len(valid_results_df)} transactions.")
    
except Exception as e:
    print(f"Error processing data: {e}")


Loading notifications data...
Loaded 4658 notifications
Processing notifications...

Transaction Type Summary:
Income: 899
Expense: 1090
Transfer: 57
Top-up: 22
Total valid transactions: 2068
Removed 2590 unknown transaction entries

Category Distribution:
finance: 915
shopping: 272
other: 250
bills: 198
cashback: 196
transfer: 78
food: 71
entertainment: 50
transport: 38

Financial Summary:
Total Income: 7,261,123.16
Total Expenses: 98,382,713.58
Net Balance: -91,121,590.42

Successfully processed and saved 2068 transactions.
