In [14]:
import pandas as pd
import sqlite3
import gc
import sys
import os

# Make the project's `src` directory importable so the `data.*` helper
# modules imported below can be resolved.
src_path = os.path.abspath(r'C:\Users\User\OneDrive - Imperial College London\Desktop\Documents\GitHub\Credit-Card\src')
# This cell gets re-run within a single kernel session; only register the
# directory once so sys.path does not accumulate duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

from data.clean_data import drop_columns_if_exist, update_dtypes
from data.to_database import save_to_sqlite

# Run a full garbage-collection pass before the data is loaded.
# gc.collect() returns the number of unreachable objects it found; because it
# is the last expression in the cell, that integer is shown as the cell output.
gc.collect()

7

In [2]:
# Locations of the two raw parquet extracts (the "all" file and the
# "authenticated" file).
all_parquet_path = r'C:\Users\User\OneDrive - Imperial College London\Desktop\Documents\GitHub\Credit-Card\data\raw\all_2023q3_2024q1_FULL.parquet'
auth_parquet_path = r'C:\Users\User\OneDrive - Imperial College London\Desktop\Documents\GitHub\Credit-Card\data\raw\authenticated_2023q3_2024q1_FULL.parquet'

# Load each extract into its own DataFrame.
df = pd.read_parquet(all_parquet_path)
df_auth = pd.read_parquet(auth_parquet_path)

In [3]:
# Discard every column that contains nothing but missing values.
df = df.dropna(how='all', axis='columns')
df_auth = df_auth.dropna(how='all', axis='columns')

In [4]:
# Names of the columns to remove from both frames. They are passed to
# drop_columns_if_exist in the next cell.
columns_to_drop = [
    'callback_url', 'credit_card_token', 'order',
    'order_id', 'return_url', 'payment_method_id',
    'payment_processor_response', 'ucaf_authentication_data', 'ucaf_collection_indicator',
    'xid', 'language', 'merchant_url',
    'three_ds_method_data', 'is_auto_timeout', 'token_type',
    'request_token', 'merchant_name', 'merchant_reference_code',
    'descriptor', 'metadata', 'internal_metadata',
    'rewards', 'promotion', 'vat_amount',
    'api_version', 'authorization_id', 'authorization_additional_data',
    'authorization_receipt_number', 'authorization_request_token', 'assessment_id',
    'authorization_response', 'capture_response', 'mid_label',
    'network_token_id', 'processor_response', 'retrieval_reference_number',
    'card_data_id', 'capture_request_id', 'masked_card_number',
    'original_id', 'installment', 'fee_label',
    'transaction_id', 'version', 'authentication_id',
    'authorization_request_id', 'system_trace_audit_number', 'is_cvn_submitted',
    'pares_status', 'processor_type', 'proof_xml',
    'redirect_html', 'md', 'pa_req',
    'referer', 'request_id', 'term_url',
    'three_ds_result', 'three_ds_version', 'user_agent',
    'veres_enrolled', 'x_forwarded_for', 'initiate_three_ds_url',
    'authentication_verification_response', 'authentication_transaction_id', 'acs_url',
    'initial_client_type', 'directory_server_transaction_id', 'billing_details',
    'cc_version',
]

In [5]:
# Remove the unwanted columns from each frame; the helper is handed the frame
# and the list of names via DataFrame.pipe.
df = df.pipe(drop_columns_if_exist, columns_to_drop)
df_auth = df_auth.pipe(drop_columns_if_exist, columns_to_drop)

In [6]:
# Coerce the card expiry fields of `df` to integers.
#   1. pd.to_numeric(errors='coerce') turns unparseable entries into NaN.
#   2. +/-inf is mapped to NaN as well: fillna() alone leaves infinities in
#      place, and the integer cast below raises on any non-finite value.
#   3. Every remaining NaN is replaced by the placeholder -1 before the cast.
_non_finite = [float('inf'), float('-inf')]
for expiry_col in ('card_expiration_month', 'card_expiration_year'):
    df[expiry_col] = (
        pd.to_numeric(df[expiry_col], errors='coerce')
        .replace(_non_finite, float('nan'))
        .fillna(-1)
        .astype(int)
    )


# Report the (rows, columns) shape of both frames at this stage.
print(f"df shape: {df.shape}")
print(f"df_auth shape: {df_auth.shape}")


df shape: (391604, 51)
df_auth shape: (475066, 30)


In [7]:
# Target dtype for each column, keyed by column name. The same mapping is
# handed to update_dtypes for both frames.
# NOTE(review): 'cc_version' is also listed in columns_to_drop, so this entry
# looks unused once the drop has run - confirm before removing it.
df_dtypes = {
    'id': 'category',
    'acquiring_bank_name': 'category',
    'approval_code': 'category',
    'authorization_transaction_date': 'datetime64[ns]',
    'authorized_amount': 'float32',
    'avs_code': 'category',
    'bank_merchant_id': 'category',
    'bank_reconciliation_id': 'category',
    'business_id': 'category',
    'capture_amount': 'float32',
    'card_brand': 'category',
    'card_expiration_month': 'int8',
    'card_expiration_year': 'int16',
    'card_holder_name': 'category',
    'card_type': 'category',
    'cavv': 'category',
    'cc_version': 'category',
    'charge_type': 'category',
    'client_id': 'category',
    'client_type': 'category',
    'country': 'category',
    'country_code': 'category',
    'created': 'datetime64[ns]',
    'credit_card_payment_channel': 'category',
    'credit_card_processor': 'category',
    'credit_card_token_id': 'category',
    'currency': 'category',
    'cvn_code': 'category',
    'eci': 'category',
    'external_id': 'category',
    'failure_reason': 'category',
    'fee_amount': 'float32',
    'is_blocked_by_fraud': 'bool',
    'is_switcher': 'bool',
    'is_t4': 'bool',
    'issuing_bank_name': 'category',
    'merchant_id': 'category',
    'refund_status': 'category',
    'requester_email': 'category',
    'reversed_amount': 'float32',
    'settlement_status': 'category',
    'settlement_updated': 'datetime64[ns]',
    'should_authenticate_credit_card': 'bool',
    'should_settle_directly': 'bool',
    'status': 'category',
    'total_refund_amount': 'float32',
    'total_refund_fee_amount': 'float32',
    'transaction_channel': 'category',
    'ucaf': 'category',
    'updated': 'datetime64[ns]',
    'user_id': 'category',
    'use_reward': 'category',
    'dt': 'datetime64[ns]',
    'amount': 'float32',
    'authentication_type': 'category',
    'card_bank': 'category',
    'commerce_indicator': 'category',
    'credit_card_enrollment_info': 'category',
    'cybersource_merchant_id': 'category',
    'eci_raw': 'category',
    'ip_address': 'category',
    'is_enrolled': 'bool',
    'cof_type': 'category',
}


In [8]:
# Apply conversions
df = update_dtypes(df, df_dtypes)
df_auth = update_dtypes(df_auth, df_dtypes)

Total rows: 391604
Number of rows dropped: 0
Total rows: 475066
Number of rows dropped: 0


In [10]:
# Display the per-column dtypes of `df` to check the result of the
# update_dtypes conversion.
df.dtypes

id                                       category
acquiring_bank_name                      category
approval_code                            category
authorization_transaction_date     datetime64[ns]
authorized_amount                         float32
avs_code                                 category
bank_merchant_id                         category
bank_reconciliation_id                   category
business_id                              category
capture_amount                            float32
card_brand                               category
card_expiration_month                        int8
card_expiration_year                        int16
card_holder_name                         category
card_type                                category
cavv                                     category
charge_type                              category
client_id                                category
client_type                              category
country                                  category


In [12]:
# Display the per-column dtypes of `df_auth` to check the result of the
# update_dtypes conversion.
df_auth.dtypes

id                                   category
amount                                float32
authentication_type                  category
business_id                          category
card_bank                            category
card_brand                           category
card_expiration_month                    int8
card_expiration_year                    int16
cavv                                 category
client_id                            category
client_type                          category
commerce_indicator                   category
country                              category
country_code                         category
created                        datetime64[ns]
credit_card_enrollment_info          category
credit_card_payment_channel          category
credit_card_token_id                 category
currency                             category
cybersource_merchant_id              category
eci                                  category
eci_raw                           

In [16]:
# Directory that holds the SQLite output files.
save_dir = r'C:\Users\User\OneDrive - Imperial College London\Desktop\Documents\GitHub\Credit-Card\data\database (SQLite)'

# One database file per frame, both located inside save_dir.
db_df, db_df_auth = (
    os.path.join(save_dir, db_filename)
    for db_filename in ('processed_data.db', 'processed_authenticated_data.db')
)

In [17]:
# Write each frame to its own SQLite file under the given table name,
# `df` first and `df_auth` second.
for frame, db_path, table_name in (
    (df, db_df, 'data'),
    (df_auth, db_df_auth, 'authenticated_data'),
):
    save_to_sqlite(frame, db_path, table_name)

# Run a garbage-collection pass once both writes have finished.
gc.collect()

print("DataFrames have been stored as SQLite databases.")

DataFrames have been stored as SQLite databases.
