# v2 - Decision Trees Parallelized GridSearch


## MultiProcessing for FeatureEngineering -
    - distance, count_transactions_within_last_hour,betweeness centrality
2. Code is modularised in "references" folder 
    decisiontree_gridsearch_modelutils.py
      -- parallel_one_hot_encode  -- This Module will parallelize OneHot Encoding.
      -- preprocessdata_parallel_onehot_impute   -- This Module will parallelize preprocessing data
      -- perform_grid_search --uses n_jobs = -1 to parallelize GridSearch
      -- generate_classification_report_and_roc


In [857]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import geopy
from geopy.distance import geodesic
import pickle


In [858]:
import sys
sys.path.append('../references')  # Add the references folder to the system path


In [859]:
model_specs = 'DecisionTrees_Parallelized_GridSearch'

In [860]:
start_time_notebook = time.time()


In [861]:
start_time = time.time()


In [862]:
# Directory to save the figures 

input_src_dir = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/data/raw'
output_dir_figures_train = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/figures/train_figures'
output_dir_figures_test = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/figures/test_figures'

reports_output_dir_base = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports'
# reports_output_dir for DecisionTrees
reports_output_dir = f"{reports_output_dir_base}/DecisionTrees"
print(reports_output_dir)

/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees


In [863]:
# Define which dataset to use
use_test_data = False  # Set to True when using fraudtest.csv

# Determine dataset type based on the variable
dataset_type = 'Test' if use_test_data else 'Train'

# Load the appropriate dataset

if use_test_data:
    output_dir_figures = output_dir_figures_test
else:
    output_dir_figures = output_dir_figures_train

In [864]:
# Generate the preprocess file name dynamically
# Get the current timestamp
timestamp = time.strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS

logfile_title = 'LogFile'
logfile_name = f"{model_specs}_{dataset_type}_{logfile_title.replace(',', '').lower().split('.')[0]}_{timestamp}.txt"

logfile_path = os.path.join(reports_output_dir, logfile_name)

# Function to log times to a file
def log_time(step_name, start_time):
    end_time = time.time()
    elapsed_time = end_time - start_time
    log_message = (f"{step_name} completed at {time.ctime(end_time)}. "
                   f"Elapsed time: {elapsed_time // 60:.0f} minutes and {elapsed_time % 60:.2f} seconds\n")
    
    # Append log to file
    with open(logfile_path, 'a') as f:
        f.write(log_message)
    
    # Print the message to the console as well
    print(log_message)


In [865]:

log_time(f"{model_specs}_{dataset_type} Notebook  started at... ", start_time_notebook)
start_time = time.time()


DecisionTrees_Parallelized_GridSearch_Test Notebook  started at...  completed at Sun Nov  3 09:52:37 2024. Elapsed time: 0 minutes and 0.05 seconds



In [866]:
# Load the appropriate dataset

if use_test_data:
    df = pd.read_csv(f"{input_src_dir}/fraudTest.csv")  # Concatenate the directory with the filename
else:
    df = pd.read_csv(f"{input_src_dir}/fraudTrain.csv")


In [867]:
print(df.columns)
print(df.shape)


Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')
(555719, 23)


In [868]:
df.columns = df.columns.str.strip()
df = df.rename(columns={'amt': 'TransactionAmount', 'cc_num': 'CreditCardNumber', 'dob': 'DateOfBirth', 'trans_date_trans_time': 'TransactionTime'})
print(df.columns)


Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [869]:
# Generate a unique TransactionID for each row
df['TransactionID'] = range(1, len(df) + 1)


In [870]:
print(df.columns)
print(df.shape)


Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID'],
      dtype='object')
(555719, 24)


In [871]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# no missing values

Missing values per column:
 Unnamed: 0           0
TransactionTime      0
CreditCardNumber     0
merchant             0
category             0
TransactionAmount    0
first                0
last                 0
gender               0
street               0
city                 0
state                0
zip                  0
lat                  0
long                 0
city_pop             0
job                  0
DateOfBirth          0
trans_num            0
unix_time            0
merch_lat            0
merch_long           0
is_fraud             0
TransactionID        0
dtype: int64


In [872]:
# Count of fraud and non-fraud transactions
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

# Optionally, you can get it in percentage terms
fraud_percentage = df['is_fraud'].value_counts(normalize=True) * 100
print(fraud_percentage)

is_fraud
0    553574
1      2145
Name: count, dtype: int64
is_fraud
0    99.614014
1     0.385986
Name: proportion, dtype: float64


In [873]:
#how many unique credit cards in the data set ??
df['CreditCardNumber'].nunique()

924

In [874]:
print(df.columns)

Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID'],
      dtype='object')


In [875]:
# Convert TransactionTime to datetime
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])

# Optional: Convert DateOfBirth to datetime, if needed
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')

In [876]:
# Set 'TransactionTime' as the index permanently
df.set_index('TransactionTime', inplace=True)

# Verify the index
print(df.index)


DatetimeIndex(['2020-06-21 12:14:25', '2020-06-21 12:14:33',
               '2020-06-21 12:14:53', '2020-06-21 12:15:15',
               '2020-06-21 12:15:17', '2020-06-21 12:15:37',
               '2020-06-21 12:15:44', '2020-06-21 12:15:50',
               '2020-06-21 12:16:10', '2020-06-21 12:16:11',
               ...
               '2020-12-31 23:57:18', '2020-12-31 23:57:50',
               '2020-12-31 23:57:56', '2020-12-31 23:58:04',
               '2020-12-31 23:58:34', '2020-12-31 23:59:07',
               '2020-12-31 23:59:09', '2020-12-31 23:59:15',
               '2020-12-31 23:59:24', '2020-12-31 23:59:34'],
              dtype='datetime64[ns]', name='TransactionTime', length=555719, freq=None)


In [877]:
# Get the minimum and maximum transaction times from the index
min_time = df.index.min()
max_time = df.index.max()

print(f"Minimum Transaction Time: {min_time}")
print(f"Maximum Transaction Time: {max_time}")


Minimum Transaction Time: 2020-06-21 12:14:25
Maximum Transaction Time: 2020-12-31 23:59:34


In [878]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,CreditCardNumber,merchant,category,TransactionAmount,first,last,gender,street,city,...,long,city_pop,job,DateOfBirth,trans_num,unix_time,merch_lat,merch_long,is_fraud,TransactionID
TransactionTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-21 12:14:25,0,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,1
2020-06-21 12:14:33,1,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,2
2020-06-21 12:14:53,2,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,3
2020-06-21 12:15:15,3,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,4
2020-06-21 12:15:17,4,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,5


In [879]:
log_time("Initial Steps Completed File Loading, Describe, Date Conversions etc..  ", start_time)
log_time("--------------------------------------------------- ------------------  ", start_time)

Initial Steps Completed File Loading, Describe, Date Conversions etc..   completed at Sun Nov  3 09:52:41 2024. Elapsed time: 0 minutes and 3.71 seconds

--------------------------------------------------- ------------------   completed at Sun Nov  3 09:52:41 2024. Elapsed time: 0 minutes and 3.71 seconds



# Feature Engineering

In [880]:
# Log pre-process time at various steps
start_time = time.time()


In [881]:
log_time("START - Feature Engineering .....  ", start_time)
start_time = time.time()

START - Feature Engineering .....   completed at Sun Nov  3 09:52:41 2024. Elapsed time: 0 minutes and 0.01 seconds



In [882]:

# Clip outliers if necessary
df['TransactionAmount'] = df['TransactionAmount'].clip(upper=df['TransactionAmount'].quantile(0.99))



In [883]:

# Replace inf values with NaN (in case they exist in the 'TransactionAmount' column)
df['TransactionAmount'].replace([np.inf, -np.inf], np.nan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TransactionAmount'].replace([np.inf, -np.inf], np.nan, inplace=True)


# next type of VIZ via transaction id vs transaction count


In [884]:
# Extract hour from TransactionTime
df['Hour'] = df.index.hour  # Since TransactionTime is already set as the index


In [885]:
# Calculate fraud rate by hour
fraud_rate_by_hour = df.groupby('Hour')['is_fraud'].mean()

# Sort by fraud rate in descending order
fraud_rate_by_hour = fraud_rate_by_hour.sort_values(ascending=False)

# Define a threshold for high-risk hours (adjust as needed)
threshold = fraud_rate_by_hour.mean()  # Mean fraud rate across all hours

# Dynamically identify high-risk hours based on the threshold
high_risk_hours = fraud_rate_by_hour[fraud_rate_by_hour > threshold].index.tolist()

# Print high-risk hours for reference
print("High-Risk Hours:", high_risk_hours)

# Create the HighRiskHour flag based on dynamically identified high-risk hours
df['HighRiskHour'] = df['Hour'].apply(lambda x: 1 if x in high_risk_hours else 0)

# Print a sample of the DataFrame to verify the new column
print(df[['Hour', 'HighRiskHour']])


High-Risk Hours: [22, 23, 3, 0, 2, 1]
                     Hour  HighRiskHour
TransactionTime                        
2020-06-21 12:14:25    12             0
2020-06-21 12:14:33    12             0
2020-06-21 12:14:53    12             0
2020-06-21 12:15:15    12             0
2020-06-21 12:15:17    12             0
...                   ...           ...
2020-12-31 23:59:07    23             1
2020-12-31 23:59:09    23             1
2020-12-31 23:59:15    23             1
2020-12-31 23:59:24    23             1
2020-12-31 23:59:34    23             1

[555719 rows x 2 columns]


1. Time-Based Analysis:
Already explored daily and hourly trends in transaction volumes, but now dive deeper into fraud patterns based on time.



In [886]:
#Weekday vs. Weekend: Is fraud more common on weekdays or weekends?
df['DayOfWeek'] = df.index.dayofweek  # 0 = Monday, 6 = Sunday
fraud_by_day = df[df['is_fraud'] == 1]['DayOfWeek'].value_counts().sort_index()
non_fraud_by_day = df[df['is_fraud'] == 0]['DayOfWeek'].value_counts().sort_index()



In [887]:

# Define the correct day order
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']


df['DayName'] = df.index.day_name()
# Convert the 'DayName' column to a categorical type with the correct order
df['DayName'] = pd.Categorical(df['DayName'], categories=day_order, ordered=True)

fraud_by_day = df[df['is_fraud'] == 1]['DayName'].value_counts().sort_index()
non_fraud_by_day = df[df['is_fraud'] == 0]['DayName'].value_counts().sort_index()



In [888]:
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
weekend_fraud = df[df['is_fraud'] == 1]['IsWeekend'].mean()
weekend_non_fraud = df[df['is_fraud'] == 0]['IsWeekend'].mean()

print(f"Percentage of fraud on weekends: {weekend_fraud * 100:.2f}%")
print(f"Percentage of non-fraud on weekends: {weekend_non_fraud * 100:.2f}%")


Percentage of fraud on weekends: 29.84%
Percentage of non-fraud on weekends: 27.95%


In [889]:
log_time("Part1 - TrxAmount, Hour, DayOfWeeek etc..", start_time)
start_time = time.time()


Part1 - TrxAmount, Hour, DayOfWeeek etc.. completed at Sun Nov  3 09:52:42 2024. Elapsed time: 0 minutes and 1.08 seconds



In [890]:
import os
print(os.listdir())  # List all files in the current directory


['.DS_Store', 'v_2.0_RandomForest_Credit_Card_Fraud_Detection.ipynb', 'v_1.2_DecisionTrees_OptFeat_Parallelized_GridSearch_Credit_Card_Fraud_Detection.ipynb', 'v_2.1_RandomForest_Balanced_SMOTE_Credit_Card_Fraud_Detection.ipynb', 'v_2.2_RandomForest_Balanced_SMOTE_GridSearch_Credit_Card_Fraud_Detection.ipynb', 'bkp', 'v_0.0_LogisticRegression_Credit_Card_Fraud_Detection.ipynb', 'v_3.2_xgBoost_Credit_Card_Fraud_Detection.ipynb', '.gitkeep', 'v_3.1_xgBoost_SMOTE_Credit_Card_Fraud_Detection.ipynb', '__pycache__', 'v_0.1_LogisticRegression_Balanced_Credit_Card_Fraud_Detection.ipynb', 'v_1.1_DecisionTrees_Parallelized_GridSearch_Credit_Card_Fraud_Detection.ipynb', '.ipynb_checkpoints', 'v_1.0_DecisionTrees_Credit_Card_Fraud_Detection.ipynb']


# MULTIPROCESSING : distance

In [891]:
import pandas as pd
from geopy.distance import geodesic
import multiprocessing as mp
import numpy as np
import time
import sys
from distance_calculation import calculate_distance_chunk

start_time = time.time()


# Add the current working directory to the system path
sys.path.append(os.getcwd())

# Multiprocessing function to split the dataframe and apply the distance calculation
def parallel_distance_calculation(df, num_partitions=None):
    if num_partitions is None:
        num_partitions = mp.cpu_count()  # Use all available CPU cores
    
    # Split the dataframe into chunks
    df_split = np.array_split(df, num_partitions)
    
    # Create a multiprocessing Pool
    with mp.Pool(num_partitions) as pool:
        # Apply the calculate_distance_chunk function to each chunk in parallel
        result = pool.map(calculate_distance_chunk, df_split)
    
    # Concatenate the results back into a single dataframe
    return pd.concat(result)

# Main block to ensure multiprocessing works correctly
if __name__ == "__main__":
    start_time = time.time()

    # Assuming df has the columns ['lat', 'long', 'merch_lat', 'merch_long']
    
    # Run with limited number of cores (e.g., 4 cores)
    df = parallel_distance_calculation(df, num_partitions=4)  # Use 4 cores instead of all available cores

    # Log the time taken for distance calculation with multiprocessing
    log_time("Part2 - Distance Calculation with Multiprocessing (4 cores)", start_time)

    # Check the first few rows to verify the result
    print(df[['lat', 'long', 'merch_lat', 'merch_long', 'distance']].head())


  return bound(*args, **kwds)


Part2 - Distance Calculation with Multiprocessing (4 cores) completed at Sun Nov  3 09:53:13 2024. Elapsed time: 0 minutes and 30.89 seconds

                         lat      long  merch_lat  merch_long    distance
TransactionTime                                                          
2020-06-21 12:14:25  33.9659  -80.9355  33.986391  -81.200714   24.613746
2020-06-21 12:14:33  40.3207 -110.4360  39.450498 -109.960431  104.834043
2020-06-21 12:14:53  40.6729  -73.5365  40.495810  -74.196111   59.204796
2020-06-21 12:15:15  28.5697  -80.8191  28.812398  -80.883061   27.615117
2020-06-21 12:15:17  44.2529  -85.0170  44.959148  -85.884734  104.423175


In [892]:
import os
print(os.getcwd())  # This will print the current working directory


/Users/sadhvichandragiri/Desktop/coding/ZHAW_Project/ML_BigData_Repo_1/notebooks


log_time("Part2 -  Distance Calculation", start_time)
start_time = time.time()


In [893]:
# Check unique values in the 'is_fraud' column
df['is_fraud'].unique()


array([0, 1])

In [894]:
# Fraud vs Non-Fraud by Merchant Category
fraud_by_category = df[df['is_fraud'] == 1]['category'].value_counts().head(10)
non_fraud_by_category = df[df['is_fraud'] == 0]['category'].value_counts().head(10)



In [895]:
# Top 5 categories with the highest fraud counts
top_fraud_merchant_categories = df[df['is_fraud'] == 1]['category'].value_counts().head(5).index.tolist()

# Print top fraudulent categories
print("Top Fraudulent Merchant Categories:", top_fraud_merchant_categories)

# Create HighRiskMerchantCategory flag
df['HighRiskMerchantCategory'] = df['category'].apply(lambda x: 1 if x in top_fraud_merchant_categories else 0)



Top Fraudulent Merchant Categories: ['shopping_net', 'grocery_pos', 'misc_net', 'shopping_pos', 'gas_transport']


In [896]:
# Print the count of 1s and 0s in HighRiskMerchantCategory
print(df['HighRiskMerchantCategory'].value_counts())


HighRiskMerchantCategory
0    327859
1    227860
Name: count, dtype: int64


# Potential Additional Features:
Transaction Frequency:
    Feature: How often a credit card has been used within a specific time frame (e.g., last hour or day).
    Why: Fraudsters often make rapid successive transactions within short periods. You could create a rolling window to calculate transaction frequency.
    How: You could calculate the number of transactions within the past X hours/days using a rolling window on the TransactionTime feature.

#age group

In [897]:
import pandas as pd

# Ensure 'DateOfBirth' is in datetime format
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')  # Handle errors during conversion

# Step 1: Calculate Age
# Calculate age in years
df['Age'] = (pd.Timestamp.now() - df['DateOfBirth']).dt.days // 365  # Age in years

# Step 2: Create Age Groups
# Define age bins and labels
bins = [0, 18, 25, 35, 45, 55, 65, 100]  # Define your age bins, ensuring to cover all possible ages
labels = ['0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+']  # Corresponding labels

# Create age group feature, include NaN values handling
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False, include_lowest=True)

# Verify the new features without truncating DataFrame
#print(df[['DateOfBirth', 'Age', 'AgeGroup']].head(10))  # Display the first 10 entries


In [898]:
log_time("Part3 - Merchant Categories & Age group", start_time)
start_time = time.time()


Part3 - Merchant Categories & Age group completed at Sun Nov  3 09:53:13 2024. Elapsed time: 0 minutes and 31.42 seconds



# MULTIPROCESSING : count_transactions_within_last_hour

In [899]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import time
from transaction_frequency import process_chunk  # Import from the .py file

# Multiprocessing function to parallelize the transaction counting
def parallel_count_transactions(df, num_partitions=None):
    if num_partitions is None:
        num_partitions = mp.cpu_count()  # Use all available CPU cores
    
    # Ensure the index is a datetime
    df.index = pd.to_datetime(df.index)
    
    # Split the dataframe into chunks based on the number of partitions (CPU cores)
    df_split = np.array_split(df, num_partitions)
    
    # Create a multiprocessing Pool
    with mp.Pool(num_partitions) as pool:
        # Apply the processing function to each chunk in parallel
        result = pool.map(process_chunk, df_split)
    
    # Combine the results from each chunk into a single series, reset index for consistency
    return pd.concat(result).reset_index(drop=True)

# Assuming df has 'CreditCardNumber' as a column and transaction times are indexed
if __name__ == "__main__":
    start_time = time.time()

    # Apply the parallel processing for transaction frequency counting
    df['TransactionFrequency'] = parallel_count_transactions(df, num_partitions=4)  # Adjust num_partitions as needed

    # Log the time taken for transaction frequency calculation with multiprocessing
    log_time("Part4 - TransactionFrequency Multiprocessing", start_time)

    # Check the first 10 rows
    print(df[['TransactionFrequency']].head(10))


  return bound(*args, **kwds)


Part4 - TransactionFrequency Multiprocessing completed at Sun Nov  3 09:54:12 2024. Elapsed time: 0 minutes and 59.10 seconds

                    TransactionFrequency
TransactionTime                         
2020-06-21 12:14:25                  NaN
2020-06-21 12:14:33                  NaN
2020-06-21 12:14:53                  NaN
2020-06-21 12:15:15                  NaN
2020-06-21 12:15:17                  NaN
2020-06-21 12:15:37                  NaN
2020-06-21 12:15:44                  NaN
2020-06-21 12:15:50                  NaN
2020-06-21 12:16:10                  NaN
2020-06-21 12:16:11                  NaN


In [900]:
df.index = pd.to_datetime(df.index)
print(df.index)

DatetimeIndex(['2020-06-21 12:14:25', '2020-06-21 12:14:33',
               '2020-06-21 12:14:53', '2020-06-21 12:15:15',
               '2020-06-21 12:15:17', '2020-06-21 12:15:37',
               '2020-06-21 12:15:44', '2020-06-21 12:15:50',
               '2020-06-21 12:16:10', '2020-06-21 12:16:11',
               ...
               '2020-12-31 23:57:18', '2020-12-31 23:57:50',
               '2020-12-31 23:57:56', '2020-12-31 23:58:04',
               '2020-12-31 23:58:34', '2020-12-31 23:59:07',
               '2020-12-31 23:59:09', '2020-12-31 23:59:15',
               '2020-12-31 23:59:24', '2020-12-31 23:59:34'],
              dtype='datetime64[ns]', name='TransactionTime', length=555719, freq=None)


In [901]:
# Resample the data to count transactions every hour
transaction_counts_hourly = df.resample('H').size()
transaction_counts_daily = df.resample('D').size()

# Combine with CreditCardNumber if necessary
transaction_counts = df.groupby('CreditCardNumber').resample('H').size().reset_index(name='TransactionCount')
print(transaction_counts.head(10))

  transaction_counts_hourly = df.resample('H').size()
  transaction_counts = df.groupby('CreditCardNumber').resample('H').size().reset_index(name='TransactionCount')


   CreditCardNumber     TransactionTime  TransactionCount
0       60416207185 2020-06-21 13:00:00                 1
1       60416207185 2020-06-21 14:00:00                 0
2       60416207185 2020-06-21 15:00:00                 0
3       60416207185 2020-06-21 16:00:00                 1
4       60416207185 2020-06-21 17:00:00                 0
5       60416207185 2020-06-21 18:00:00                 0
6       60416207185 2020-06-21 19:00:00                 0
7       60416207185 2020-06-21 20:00:00                 0
8       60416207185 2020-06-21 21:00:00                 0
9       60416207185 2020-06-21 22:00:00                 0


In [902]:
total_transactions = df.groupby('CreditCardNumber').size().reset_index(name='TotalTransactionCount')
print(total_transactions.head(10))


   CreditCardNumber  TotalTransactionCount
0       60416207185                    678
1       60422928733                    669
2       60423098130                    228
3       60427851591                    215
4       60487002085                    239
5       60490596305                    455
6       60495593109                    224
7      501802953619                    635
8      501828204849                    218
9      501831082224                    439


In [903]:
# Calculate the time difference between consecutive transactions
time_diff = df.index.to_series().diff().dt.total_seconds()
# Flag rapid transactions (within 5 minutes)
df['RapidTransactionFlag'] = time_diff < 60  # For a 1-minute threshold

# Create a temporary DataFrame for rapid transactions
rapid_transactions = df[df['RapidTransactionFlag']]

# Group by date and count the number of rapid transactions
rapid_transaction_counts = rapid_transactions.groupby(rapid_transactions.index.date).size()
print(rapid_transaction_counts)

# Get a summary of the rapid transactions
rapid_transactions_summary = rapid_transactions.describe()
print(rapid_transactions_summary)


2020-06-21    1908
2020-06-22    3834
2020-06-23    3602
2020-06-24    1352
2020-06-25    1578
              ... 
2020-12-27    5183
2020-12-28    6212
2020-12-29    6123
2020-12-30    2630
2020-12-31    3044
Length: 194, dtype: int64
          Unnamed: 0  CreditCardNumber  TransactionAmount            zip  \
count  478827.000000      4.788270e+05      478827.000000  478827.000000   
mean   282352.474282      4.156977e+17          63.671204   48855.657540   
min         1.000000      6.041621e+10           1.000000    1257.000000   
25%    139140.500000      1.800429e+14           9.510000   26292.000000   
50%    283432.000000      3.521417e+15          46.370000   48174.000000   
75%    429493.500000      4.635331e+15          82.260000   72042.000000   
max    555718.000000      4.992346e+18         519.854600   99921.000000   
std    163611.709333      1.306822e+18          78.874224   26858.296762   

                 lat           long      city_pop  \
count  478827.000000  47882

In [904]:
print(df.columns)


Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag'],
      dtype='object')


In [905]:
log_time("Part5 - RapidTransactionFlag", start_time)
start_time = time.time()


Part5 - RapidTransactionFlag completed at Sun Nov  3 09:54:16 2024. Elapsed time: 1 minutes and 2.87 seconds



Transaction Amount Features:
Log Transaction Amount: Normalize the TransactionAmount by taking its logarithm to reduce skewness.
Transaction Amount Flags: Create binary flags for high-value transactions (e.g., if TransactionAmount exceeds a certain threshold).

In [906]:

# Sample DataFrame creation
# Assume 'df' is your DataFrame and has a 'TransactionAmount' column
# df = pd.read_csv('your_data.csv')  # Load your actual data

# Step 1: Log Transaction Amount
# Calculate the log of TransactionAmount
df['LogTransactionAmount'] = np.log1p(df['TransactionAmount'])  # Use log1p for stability with 0 values

# Step 2: Create Transaction Amount Flags
# Define a threshold for high-value transactions
threshold = 100  # Adjust the threshold based on your data context

# Create a flag for high-value transactions
df['HighValueTransactionFlag'] = df['TransactionAmount'] > threshold

# Verify the new features
print(df[['TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag']].head(10))  # Display the first 10 entries


                     TransactionAmount  LogTransactionAmount  \
TransactionTime                                                
2020-06-21 12:14:25               2.86              1.350667   
2020-06-21 12:14:33              29.84              3.428813   
2020-06-21 12:14:53              41.28              3.744314   
2020-06-21 12:15:15              60.05              4.111693   
2020-06-21 12:15:17               3.19              1.432701   
2020-06-21 12:15:37              19.55              3.022861   
2020-06-21 12:15:44             133.93              4.904756   
2020-06-21 12:15:50              10.37              2.430978   
2020-06-21 12:16:10               4.37              1.680828   
2020-06-21 12:16:11              66.54              4.212720   

                     HighValueTransactionFlag  
TransactionTime                                
2020-06-21 12:14:25                     False  
2020-06-21 12:14:33                     False  
2020-06-21 12:14:53                    

Behavioral Features:
Count of Transactions in Last X Days: Count how many transactions have occurred in the last 7, 14, or 30 days.
Average Transaction Amount in Last X Days: Calculate the average transaction amount over the same periods.

In [907]:
import pandas as pd

# Assuming 'TransactionTime' is already set as the index and in datetime format

# Step 1: Count of Transactions in Last X Days
for days in [7, 14, 30]:
    # Sort data by CreditCardNumber and TransactionTime to ensure rolling works properly
    df = df.sort_values(by=['CreditCardNumber', 'TransactionTime'])
    
    # Apply rolling and count the number of transactions for each card
    df[f'TransactionCountLast{days}Days'] = (
        df.groupby('CreditCardNumber')['CreditCardNumber']
        .rolling(f'{days}D')
        .count()
        .reset_index(level=0, drop=True)
    )

# Step 2: Average Transaction Amount in Last X Days
for days in [7, 14, 30]:
    # Sort data by CreditCardNumber and TransactionTime to ensure rolling works properly
    df = df.sort_values(by=['CreditCardNumber', 'TransactionTime'])
    
    # Calculate the average transaction amount for each credit card in the last X days
    df[f'AverageTransactionAmountLast{days}Days'] = (
        df.groupby('CreditCardNumber')['TransactionAmount']
        .rolling(f'{days}D')
        .mean()
        .reset_index(level=0, drop=True)
    )

# Verify the new features
print(df[['TransactionCountLast7Days', 'TransactionCountLast14Days', 'TransactionCountLast30Days',
           'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 'AverageTransactionAmountLast30Days']].head(10))


                     TransactionCountLast7Days  TransactionCountLast14Days  \
TransactionTime                                                              
2020-06-21 13:05:42                        1.0                         1.0   
2020-06-21 16:25:36                        2.0                         2.0   
2020-06-22 07:58:33                        3.0                         3.0   
2020-06-22 15:32:31                        4.0                         4.0   
2020-06-23 12:28:54                        5.0                         5.0   
2020-06-23 14:24:48                        6.0                         6.0   
2020-06-23 16:39:40                        7.0                         7.0   
2020-06-23 19:07:05                        8.0                         8.0   
2020-06-23 22:45:57                        9.0                         9.0   
2020-06-24 04:22:17                       10.0                        10.0   

                     TransactionCountLast30Days  \
TransactionT

In [908]:
print(df.columns)  # Display all columns in the DataFrame


Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag', 'LogTransactionAmount',
       'HighValueTransactionFlag', 'TransactionCountLast7Days',
       'TransactionCountLast14Days', 'TransactionCountLast30Days',
       'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days'],
      dtype='object')


In [909]:
log_time("Part6 - TransactionCountLast_X_Days & AverageTrxAmountLast_X_Days", start_time)
start_time = time.time()


Part6 - TransactionCountLast_X_Days & AverageTrxAmountLast_X_Days completed at Sun Nov  3 09:54:22 2024. Elapsed time: 0 minutes and 5.38 seconds



# Graph Construction with NetworkX:

Highlight Fraudulent Nodes: Overlay of fraudulent and non-fraudulent credit cards on this degree distribution to see if there’s a difference in their degrees.

In [910]:


# Create an empty graph
G = nx.Graph()

# Add edges between credit cards and merchants, including transaction amount as an edge attribute
for idx, row in df.iterrows():
    credit_card = str(row['CreditCardNumber'])
    merchant = str(row['merchant'])
    transaction_amount = row['TransactionAmount']  # Ensure TransactionAmount exists in your dataframe
    
    # Add an edge with the transaction amount as an attribute
    G.add_edge(credit_card, merchant, transaction_amount=transaction_amount)


# Calculate degrees for all nodes in the graph
degrees = dict(G.degree())

# Filter degrees for credit cards and merchants
credit_card_nodes = df['CreditCardNumber'].astype(str).unique()
merchant_nodes = df['merchant'].astype(str).unique()

credit_card_degrees = {node: degrees[node] for node in credit_card_nodes if node in degrees}
merchant_degrees = {node: degrees[node] for node in merchant_nodes if node in degrees}

# Debugging: Print counts to ensure correctness
print(f"Number of unique credit card nodes: {len(credit_card_nodes)}")
print(f"Number of unique merchant nodes: {len(merchant_nodes)}")
print(f"Number of credit card nodes with degrees: {len(credit_card_degrees)}")
print(f"Number of merchant nodes with degrees: {len(merchant_degrees)}")

# Create a new DataFrame for easier plotting
degree_df = pd.DataFrame({
    'CreditCardDegree': pd.Series(credit_card_degrees),
    'MerchantDegree': pd.Series(merchant_degrees)
})




Number of unique credit card nodes: 924
Number of unique merchant nodes: 693
Number of credit card nodes with degrees: 924
Number of merchant nodes with degrees: 693


In [911]:
# Add degree information back to the original DataFrame
df['degree'] = df['CreditCardNumber'].astype(str).map(credit_card_degrees)



In [912]:
# Check edges and their attributes
#for edge in G.edges(data=True):
#    print(edge)

#do NOT print this, huge list


In [913]:
df['CreditCardNumber'] = df['CreditCardNumber'].astype(str)


In [914]:
fraud_mapping = df.set_index('CreditCardNumber')['is_fraud'].to_dict()


In [915]:
log_time("Part7 - NetworkX Start Step", start_time)
start_time = time.time()


Part7 - NetworkX Start Step completed at Sun Nov  3 09:55:02 2024. Elapsed time: 0 minutes and 40.55 seconds



In [916]:
#print(fraud_mapping.head(5))
#only testing purposes

# MULTIPROCESSING : betweenness_centrality

In [917]:
import networkx as nx
import time
from networkx_graph_betweeness_centrality import parallel_betweenness_centrality

# Assuming G is your graph
if __name__ == "__main__":
    start_time = time.time()

    # Calculate betweenness centrality using parallel processing
    betweenness_centrality = parallel_betweenness_centrality(G, num_partitions=4)  

    # Log the time taken for betweenness centrality calculation with multiprocessing
    log_time("Part8 - Betweenness Centrality Calculation with Multiprocessing", start_time)
    start_time = time.time()

    # Check a few centrality values
    print(list(betweenness_centrality.items())[:10])


[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=20; total time=   8.4s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=20; total time=   7.3s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=50; total time=   9.0s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=50; total time=   8.5s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=50; total time=   7.3s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=5, min_samples_split=20; total time=   9.4s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=20; total time=   8.5s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=50; total time=   7.3s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=50; total time=   9.7s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=20; total time=   8.6s


In [918]:
df['betweenness_centrality'] = df['CreditCardNumber'].map(betweenness_centrality)


In [919]:
print(df['betweenness_centrality'].describe())
print(df['betweenness_centrality'].isna().sum())  # Check for missing values


count    5.557190e+05
mean     7.601584e-05
std      3.431355e-05
min      7.500703e-09
25%      4.970571e-05
50%      7.859341e-05
75%      1.056097e-04
max      1.412551e-04
Name: betweenness_centrality, dtype: float64
0


In [920]:
# Check betweenness centrality for specific credit card numbers
sample_nodes = ['60416207185', 'fraud_Kutch-Ferry']  # Replace with actual nodes
for node in sample_nodes:
    print(f"{node}: {betweenness_centrality.get(node)}")


60416207185: 6.866934985630104e-05
fraud_Kutch-Ferry: 0.0003782526235637667


1. Investigate Nodes with High Betweenness Centrality:

Now that you’ve visualized nodes with high betweenness centrality, you can:

    Examine if fraudulent nodes tend to have high betweenness centrality. This might indicate that these nodes are acting as "connectors" between different parts of the network, which could be a sign of suspicious behavior.
    Compare centrality between fraud and non-fraud nodes to see if there's a pattern.



2. Visualize Communities in the Network:

You could apply community detection to uncover fraud rings or clusters of merchants targeted by fraudsters. The Louvain algorithm is great for this.

In [921]:
import community.community_louvain as community_louvain


# Apply Louvain method for community detection
partition = community_louvain.best_partition(G)



Fraud Node Highlighting:

    Fraudulent nodes (from df['is_fraud'] == 1) are colored red to make them stand out. The rest of the nodes are still colored based on their communities.
    This should help you easily spot any fraudulent nodes in the network.

Top 10 Most Central Nodes:

    We calculate betweenness centrality and extract the top 10 most central nodes.
    These nodes are visualized with their connections, which should help declutter the graph and focus on the key players in the transaction network.

In [922]:

# Apply Louvain method for community detection
partition = community_louvain.best_partition(G)

# Create positions for nodes using a spring layout
pos = nx.spring_layout(G)

# Add the community information to the DataFrame
df['community'] = df['CreditCardNumber'].map(partition)

# Highlight fraud nodes separately
fraud_nodes = df[df['is_fraud'] == 1]['CreditCardNumber'].values




In [923]:
# Add the community information to the DataFrame
df['community'] = df['CreditCardNumber'].map(partition)

# Calculate the percentage of fraud in each community
community_fraud = df.groupby('community')['is_fraud'].mean()



In [924]:

# Print fraud rate per community
print(community_fraud)


community
0    0.003190
1    0.005009
2    0.003030
3    0.004009
Name: is_fraud, dtype: float64


In [925]:
community_size = df.groupby('community').size()
print(community_size)


community
0    229148
1    119992
2     15511
3    191068
dtype: int64


In [926]:
# Combine fraud rates and community sizes into a single DataFrame
fraud_vs_size = pd.concat([community_fraud, df.groupby('community').size()], axis=1)
fraud_vs_size.columns = ['FraudRate', 'CommunitySize']



In [927]:
top_fraud_communities = community_fraud.sort_values(ascending=False).head(5)
print(top_fraud_communities)


community
1    0.005009
3    0.004009
0    0.003190
2    0.003030
Name: is_fraud, dtype: float64


In [928]:
# Get the community labels of the top fraud communities
top_community_labels = top_fraud_communities.index.tolist()

# Filter the DataFrame for only the top fraud communities
top_communities_df = df[df['community'].isin(top_community_labels)]


In [929]:
# Show Only the Top Merchants by Fraud Rate:
# Instead of displaying all merchants, you can filter the plot to show only the top 10 or 20 merchants with the highest fraud rates.

# Calculate fraud rate by merchant in the top fraud communities
merchant_fraud_rate = top_communities_df.groupby('merchant')['is_fraud'].mean()

# Sort merchants by fraud rate in descending order
top_merchants = merchant_fraud_rate.sort_values(ascending=False).head(10)

# Print top 10 merchants with highest fraud rate
print(top_merchants)


merchant
fraud_Romaguera, Cruickshank and Greenholt    0.021739
fraud_Lemke-Gutmann                           0.021505
fraud_Mosciski, Ziemann and Farrell           0.020690
fraud_Heathcote, Yost and Kertzmann           0.020482
fraud_Rodriguez, Yost and Jenkins             0.019960
fraud_Medhurst PLC                            0.019430
fraud_Bashirian Group                         0.018987
fraud_Kris-Weimann                            0.018939
fraud_Heathcote LLC                           0.018703
fraud_Bednar Group                            0.018519
Name: is_fraud, dtype: float64


In [930]:

# Assuming 'category' is a column representing merchant categories
merchantcategory_fraud = top_communities_df.groupby('category')['is_fraud'].mean()

# Sort the fraud rate by merchant category in descending order
merchantcategory_fraud_sorted = merchantcategory_fraud.sort_values(ascending=False)


In [931]:

log_time("Part9 - Community & Top Merchants", start_time)
start_time = time.time()


Part9 - Community & Top Merchants completed at Sun Nov  3 09:58:13 2024. Elapsed time: 1 minutes and 3.08 seconds



In [932]:
# Check the density of the graph (a measure of sparsity)
density = nx.density(G)
print(f"Graph Density: {density}")


Graph Density: 0.2513486042481799


In [933]:
# Calculate and print the average degree
degree_sequence = [degree for node, degree in G.degree()]
average_degree = sum(degree_sequence) / len(degree_sequence)
print(f"Average Degree of Nodes: {average_degree}")


Average Degree of Nodes: 406.17934446505876


In [934]:
log_time("Part10 - Density", start_time)
start_time = time.time()


Part10 - Density completed at Sun Nov  3 09:58:13 2024. Elapsed time: 0 minutes and 0.02 seconds



In [935]:
print(df.columns)

Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag', 'LogTransactionAmount',
       'HighValueTransactionFlag', 'TransactionCountLast7Days',
       'TransactionCountLast14Days', 'TransactionCountLast30Days',
       'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days', 'degree',
       'betweenness_centrality', 'community'],
      dtype='object')


In [936]:
selected_features = [
    'TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag',
    'TransactionCountLast7Days', 'TransactionCountLast14Days', 'TransactionCountLast30Days',
    'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 'AverageTransactionAmountLast30Days',
    'Hour', 'HighRiskHour', 'DayOfWeek', 'IsWeekend', 'TransactionFrequency', 'RapidTransactionFlag',
    'lat', 'long', 'merch_lat', 'merch_long', 'distance', 'city_pop',
    'Age', 'AgeGroup', 'gender', 'state', 'city',
    'degree', 'betweenness_centrality', 'community'
]

df_selected_features = df[selected_features]


# Page rank as new feature

In [937]:
# Calculate PageRank for each node in the graph
pagerank = nx.pagerank(G)

# Map the PageRank values to the 'CreditCardNumber' in the DataFrame
df['pagerank'] = df['CreditCardNumber'].map(pagerank)


In [938]:
# Check for NaN values in the pagerank column
print(df['pagerank'].isna().sum())


0


In [939]:
# Check descriptive statistics of pagerank values
print(df['pagerank'].describe())



count    555719.000000
mean          0.000636
std           0.000139
min           0.000099
25%           0.000548
50%           0.000669
75%           0.000751
max           0.000850
Name: pagerank, dtype: float64


In [940]:
# Check how many nodes have a PageRank of zero
zero_pagerank_count = (df['pagerank'] == 0).sum()
print(f"Number of nodes with zero PageRank: {zero_pagerank_count}")


Number of nodes with zero PageRank: 0


In [941]:
# Compare the average PageRank for fraud and non-fraud transactions
fraud_avg_pagerank = df[df['is_fraud'] == 1]['pagerank'].mean()
non_fraud_avg_pagerank = df[df['is_fraud'] == 0]['pagerank'].mean()

print(f"Average PageRank for Fraud: {fraud_avg_pagerank}")
print(f"Average PageRank for Non-Fraud: {non_fraud_avg_pagerank}")


Average PageRank for Fraud: 0.0005267197095238786
Average PageRank for Non-Fraud: 0.0006366062405655646


In [942]:
selected_features.append('pagerank')
df_selected_features = df[selected_features]


In [943]:
print(df_selected_features.columns)

Index(['TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag',
       'TransactionCountLast7Days', 'TransactionCountLast14Days',
       'TransactionCountLast30Days', 'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days', 'Hour', 'HighRiskHour',
       'DayOfWeek', 'IsWeekend', 'TransactionFrequency',
       'RapidTransactionFlag', 'lat', 'long', 'merch_lat', 'merch_long',
       'distance', 'city_pop', 'Age', 'AgeGroup', 'gender', 'state', 'city',
       'degree', 'betweenness_centrality', 'community', 'pagerank'],
      dtype='object')


In [944]:
print(df.shape)

(555719, 46)


In [945]:
log_time("Part11 - PageRank", start_time)
start_time = time.time()


Part11 - PageRank completed at Sun Nov  3 09:58:14 2024. Elapsed time: 0 minutes and 1.54 seconds



In [946]:
# Decision Trees start

In [947]:
log_time("END - Feature Engineering .....  ", start_time)
start_time = time.time()

END - Feature Engineering .....   completed at Sun Nov  3 09:58:14 2024. Elapsed time: 0 minutes and 0.01 seconds



In [948]:
log_time(f"{model_specs}_{dataset_type} START Model ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Test START Model ....   completed at Sun Nov  3 09:58:14 2024. Elapsed time: 0 minutes and 0.01 seconds



In [949]:
import importlib
import decisiontree_gridsearch_modelutils
importlib.reload(decisiontree_gridsearch_modelutils)




<module 'decisiontree_gridsearch_modelutils' from '/Users/sadhvichandragiri/Desktop/coding/ZHAW_Project/ML_BigData_Repo_1/notebooks/../references/decisiontree_gridsearch_modelutils.py'>

In [950]:
print(model_specs)
#decisiontrees_parallelized_gridsearch_Train.pkl

DecisionTrees_Parallelized_GridSearch


In [951]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc
from decisiontree_gridsearch_modelutils import parallel_one_hot_encode, preprocessdata_parallel_onehot_impute
from decisiontree_gridsearch_modelutils import perform_grid_search, generate_classification_report_and_roc, save_decision_tree_viz

sample_fraction = 1.0  # Fraction to sample for memory efficiency


# Parameters
param_grid = {
    'max_depth': [10, 20], 
    'min_samples_split': [20, 50], 
    'min_samples_leaf': [2, 5], 
    'criterion': ['gini']
}


# Step 2: Train Model with Grid Search
def train_model(X, y, param_grid):
    #calling  perform_grid_search from decisiontree_gridsearch_modelutils.py
    best_model = perform_grid_search(X, y, param_grid)
    return best_model

def save_trained_model(model_name):
    output_dir_model = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models'
    # Define model filename
    model_outputfilename = f"{model_specs.replace(' ', '_').replace(',', '').lower()}.pkl"

    # Assuming best_rf_model is your final or best model
    try:
        # Save the model
        with open(os.path.join(output_dir_model, model_outputfilename), 'wb') as model_file:
            pickle.dump(model_name, model_file)

        print(f"Model saved to {os.path.join(output_dir_model, model_outputfilename)}")

    except NameError:
        print(f"Error: {model_specs} is not defined. Please ensure the model is assigned before saving.")

# Load the trained model
def load_trained_model(model_specs, dataset_type, output_dir_model='/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models'):
    """
    Loads a trained model from the specified directory.
    
    Args:
        model_specs (str): Specifications or name for the model.
        dataset_type (str): Indicates if it’s for 'Train' or 'Test'.
        output_dir_model (str): Directory where the model is saved.
        
    Returns:
        model: The loaded model.
    """
    model_outputfilename = f"{model_specs.replace(' ', '_').replace(',', '').lower()}.pkl"
    model_path = os.path.join(output_dir_model, model_outputfilename)

    # Load the model if it exists
    if os.path.exists(model_path):
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        print(f"Model loaded from {model_path}")
        return model
    else:
        print(f"Model file not found at {model_path}")
        return None
    

# Step 4: Main Execution
def main(df, selected_features, param_grid, model_specs, reports_output_dir, use_test_data=False, sample_fraction=1.0):
    
    # Preprocess the data
    X, y, feature_names = preprocessdata_parallel_onehot_impute(df, selected_features)
    dataset_type = 'Test' if use_test_data else 'Train'
    
    if use_test_data:
        # Load pre-trained model for test predictions
        loaded_model = load_trained_model(model_specs, dataset_type)
        if loaded_model:
            # Generate report and ROC for loaded model on test data
            generate_classification_report_and_roc(loaded_model, X, y, dataset_type, model_specs, reports_output_dir)
            return None, feature_names
        else:
            print("No pre-trained model found.")
            return None, feature_names
    else:
        # Perform training with grid search on train data
        best_model = train_model(X, y, param_grid)
        # Generate report and ROC for best model
        generate_classification_report_and_roc(best_model, X, y, dataset_type, model_specs, reports_output_dir)
        
        # Feature importances
        feature_importances = best_model.feature_importances_
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
        print(importance_df.sort_values(by='Importance', ascending=False))

        # Save the trained model
        save_trained_model(best_model)

        return best_model, feature_names


In [952]:
print(reports_output_dir)

/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees


In [953]:
# Example Usage

# Assuming `df` is your data DataFrame and includes the 'is_fraud' target column
selected_features = [
    'LogTransactionAmount', 'TransactionAmount', 'HighValueTransactionFlag',
    'TransactionCountLast7Days', 'TransactionCountLast30Days', 
    'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 
    'AverageTransactionAmountLast30Days', 'Hour', 'HighRiskHour', 'IsWeekend', 
    'RapidTransactionFlag', 'lat', 'long', 'merch_lat', 'merch_long', 
    'distance', 'city_pop', 'AgeGroup', 'degree', 'betweenness_centrality', 'pagerank'
]

if use_test_data:
    _, feature_names = main(df, selected_features, param_grid, model_specs, reports_output_dir, use_test_data=True)
else:
    best_model, feature_names = main(df, selected_features, param_grid, model_specs, reports_output_dir, use_test_data=False)


Model loaded from /Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models/decisiontrees_parallelized_gridsearch.pkl
DecisionTrees_Parallelized_GridSearch_Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    166071
           1       0.78      0.31      0.44       645

    accuracy                           1.00    166716
   macro avg       0.89      0.65      0.72    166716
weighted avg       1.00      1.00      1.00    166716

DecisionTrees_Parallelized_GridSearch_Test ROC AUC: 0.9168021205462742
DecisionTrees_Parallelized_GridSearch_Test Precision-Recall AUC: 0.5943386340299978
ROC_Curve saved to DecisionTrees_Parallelized_GridSearch_Test_RoC_Curve_20241103_095815.png
Classification report saved to /Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees/DecisionTrees_Parallelized_GridSearch_Test_Report_20241103_095815.txt


In [954]:
log_time(f"{model_specs}_{dataset_type} END Model ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Test END Model ....   completed at Sun Nov  3 09:58:16 2024. Elapsed time: 0 minutes and 1.21 seconds



In [955]:

# Generate a unique timestamp for the filename
timestamp = time.strftime("%Y%m%d_%H%M%S")
decisiontree_gridsearch_viz_filename = f"{model_specs}_{dataset_type}_decision_tree_gridsearch_viz_{timestamp}.png"
decisiontree_gridsearch_viz_path = os.path.join(reports_output_dir, decisiontree_gridsearch_viz_filename)

class_names = ["Non-Fraud", "Fraud"]

if not use_test_data:
    # Use the feature names returned from `main` to ensure consistency with the model
    save_decision_tree_viz(model=best_model, feature_names=feature_names, class_names=class_names, output_path=decisiontree_gridsearch_viz_path)


1. Load and Preprocess the Data for the Decision Tree model

In [956]:
log_time(f"{model_specs}_{dataset_type} saved the descision tree ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Test saved the descision tree ....   completed at Sun Nov  3 09:58:16 2024. Elapsed time: 0 minutes and 0.01 seconds



In [957]:
import os
import time

# Assuming start_time is defined earlier in the notebook
end_time_notebook = time.time()
elapsed_time = end_time_notebook - start_time_notebook

# Print and format the notebook end time and total execution time
print(f"Notebook ended at: {time.ctime(end_time_notebook)}")
print(f"Total execution time: {elapsed_time // 60:.0f} minutes and {elapsed_time % 60:.2f} seconds")


log_time(f"{model_specs}_{dataset_type} Notebook Ended at... ", start_time_notebook)


Notebook ended at: Sun Nov  3 09:58:16 2024
Total execution time: 5 minutes and 38.57 seconds
DecisionTrees_Parallelized_GridSearch_Test Notebook Ended at...  completed at Sun Nov  3 09:58:16 2024. Elapsed time: 5 minutes and 38.57 seconds



# Results of TRAIN

In [958]:
# Save Classification Report and AUC Results in Notebook

classification_report_text = """
### Classification Report for DecisionTrees_Parallelized_GridSearch (Train)

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Non-Fraud (0) | 1.00 | 1.00 | 1.00 | 1,289,169 |
| Fraud (1)     | 0.88 | 0.68 | 0.76 | 7,506     |

- **Accuracy**: 1.00
- **Macro Avg Precision**: 0.94
- **Macro Avg Recall**: 0.84
- **Macro Avg F1-Score**: 0.88
- **Weighted Avg Precision**: 1.00
- **Weighted Avg Recall**: 1.00
- **Weighted Avg F1-Score**: 1.00
"""

auc_report_text = """
### AUC Metrics
- **ROC AUC**: 0.97
- **Precision-Recall AUC**: 0.89

#### Insights
- The ROC AUC of 0.97 indicates a strong capability of the model to separate fraud and non-fraud transactions.
- Precision-Recall AUC of 0.89 shows that while precision is high, recall could be further improved to reduce missed fraud cases.
"""

print(classification_report_text)
print(auc_report_text)



### Classification Report for DecisionTrees_Parallelized_GridSearch (Train)

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Non-Fraud (0) | 1.00 | 1.00 | 1.00 | 1,289,169 |
| Fraud (1)     | 0.88 | 0.68 | 0.76 | 7,506     |

- **Accuracy**: 1.00
- **Macro Avg Precision**: 0.94
- **Macro Avg Recall**: 0.84
- **Macro Avg F1-Score**: 0.88
- **Weighted Avg Precision**: 1.00
- **Weighted Avg Recall**: 1.00
- **Weighted Avg F1-Score**: 1.00


### AUC Metrics
- **ROC AUC**: 0.97
- **Precision-Recall AUC**: 0.89

#### Insights
- The ROC AUC of 0.97 indicates a strong capability of the model to separate fraud and non-fraud transactions.
- Precision-Recall AUC of 0.89 shows that while precision is high, recall could be further improved to reduce missed fraud cases.



Test Results aren't promising, hence NOT using parallel_one_hot_encode function, 
instead directly get_dummies on preprocessdata_parallel_onehot_impute.

Also sample_faction = 0.3

DecisionTrees_Parallelized_GridSearch_Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.33      0.45      0.38      2145

    accuracy                           0.99    555719
   macro avg       0.66      0.72      0.69    555719
weighted avg       1.00      0.99      0.99    555719

DecisionTrees_Parallelized_GridSearch_Test ROC AUC: 0.8870440262552248
DecisionTrees_Parallelized_GridSearch_Test Precision-Recall AUC: 0.2255077254205718

Now with sample_faction = 0.3 and getd_dummies directly instaed of parallilized one_hot_encode, precision is improved.

DecisionTrees_Parallelized_GridSearch_Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    166071
           1       0.78      0.31      0.44       645

    accuracy                           1.00    166716
   macro avg       0.89      0.65      0.72    166716
weighted avg       1.00      1.00      1.00    166716

DecisionTrees_Parallelized_GridSearch_Test ROC AUC: 0.9168021205462742
DecisionTrees_Parallelized_GridSearch_Test Precision-Recall AUC: 0.5943386340299978


Model loaded from /Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models/decisiontrees_parallelized_gridsearch.pkl
DecisionTrees_Parallelized_GridSearch_Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    166071
           1       0.78      0.31      0.44       645

    accuracy                           1.00    166716
   macro avg       0.89      0.65      0.72    166716
weighted avg       1.00      1.00      1.00    166716

# Now sample_factor = 1.0 for train and then test

Fitting 3 folds for each of 8 candidates, totalling 24 fits
DecisionTrees_Parallelized_GridSearch_Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386716
           1       0.89      0.65      0.75      2286

    accuracy                           1.00    389002
   macro avg       0.94      0.82      0.87    389002
weighted avg       1.00      1.00      1.00    389002

DecisionTrees_Parallelized_GridSearch_Train ROC AUC: 0.9896053011274324
DecisionTrees_Parallelized_GridSearch_Train Precision-Recall AUC: 0.835250105419902
ROC_Curve saved to DecisionTrees_Parallelized_GridSearch_Train_RoC_Curve_20241102_141643.png
Classification report saved to /Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees_Parallelized_GridSearch_Train_Report_20241102_141643.txt

Accuracy: 0.9970

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    166071
           1       0.78      0.31      0.44       645

    accuracy                           1.00    166716
   macro avg       0.89      0.65      0.72    166716
weighted avg       1.00      1.00      1.00    166716


ROC AUC: 0.9168
Precision-Recall AUC: 0.5943