In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import geopy
from geopy.distance import geodesic
import pickle


In [2]:
import sys
sys.path.append('../references')  # Add the references folder to the system path


In [3]:
#model_specs = 'DecisionTrees_Parallelized_GridSearch_by_Top10Features'

# now with perform_grid_search_balanced
model_specs = 'DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features'

In [4]:
start_time_notebook = time.time()


In [5]:
start_time = time.time()


In [6]:
# Directory to save the figures 

input_src_dir = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/data/raw'
output_dir_figures_train = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/figures/train_figures'
output_dir_figures_test = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/figures/test_figures'


reports_output_dir_base = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports'
# reports_output_dir for DecisionTrees
reports_output_dir = f"{reports_output_dir_base}/DecisionTrees"
print(reports_output_dir)

/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees


In [7]:
# Define which dataset to use
use_test_data = False  # Set to True when using fraudtest.csv

# Determine dataset type based on the variable
dataset_type = 'Test' if use_test_data else 'Train'

# Load the appropriate dataset

if use_test_data:
    output_dir_figures = output_dir_figures_test
else:
    output_dir_figures = output_dir_figures_train

In [8]:
# Generate the preprocess file name dynamically
# Get the current timestamp
timestamp = time.strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS

logfile_title = 'LogFile'
logfile_name = f"{model_specs}_{dataset_type}_{logfile_title.replace(',', '').lower().split('.')[0]}_{timestamp}.txt"

logfile_path = os.path.join(reports_output_dir, logfile_name)

# Function to log times to a file
def log_time(step_name, start_time):
    end_time = time.time()
    elapsed_time = end_time - start_time
    log_message = (f"{step_name} completed at {time.ctime(end_time)}. "
                   f"Elapsed time: {elapsed_time // 60:.0f} minutes and {elapsed_time % 60:.2f} seconds\n")
    
    # Append log to file
    with open(logfile_path, 'a') as f:
        f.write(log_message)
    
    # Print the message to the console as well
    print(log_message)


In [9]:

log_time(f"{model_specs}_{dataset_type} Notebook  started at... ", start_time_notebook)
start_time = time.time()


DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train Notebook  started at...  completed at Tue Nov  5 09:37:49 2024. Elapsed time: 0 minutes and 0.04 seconds



In [10]:
# Load the appropriate dataset

if use_test_data:
    df = pd.read_csv(f"{input_src_dir}/fraudTest.csv")  # Concatenate the directory with the filename
else:
    df = pd.read_csv(f"{input_src_dir}/fraudTrain.csv")


In [11]:
print(df.columns)
print(df.shape)


Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')
(1296675, 23)


In [12]:
df.columns = df.columns.str.strip()
df = df.rename(columns={'amt': 'TransactionAmount', 'cc_num': 'CreditCardNumber', 'dob': 'DateOfBirth', 'trans_date_trans_time': 'TransactionTime'})
print(df.columns)


Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [13]:
# Generate a unique TransactionID for each row
df['TransactionID'] = range(1, len(df) + 1)


In [14]:
print(df.columns)
print(df.shape)


Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID'],
      dtype='object')
(1296675, 24)


In [15]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# no missing values

Missing values per column:
 Unnamed: 0           0
TransactionTime      0
CreditCardNumber     0
merchant             0
category             0
TransactionAmount    0
first                0
last                 0
gender               0
street               0
city                 0
state                0
zip                  0
lat                  0
long                 0
city_pop             0
job                  0
DateOfBirth          0
trans_num            0
unix_time            0
merch_lat            0
merch_long           0
is_fraud             0
TransactionID        0
dtype: int64


In [16]:
# Count of fraud and non-fraud transactions
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

# Optionally, you can get it in percentage terms
fraud_percentage = df['is_fraud'].value_counts(normalize=True) * 100
print(fraud_percentage)

is_fraud
0    1289169
1       7506
Name: count, dtype: int64
is_fraud
0    99.421135
1     0.578865
Name: proportion, dtype: float64


In [17]:
#how many unique credit cards in the data set ??
df['CreditCardNumber'].nunique()

983

In [18]:
print(df.columns)

Index(['Unnamed: 0', 'TransactionTime', 'CreditCardNumber', 'merchant',
       'category', 'TransactionAmount', 'first', 'last', 'gender', 'street',
       'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID'],
      dtype='object')


In [19]:
# Convert TransactionTime to datetime
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])

# Optional: Convert DateOfBirth to datetime, if needed
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')

In [20]:
# Set 'TransactionTime' as the index permanently
df.set_index('TransactionTime', inplace=True)

# Verify the index
print(df.index)


DatetimeIndex(['2019-01-01 00:00:18', '2019-01-01 00:00:44',
               '2019-01-01 00:00:51', '2019-01-01 00:01:16',
               '2019-01-01 00:03:06', '2019-01-01 00:04:08',
               '2019-01-01 00:04:42', '2019-01-01 00:05:08',
               '2019-01-01 00:05:18', '2019-01-01 00:06:01',
               ...
               '2020-06-21 12:08:42', '2020-06-21 12:09:22',
               '2020-06-21 12:10:56', '2020-06-21 12:11:23',
               '2020-06-21 12:11:36', '2020-06-21 12:12:08',
               '2020-06-21 12:12:19', '2020-06-21 12:12:32',
               '2020-06-21 12:13:36', '2020-06-21 12:13:37'],
              dtype='datetime64[ns]', name='TransactionTime', length=1296675, freq=None)


In [21]:
# Get the minimum and maximum transaction times from the index
min_time = df.index.min()
max_time = df.index.max()

print(f"Minimum Transaction Time: {min_time}")
print(f"Maximum Transaction Time: {max_time}")


Minimum Transaction Time: 2019-01-01 00:00:18
Maximum Transaction Time: 2020-06-21 12:13:37


In [22]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,CreditCardNumber,merchant,category,TransactionAmount,first,last,gender,street,city,...,long,city_pop,job,DateOfBirth,trans_num,unix_time,merch_lat,merch_long,is_fraud,TransactionID
TransactionTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:18,0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,1
2019-01-01 00:00:44,1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,2
2019-01-01 00:00:51,2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,3
2019-01-01 00:01:16,3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,4
2019-01-01 00:03:06,4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,5


In [23]:
log_time("Initial Steps Completed File Loading, Describe, Date Conversions etc..  ", start_time)
log_time("--------------------------------------------------- ------------------  ", start_time)

Initial Steps Completed File Loading, Describe, Date Conversions etc..   completed at Tue Nov  5 09:37:57 2024. Elapsed time: 0 minutes and 7.55 seconds

--------------------------------------------------- ------------------   completed at Tue Nov  5 09:37:57 2024. Elapsed time: 0 minutes and 7.55 seconds



# Feature Engineering

In [24]:
# Log pre-process time at various steps
start_time = time.time()


In [25]:
log_time("START - Feature Engineering .....  ", start_time)
start_time = time.time()

START - Feature Engineering .....   completed at Tue Nov  5 09:37:57 2024. Elapsed time: 0 minutes and 0.01 seconds



In [26]:

# Clip outliers if necessary
df['TransactionAmount'] = df['TransactionAmount'].clip(upper=df['TransactionAmount'].quantile(0.99))



In [27]:

# Replace inf values with NaN (in case they exist in the 'TransactionAmount' column)
df['TransactionAmount'].replace([np.inf, -np.inf], np.nan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TransactionAmount'].replace([np.inf, -np.inf], np.nan, inplace=True)


# next type of VIZ via transaction id vs transaction count


In [28]:
# Extract hour from TransactionTime
df['Hour'] = df.index.hour  # Since TransactionTime is already set as the index


In [29]:
# Calculate fraud rate by hour
fraud_rate_by_hour = df.groupby('Hour')['is_fraud'].mean()

# Sort by fraud rate in descending order
fraud_rate_by_hour = fraud_rate_by_hour.sort_values(ascending=False)

# Define a threshold for high-risk hours (adjust as needed)
threshold = fraud_rate_by_hour.mean()  # Mean fraud rate across all hours

# Dynamically identify high-risk hours based on the threshold
high_risk_hours = fraud_rate_by_hour[fraud_rate_by_hour > threshold].index.tolist()

# Print high-risk hours for reference
print("High-Risk Hours:", high_risk_hours)

# Create the HighRiskHour flag based on dynamically identified high-risk hours
df['HighRiskHour'] = df['Hour'].apply(lambda x: 1 if x in high_risk_hours else 0)

# Print a sample of the DataFrame to verify the new column
print(df[['Hour', 'HighRiskHour']])


High-Risk Hours: [22, 23, 1, 0, 2, 3]
                     Hour  HighRiskHour
TransactionTime                        
2019-01-01 00:00:18     0             1
2019-01-01 00:00:44     0             1
2019-01-01 00:00:51     0             1
2019-01-01 00:01:16     0             1
2019-01-01 00:03:06     0             1
...                   ...           ...
2020-06-21 12:12:08    12             0
2020-06-21 12:12:19    12             0
2020-06-21 12:12:32    12             0
2020-06-21 12:13:36    12             0
2020-06-21 12:13:37    12             0

[1296675 rows x 2 columns]


1. Time-Based Analysis:
Already explored daily and hourly trends in transaction volumes, but now dive deeper into fraud patterns based on time.



In [30]:
#Weekday vs. Weekend: Is fraud more common on weekdays or weekends?
df['DayOfWeek'] = df.index.dayofweek  # 0 = Monday, 6 = Sunday
fraud_by_day = df[df['is_fraud'] == 1]['DayOfWeek'].value_counts().sort_index()
non_fraud_by_day = df[df['is_fraud'] == 0]['DayOfWeek'].value_counts().sort_index()



In [31]:

# Define the correct day order
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']


df['DayName'] = df.index.day_name()
# Convert the 'DayName' column to a categorical type with the correct order
df['DayName'] = pd.Categorical(df['DayName'], categories=day_order, ordered=True)

fraud_by_day = df[df['is_fraud'] == 1]['DayName'].value_counts().sort_index()
non_fraud_by_day = df[df['is_fraud'] == 0]['DayName'].value_counts().sort_index()



In [32]:
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
weekend_fraud = df[df['is_fraud'] == 1]['IsWeekend'].mean()
weekend_non_fraud = df[df['is_fraud'] == 0]['IsWeekend'].mean()

print(f"Percentage of fraud on weekends: {weekend_fraud * 100:.2f}%")
print(f"Percentage of non-fraud on weekends: {weekend_non_fraud * 100:.2f}%")


Percentage of fraud on weekends: 32.55%
Percentage of non-fraud on weekends: 34.84%


In [33]:
log_time("Part1 - TrxAmount, Hour, DayOfWeeek etc..", start_time)
start_time = time.time()


Part1 - TrxAmount, Hour, DayOfWeeek etc.. completed at Tue Nov  5 09:37:59 2024. Elapsed time: 0 minutes and 2.69 seconds



In [34]:
import os
print(os.listdir())  # List all files in the current directory


['.DS_Store', 'v_1.2_DecisionTrees_OptFeat_Parallelized_GridSearch_Credit_Card_Fraud_Detection.ipynb', 'v_2.1_RandomForest_Balanced_SMOTE_Credit_Card_Fraud_Detection.ipynb', 'v_2.2_RandomForest_Balanced_SMOTE_GridSearch_Credit_Card_Fraud_Detection.ipynb', 'v_1.0.1_DecisionTrees_Parallelized_Credit_Card_Fraud_Detection.ipynb', 'bkp', 'v_0.0_LogisticRegression_Credit_Card_Fraud_Detection.ipynb', 'v_3.2_xgBoost_Credit_Card_Fraud_Detection.ipynb', '.gitkeep', 'v_3.1_xgBoost_SMOTE_Credit_Card_Fraud_Detection.ipynb', 'v_2.0.1_RandomForest_use_DASK_to_train_Model_Credit_Card_Fraud_Detection.ipynb', '__pycache__', 'v_0.1_LogisticRegression_Balanced_Credit_Card_Fraud_Detection.ipynb', 'v_1.1_DecisionTrees_Parallelized_GridSearch_Credit_Card_Fraud_Detection.ipynb', 'v_2.0.0_RandomForest_Credit_Card_Fraud_Detection.ipynb', '.ipynb_checkpoints', 'Project_Presentation_BigData.ipynb', 'Project_Presentation_ML.ipynb', 'v_1.0_DecisionTrees_Credit_Card_Fraud_Detection.ipynb']


# MULTIPROCESSING : distance

In [35]:
import pandas as pd
from geopy.distance import geodesic
import multiprocessing as mp
import numpy as np
import time
import sys
from distance_calculation import calculate_distance_chunk

start_time = time.time()


# Add the current working directory to the system path
sys.path.append(os.getcwd())

# Multiprocessing function to split the dataframe and apply the distance calculation
def parallel_distance_calculation(df, num_partitions=None):
    if num_partitions is None:
        num_partitions = mp.cpu_count()  # Use all available CPU cores
    
    # Split the dataframe into chunks
    df_split = np.array_split(df, num_partitions)
    
    # Create a multiprocessing Pool
    with mp.Pool(num_partitions) as pool:
        # Apply the calculate_distance_chunk function to each chunk in parallel
        result = pool.map(calculate_distance_chunk, df_split)
    
    # Concatenate the results back into a single dataframe
    return pd.concat(result)

# Main block to ensure multiprocessing works correctly
if __name__ == "__main__":
    start_time = time.time()

    # Assuming df has the columns ['lat', 'long', 'merch_lat', 'merch_long']
    
    # Run with limited number of cores (e.g., 4 cores)
    df = parallel_distance_calculation(df, num_partitions=4)  # Use 4 cores instead of all available cores

    # Log the time taken for distance calculation with multiprocessing
    log_time("Part2 - Distance Calculation with Multiprocessing (4 cores)", start_time)
    start_time = time.time()

    # Check the first few rows to verify the result
    print(df[['lat', 'long', 'merch_lat', 'merch_long', 'distance']].head())


  return bound(*args, **kwds)


Part2 - Distance Calculation with Multiprocessing (4 cores) completed at Tue Nov  5 09:39:12 2024. Elapsed time: 1 minutes and 12.41 seconds

                         lat      long  merch_lat  merch_long    distance
TransactionTime                                                          
2019-01-01 00:00:18  36.0788  -81.1781  36.011293  -82.048315   78.773821
2019-01-01 00:00:44  48.8878 -118.2105  49.159047 -118.186462   30.216618
2019-01-01 00:00:51  42.1808 -112.2620  43.150704 -112.154481  108.102912
2019-01-01 00:01:16  46.2306 -112.1138  47.034331 -112.561071   95.685115
2019-01-01 00:03:06  38.4207  -79.4629  38.674999  -78.632459   77.702395


In [36]:
import os
print(os.getcwd())  # This will print the current working directory


/Users/sadhvichandragiri/Desktop/coding/ZHAW_Project/ML_BigData_Repo_1/notebooks


log_time("Part2 -  Distance Calculation", start_time)
start_time = time.time()


In [37]:
# Check unique values in the 'is_fraud' column
df['is_fraud'].unique()


array([0, 1])

In [38]:
# Fraud vs Non-Fraud by Merchant Category
fraud_by_category = df[df['is_fraud'] == 1]['category'].value_counts().head(10)
non_fraud_by_category = df[df['is_fraud'] == 0]['category'].value_counts().head(10)



In [39]:
# Top 5 categories with the highest fraud counts
top_fraud_merchant_categories = df[df['is_fraud'] == 1]['category'].value_counts().head(5).index.tolist()

# Print top fraudulent categories
print("Top Fraudulent Merchant Categories:", top_fraud_merchant_categories)

# Create HighRiskMerchantCategory flag
df['HighRiskMerchantCategory'] = df['category'].apply(lambda x: 1 if x in top_fraud_merchant_categories else 0)



Top Fraudulent Merchant Categories: ['grocery_pos', 'shopping_net', 'misc_net', 'shopping_pos', 'gas_transport']


In [40]:
# Print the count of 1s and 0s in HighRiskMerchantCategory
print(df['HighRiskMerchantCategory'].value_counts())


HighRiskMerchantCategory
0    763876
1    532799
Name: count, dtype: int64


# Potential Additional Features:
Transaction Frequency:
    Feature: How often a credit card has been used within a specific time frame (e.g., last hour or day).
    Why: Fraudsters often make rapid successive transactions within short periods. You could create a rolling window to calculate transaction frequency.
    How: You could calculate the number of transactions within the past X hours/days using a rolling window on the TransactionTime feature.

#age group

In [41]:
import pandas as pd

# Ensure 'DateOfBirth' is in datetime format
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')  # Handle errors during conversion

# Step 1: Calculate Age
# Calculate age in years
df['Age'] = (pd.Timestamp.now() - df['DateOfBirth']).dt.days // 365  # Age in years

# Step 2: Create Age Groups
# Define age bins and labels
bins = [0, 18, 25, 35, 45, 55, 65, 100]  # Define your age bins, ensuring to cover all possible ages
labels = ['0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+']  # Corresponding labels

# Create age group feature, include NaN values handling
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False, include_lowest=True)

# Verify the new features without truncating DataFrame
#print(df[['DateOfBirth', 'Age', 'AgeGroup']].head(10))  # Display the first 10 entries


In [42]:
log_time("Part3 - Merchant Categories & Age group", start_time)
start_time = time.time()


Part3 - Merchant Categories & Age group completed at Tue Nov  5 09:39:13 2024. Elapsed time: 0 minutes and 1.33 seconds



# MULTIPROCESSING : count_transactions_within_last_hour

In [43]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import time
from transaction_frequency import process_chunk  # Import from the .py file

# Multiprocessing function to parallelize the transaction counting
def parallel_count_transactions(df, num_partitions=None):
    if num_partitions is None:
        num_partitions = mp.cpu_count()  # Use all available CPU cores
    
    # Ensure the index is a datetime
    df.index = pd.to_datetime(df.index)
    
    # Split the dataframe into chunks based on the number of partitions (CPU cores)
    df_split = np.array_split(df, num_partitions)
    
    # Create a multiprocessing Pool
    with mp.Pool(num_partitions) as pool:
        # Apply the processing function to each chunk in parallel
        result = pool.map(process_chunk, df_split)
    
    # Combine the results from each chunk into a single series, reset index for consistency
    return pd.concat(result).reset_index(drop=True)

# Assuming df has 'CreditCardNumber' as a column and transaction times are indexed
if __name__ == "__main__":
    start_time = time.time()

    # Apply the parallel processing for transaction frequency counting
    df['TransactionFrequency'] = parallel_count_transactions(df, num_partitions=4)  # Adjust num_partitions as needed

    # Log the time taken for transaction frequency calculation with multiprocessing
    log_time("Part4 - TransactionFrequency Multiprocessing", start_time)
    start_time = time.time()

    # Check the first 10 rows
    print(df[['TransactionFrequency']].head(10))


  return bound(*args, **kwds)


Part4 - TransactionFrequency Multiprocessing completed at Tue Nov  5 09:41:35 2024. Elapsed time: 2 minutes and 21.97 seconds

                    TransactionFrequency
TransactionTime                         
2019-01-01 00:00:18                  NaN
2019-01-01 00:00:44                  NaN
2019-01-01 00:00:51                  NaN
2019-01-01 00:01:16                  NaN
2019-01-01 00:03:06                  NaN
2019-01-01 00:04:08                  NaN
2019-01-01 00:04:42                  NaN
2019-01-01 00:05:08                  NaN
2019-01-01 00:05:18                  NaN
2019-01-01 00:06:01                  NaN


In [44]:
df.index = pd.to_datetime(df.index)
print(df.index)

DatetimeIndex(['2019-01-01 00:00:18', '2019-01-01 00:00:44',
               '2019-01-01 00:00:51', '2019-01-01 00:01:16',
               '2019-01-01 00:03:06', '2019-01-01 00:04:08',
               '2019-01-01 00:04:42', '2019-01-01 00:05:08',
               '2019-01-01 00:05:18', '2019-01-01 00:06:01',
               ...
               '2020-06-21 12:08:42', '2020-06-21 12:09:22',
               '2020-06-21 12:10:56', '2020-06-21 12:11:23',
               '2020-06-21 12:11:36', '2020-06-21 12:12:08',
               '2020-06-21 12:12:19', '2020-06-21 12:12:32',
               '2020-06-21 12:13:36', '2020-06-21 12:13:37'],
              dtype='datetime64[ns]', name='TransactionTime', length=1296675, freq=None)


In [45]:
# Resample the data to count transactions every hour
transaction_counts_hourly = df.resample('H').size()
transaction_counts_daily = df.resample('D').size()

# Combine with CreditCardNumber if necessary
transaction_counts = df.groupby('CreditCardNumber').resample('H').size().reset_index(name='TransactionCount')
print(transaction_counts.head(10))

  transaction_counts_hourly = df.resample('H').size()
  transaction_counts = df.groupby('CreditCardNumber').resample('H').size().reset_index(name='TransactionCount')


   CreditCardNumber     TransactionTime  TransactionCount
0       60416207185 2019-01-01 12:00:00                 1
1       60416207185 2019-01-01 13:00:00                 0
2       60416207185 2019-01-01 14:00:00                 0
3       60416207185 2019-01-01 15:00:00                 0
4       60416207185 2019-01-01 16:00:00                 0
5       60416207185 2019-01-01 17:00:00                 0
6       60416207185 2019-01-01 18:00:00                 0
7       60416207185 2019-01-01 19:00:00                 0
8       60416207185 2019-01-01 20:00:00                 0
9       60416207185 2019-01-01 21:00:00                 0


In [46]:
total_transactions = df.groupby('CreditCardNumber').size().reset_index(name='TotalTransactionCount')
print(total_transactions.head(10))


   CreditCardNumber  TotalTransactionCount
0       60416207185                   1518
1       60422928733                   1531
2       60423098130                    510
3       60427851591                    528
4       60487002085                    496
5       60490596305                   1010
6       60495593109                    518
7      501802953619                   1559
8      501818133297                      8
9      501828204849                    515


In [47]:
# Calculate the time difference between consecutive transactions
time_diff = df.index.to_series().diff().dt.total_seconds()
# Flag rapid transactions (within 5 minutes)
df['RapidTransactionFlag'] = time_diff < 60  # For a 1-minute threshold

# Create a temporary DataFrame for rapid transactions
rapid_transactions = df[df['RapidTransactionFlag']]

# Group by date and count the number of rapid transactions
rapid_transaction_counts = rapid_transactions.groupby(rapid_transactions.index.date).size()
print(rapid_transaction_counts)

# Get a summary of the rapid transactions
rapid_transactions_summary = rapid_transactions.describe()
print(rapid_transactions_summary)


2019-01-01    1960
2019-01-02     606
2019-01-03     701
2019-01-04     952
2019-01-05     879
              ... 
2020-06-17    1327
2020-06-18    1547
2020-06-19    1966
2020-06-20    1910
2020-06-21    1142
Length: 537, dtype: int64
         Unnamed: 0  CreditCardNumber  TransactionAmount           zip  \
count  1.061083e+06      1.061083e+06       1.061083e+06  1.061083e+06   
mean   6.541777e+05      4.168739e+17       6.463842e+01  4.881377e+04   
min    1.000000e+00      6.041621e+10       1.000000e+00  1.257000e+03   
25%    3.399115e+05      1.800429e+14       9.500000e+00  2.623700e+04   
50%    6.543810e+05      3.521417e+15       4.635000e+01  4.817400e+04   
75%    9.639775e+05      4.642255e+15       8.223000e+01  7.204200e+04   
max    1.296674e+06      4.992346e+18       5.459926e+02  9.978300e+04   
std    3.688933e+05      1.308438e+18       8.254229e+01  2.689496e+04   

                lat          long      city_pop  \
count  1.061083e+06  1.061083e+06  1.061083e+06

In [48]:
print(df.columns)


Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag'],
      dtype='object')


In [49]:
log_time("Part5 - RapidTransactionFlag", start_time)
start_time = time.time()


Part5 - RapidTransactionFlag completed at Tue Nov  5 09:41:45 2024. Elapsed time: 0 minutes and 10.24 seconds



Transaction Amount Features:
Log Transaction Amount: Normalize the TransactionAmount by taking its logarithm to reduce skewness.
Transaction Amount Flags: Create binary flags for high-value transactions (e.g., if TransactionAmount exceeds a certain threshold).

In [50]:

# Sample DataFrame creation
# Assume 'df' is your DataFrame and has a 'TransactionAmount' column
# df = pd.read_csv('your_data.csv')  # Load your actual data

# Step 1: Log Transaction Amount
# Calculate the log of TransactionAmount
df['LogTransactionAmount'] = np.log1p(df['TransactionAmount'])  # Use log1p for stability with 0 values

# Step 2: Create Transaction Amount Flags
# Define a threshold for high-value transactions
threshold = 100  # Adjust the threshold based on your data context

# Create a flag for high-value transactions
df['HighValueTransactionFlag'] = df['TransactionAmount'] > threshold

# Verify the new features
print(df[['TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag']].head(10))  # Display the first 10 entries


                     TransactionAmount  LogTransactionAmount  \
TransactionTime                                                
2019-01-01 00:00:18               4.97              1.786747   
2019-01-01 00:00:44             107.23              4.684259   
2019-01-01 00:00:51             220.11              5.398660   
2019-01-01 00:01:16              45.00              3.828641   
2019-01-01 00:03:06              41.96              3.760269   
2019-01-01 00:04:08              94.63              4.560487   
2019-01-01 00:04:42              44.54              3.818591   
2019-01-01 00:05:08              71.65              4.285653   
2019-01-01 00:05:18               4.27              1.662030   
2019-01-01 00:06:01             198.39              5.295263   

                     HighValueTransactionFlag  
TransactionTime                                
2019-01-01 00:00:18                     False  
2019-01-01 00:00:44                      True  
2019-01-01 00:00:51                    

Behavioral Features:
Count of Transactions in Last X Days: Count how many transactions have occurred in the last 7, 14, or 30 days.
Average Transaction Amount in Last X Days: Calculate the average transaction amount over the same periods.

In [51]:
import pandas as pd

# Assuming 'TransactionTime' is already set as the index and in datetime format

# Step 1: Count of Transactions in Last X Days
for days in [7, 14, 30]:
    # Sort data by CreditCardNumber and TransactionTime to ensure rolling works properly
    df = df.sort_values(by=['CreditCardNumber', 'TransactionTime'])
    
    # Apply rolling and count the number of transactions for each card
    df[f'TransactionCountLast{days}Days'] = (
        df.groupby('CreditCardNumber')['CreditCardNumber']
        .rolling(f'{days}D')
        .count()
        .reset_index(level=0, drop=True)
    )

# Step 2: Average Transaction Amount in Last X Days
for days in [7, 14, 30]:
    # Sort data by CreditCardNumber and TransactionTime to ensure rolling works properly
    df = df.sort_values(by=['CreditCardNumber', 'TransactionTime'])
    
    # Calculate the average transaction amount for each credit card in the last X days
    df[f'AverageTransactionAmountLast{days}Days'] = (
        df.groupby('CreditCardNumber')['TransactionAmount']
        .rolling(f'{days}D')
        .mean()
        .reset_index(level=0, drop=True)
    )

# Verify the new features
print(df[['TransactionCountLast7Days', 'TransactionCountLast14Days', 'TransactionCountLast30Days',
           'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 'AverageTransactionAmountLast30Days']].head(10))


                     TransactionCountLast7Days  TransactionCountLast14Days  \
TransactionTime                                                              
2019-01-01 12:47:15                        1.0                         1.0   
2019-01-02 08:44:57                        2.0                         2.0   
2019-01-02 08:47:36                        3.0                         3.0   
2019-01-02 12:38:14                        4.0                         4.0   
2019-01-02 13:10:46                        5.0                         5.0   
2019-01-03 13:56:35                        6.0                         6.0   
2019-01-03 17:05:10                        7.0                         7.0   
2019-01-04 13:59:55                        8.0                         8.0   
2019-01-04 21:17:22                        9.0                         9.0   
2019-01-05 00:42:24                       10.0                        10.0   

                     TransactionCountLast30Days  \
TransactionT

In [52]:
print(df.columns)  # Display all columns in the DataFrame


Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag', 'LogTransactionAmount',
       'HighValueTransactionFlag', 'TransactionCountLast7Days',
       'TransactionCountLast14Days', 'TransactionCountLast30Days',
       'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days'],
      dtype='object')


In [53]:
log_time("Part6 - TransactionCountLast_X_Days & AverageTrxAmountLast_X_Days", start_time)
start_time = time.time()


Part6 - TransactionCountLast_X_Days & AverageTrxAmountLast_X_Days completed at Tue Nov  5 09:41:58 2024. Elapsed time: 0 minutes and 12.39 seconds



# Graph Construction with NetworkX:

Highlight Fraudulent Nodes: Overlay of fraudulent and non-fraudulent credit cards on this degree distribution to see if there’s a difference in their degrees.

In [54]:


# Create an empty graph
G = nx.Graph()

# Add edges between credit cards and merchants, including transaction amount as an edge attribute
for idx, row in df.iterrows():
    credit_card = str(row['CreditCardNumber'])
    merchant = str(row['merchant'])
    transaction_amount = row['TransactionAmount']  # Ensure TransactionAmount exists in your dataframe
    
    # Add an edge with the transaction amount as an attribute
    G.add_edge(credit_card, merchant, transaction_amount=transaction_amount)


# Calculate degrees for all nodes in the graph
degrees = dict(G.degree())

# Filter degrees for credit cards and merchants
credit_card_nodes = df['CreditCardNumber'].astype(str).unique()
merchant_nodes = df['merchant'].astype(str).unique()

credit_card_degrees = {node: degrees[node] for node in credit_card_nodes if node in degrees}
merchant_degrees = {node: degrees[node] for node in merchant_nodes if node in degrees}

# Debugging: Print counts to ensure correctness
print(f"Number of unique credit card nodes: {len(credit_card_nodes)}")
print(f"Number of unique merchant nodes: {len(merchant_nodes)}")
print(f"Number of credit card nodes with degrees: {len(credit_card_degrees)}")
print(f"Number of merchant nodes with degrees: {len(merchant_degrees)}")

# Create a new DataFrame for easier plotting
degree_df = pd.DataFrame({
    'CreditCardDegree': pd.Series(credit_card_degrees),
    'MerchantDegree': pd.Series(merchant_degrees)
})




Number of unique credit card nodes: 983
Number of unique merchant nodes: 693
Number of credit card nodes with degrees: 983
Number of merchant nodes with degrees: 693


In [55]:
# Add degree information back to the original DataFrame
df['degree'] = df['CreditCardNumber'].astype(str).map(credit_card_degrees)



In [56]:
# Check edges and their attributes
#for edge in G.edges(data=True):
#    print(edge)

#do NOT print this, huge list


In [57]:
df['CreditCardNumber'] = df['CreditCardNumber'].astype(str)


In [58]:
fraud_mapping = df.set_index('CreditCardNumber')['is_fraud'].to_dict()


In [59]:
log_time("Part7 - NetworkX Start Step", start_time)
start_time = time.time()


Part7 - NetworkX Start Step completed at Tue Nov  5 09:43:17 2024. Elapsed time: 1 minutes and 19.41 seconds



In [60]:
#print(fraud_mapping.head(5))
#only testing purposes

# MULTIPROCESSING : betweenness_centrality

In [61]:
import networkx as nx
import time
from networkx_graph_betweeness_centrality import parallel_betweenness_centrality

# Assuming G is your graph
if __name__ == "__main__":
    start_time = time.time()

    # Calculate betweenness centrality using parallel processing
    betweenness_centrality = parallel_betweenness_centrality(G, num_partitions=4)  

    # Log the time taken for betweenness centrality calculation with multiprocessing
    log_time("Part8 - Betweenness Centrality Calculation with Multiprocessing", start_time)
    start_time = time.time()

    # Check a few centrality values
    print(list(betweenness_centrality.items())[:10])


Part8 - Betweenness Centrality Calculation with Multiprocessing completed at Tue Nov  5 09:45:59 2024. Elapsed time: 2 minutes and 41.34 seconds

[('60416207185', 3.903743365141523e-05), ('fraud_Jones, Sawayn and Romaguera', 0.00016718121427278148), ('fraud_Berge LLC', 0.0003731639453736409), ('fraud_Luettgen PLC', 0.0002880918347168189), ('fraud_Daugherty LLC', 0.00026296051041424566), ('fraud_Beier and Sons', 0.00025675424900787366), ('fraud_Stamm-Witting', 0.00021600459190031658), ('fraud_Conroy-Emard', 0.00021409812196013464), ('fraud_Pollich LLC', 0.00029566515903354184), ('fraud_Monahan-Morar', 0.0002101294467358623)]


In [62]:
df['betweenness_centrality'] = df['CreditCardNumber'].map(betweenness_centrality)


In [63]:
print(df['betweenness_centrality'].describe())
print(df['betweenness_centrality'].isna().sum())  # Check for missing values


count    1.296675e+06
mean     4.163569e-05
std      1.178525e-05
min      2.728628e-09
25%      3.630433e-05
50%      4.399435e-05
75%      5.147284e-05
max      5.736748e-05
Name: betweenness_centrality, dtype: float64
0


In [64]:
# Check betweenness centrality for specific credit card numbers
sample_nodes = ['60416207185', 'fraud_Kutch-Ferry']  # Replace with actual nodes
for node in sample_nodes:
    print(f"{node}: {betweenness_centrality.get(node)}")


60416207185: 3.903743365141523e-05
fraud_Kutch-Ferry: 0.00026288184139774875


1. Investigate Nodes with High Betweenness Centrality:

Now that you’ve visualized nodes with high betweenness centrality, you can:

    Examine if fraudulent nodes tend to have high betweenness centrality. This might indicate that these nodes are acting as "connectors" between different parts of the network, which could be a sign of suspicious behavior.
    Compare centrality between fraud and non-fraud nodes to see if there's a pattern.



2. Visualize Communities in the Network:

You could apply community detection to uncover fraud rings or clusters of merchants targeted by fraudsters. The Louvain algorithm is great for this.

In [65]:
import community.community_louvain as community_louvain


# Apply Louvain method for community detection
partition = community_louvain.best_partition(G)



Fraud Node Highlighting:

    Fraudulent nodes (from df['is_fraud'] == 1) are colored red to make them stand out. The rest of the nodes are still colored based on their communities.
    This should help you easily spot any fraudulent nodes in the network.

Top 10 Most Central Nodes:

    We calculate betweenness centrality and extract the top 10 most central nodes.
    These nodes are visualized with their connections, which should help declutter the graph and focus on the key players in the transaction network.

In [66]:

# Apply Louvain method for community detection
partition = community_louvain.best_partition(G)

# Create positions for nodes using a spring layout
pos = nx.spring_layout(G)

# Add the community information to the DataFrame
df['community'] = df['CreditCardNumber'].map(partition)

# Highlight fraud nodes separately
fraud_nodes = df[df['is_fraud'] == 1]['CreditCardNumber'].values




In [67]:
# Add the community information to the DataFrame
df['community'] = df['CreditCardNumber'].map(partition)

# Calculate the percentage of fraud in each community
community_fraud = df.groupby('community')['is_fraud'].mean()



In [68]:

# Print fraud rate per community
print(community_fraud)


community
0    0.003945
1    0.010391
2    0.006325
Name: is_fraud, dtype: float64


In [69]:
community_size = df.groupby('community').size()
print(community_size)


community
0    668641
1    220292
2    407742
dtype: int64


In [70]:
# Combine fraud rates and community sizes into a single DataFrame
fraud_vs_size = pd.concat([community_fraud, df.groupby('community').size()], axis=1)
fraud_vs_size.columns = ['FraudRate', 'CommunitySize']



In [71]:
top_fraud_communities = community_fraud.sort_values(ascending=False).head(5)
print(top_fraud_communities)


community
1    0.010391
2    0.006325
0    0.003945
Name: is_fraud, dtype: float64


In [72]:
# Get the community labels of the top fraud communities
top_community_labels = top_fraud_communities.index.tolist()

# Filter the DataFrame for only the top fraud communities
top_communities_df = df[df['community'].isin(top_community_labels)]


In [73]:
# Show Only the Top Merchants by Fraud Rate:
# Instead of displaying all merchants, you can filter the plot to show only the top 10 or 20 merchants with the highest fraud rates.

# Calculate fraud rate by merchant in the top fraud communities
merchant_fraud_rate = top_communities_df.groupby('merchant')['is_fraud'].mean()

# Sort merchants by fraud rate in descending order
top_merchants = merchant_fraud_rate.sort_values(ascending=False).head(10)

# Print top 10 merchants with highest fraud rate
print(top_merchants)


merchant
fraud_Kozey-Boehm                       0.025723
fraud_Herman, Treutel and Dickens       0.025385
fraud_Kerluke-Abshire                   0.022307
fraud_Brown PLC                         0.022109
fraud_Goyette Inc                       0.021616
fraud_Terry-Huel                        0.021543
fraud_Jast Ltd                          0.021505
fraud_Schmeler, Bashirian and Price     0.020833
fraud_Boyer-Reichert                    0.019916
fraud_Langworth, Boehm and Gulgowski    0.019807
Name: is_fraud, dtype: float64


In [74]:

# Assuming 'category' is a column representing merchant categories
merchantcategory_fraud = top_communities_df.groupby('category')['is_fraud'].mean()

# Sort the fraud rate by merchant category in descending order
merchantcategory_fraud_sorted = merchantcategory_fraud.sort_values(ascending=False)


In [75]:

log_time("Part9 - Community & Top Merchants", start_time)
start_time = time.time()


Part9 - Community & Top Merchants completed at Tue Nov  5 09:47:04 2024. Elapsed time: 1 minutes and 4.93 seconds



In [76]:
# Check the density of the graph (a measure of sparsity)
density = nx.density(G)
print(f"Graph Density: {density}")


Graph Density: 0.34130445623909095


In [77]:
# Calculate and print the average degree
degree_sequence = [degree for node, degree in G.degree()]
average_degree = sum(degree_sequence) / len(degree_sequence)
print(f"Average Degree of Nodes: {average_degree}")


Average Degree of Nodes: 571.6849642004773


In [78]:
log_time("Part10 - Density", start_time)
start_time = time.time()


Part10 - Density completed at Tue Nov  5 09:47:04 2024. Elapsed time: 0 minutes and 0.02 seconds



In [79]:
print(df.columns)

Index(['Unnamed: 0', 'CreditCardNumber', 'merchant', 'category',
       'TransactionAmount', 'first', 'last', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'DateOfBirth',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'TransactionID', 'Hour', 'HighRiskHour', 'DayOfWeek', 'DayName',
       'IsWeekend', 'distance', 'HighRiskMerchantCategory', 'Age', 'AgeGroup',
       'TransactionFrequency', 'RapidTransactionFlag', 'LogTransactionAmount',
       'HighValueTransactionFlag', 'TransactionCountLast7Days',
       'TransactionCountLast14Days', 'TransactionCountLast30Days',
       'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days', 'degree',
       'betweenness_centrality', 'community'],
      dtype='object')


In [80]:
selected_features = [
    'TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag',
    'TransactionCountLast7Days', 'TransactionCountLast14Days', 'TransactionCountLast30Days',
    'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 'AverageTransactionAmountLast30Days',
    'Hour', 'HighRiskHour', 'DayOfWeek', 'IsWeekend', 'TransactionFrequency', 'RapidTransactionFlag',
    'lat', 'long', 'merch_lat', 'merch_long', 'distance', 'city_pop',
    'Age', 'AgeGroup', 'gender', 'state', 'city',
    'degree', 'betweenness_centrality', 'community'
]

df_selected_features = df[selected_features]


# Page rank as new feature

In [81]:
# Calculate PageRank for each node in the graph
pagerank = nx.pagerank(G)

# Map the PageRank values to the 'CreditCardNumber' in the DataFrame
df['pagerank'] = df['CreditCardNumber'].map(pagerank)


In [82]:
# Check for NaN values in the pagerank column
print(df['pagerank'].isna().sum())


0


In [83]:
# Check descriptive statistics of pagerank values
print(df['pagerank'].describe())



count    1.296675e+06
mean     5.971670e-04
std      7.856303e-05
min      9.478567e-05
25%      5.812104e-04
50%      6.195727e-04
75%      6.566711e-04
max      6.834763e-04
Name: pagerank, dtype: float64


In [84]:
# Check how many nodes have a PageRank of zero
zero_pagerank_count = (df['pagerank'] == 0).sum()
print(f"Number of nodes with zero PageRank: {zero_pagerank_count}")


Number of nodes with zero PageRank: 0


In [85]:
# Compare the average PageRank for fraud and non-fraud transactions
fraud_avg_pagerank = df[df['is_fraud'] == 1]['pagerank'].mean()
non_fraud_avg_pagerank = df[df['is_fraud'] == 0]['pagerank'].mean()

print(f"Average PageRank for Fraud: {fraud_avg_pagerank}")
print(f"Average PageRank for Non-Fraud: {non_fraud_avg_pagerank}")


Average PageRank for Fraud: 0.000505843602201983
Average PageRank for Non-Fraud: 0.0005976986758972222


In [86]:
selected_features.append('pagerank')
df_selected_features = df[selected_features]


In [87]:
print(df_selected_features.columns)

Index(['TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag',
       'TransactionCountLast7Days', 'TransactionCountLast14Days',
       'TransactionCountLast30Days', 'AverageTransactionAmountLast7Days',
       'AverageTransactionAmountLast14Days',
       'AverageTransactionAmountLast30Days', 'Hour', 'HighRiskHour',
       'DayOfWeek', 'IsWeekend', 'TransactionFrequency',
       'RapidTransactionFlag', 'lat', 'long', 'merch_lat', 'merch_long',
       'distance', 'city_pop', 'Age', 'AgeGroup', 'gender', 'state', 'city',
       'degree', 'betweenness_centrality', 'community', 'pagerank'],
      dtype='object')


In [88]:
print(df.shape)

(1296675, 46)


In [89]:
log_time("Part11 - PageRank", start_time)
start_time = time.time()


Part11 - PageRank completed at Tue Nov  5 09:47:06 2024. Elapsed time: 0 minutes and 2.28 seconds



In [90]:
# Decision Trees start

In [91]:
log_time("END - Feature Engineering .....  ", start_time)
start_time = time.time()

END - Feature Engineering .....   completed at Tue Nov  5 09:47:06 2024. Elapsed time: 0 minutes and 0.01 seconds



In [92]:
log_time(f"{model_specs}_{dataset_type} START Model ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train START Model ....   completed at Tue Nov  5 09:47:06 2024. Elapsed time: 0 minutes and 0.00 seconds



In [93]:
import importlib
import decisiontree_gridsearch_modelutils
importlib.reload(decisiontree_gridsearch_modelutils)




<module 'decisiontree_gridsearch_modelutils' from '/Users/sadhvichandragiri/Desktop/coding/ZHAW_Project/ML_BigData_Repo_1/notebooks/../references/decisiontree_gridsearch_modelutils.py'>

In [94]:
print(model_specs)
#decisiontrees_parallelized_gridsearch_Train.pkl

DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features


In [95]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc
from decisiontree_gridsearch_modelutils import parallel_one_hot_encode, preprocessdata_parallel_onehot_impute
from decisiontree_gridsearch_modelutils import perform_grid_search_balanced, perform_grid_search, generate_classification_report_and_roc, save_decision_tree_viz

sample_fraction = 1.0  # Fraction to sample for memory efficiency


# Parameters
param_grid = {
    'max_depth': [10, 20], 
    'min_samples_split': [20, 50], 
    'min_samples_leaf': [2, 5], 
    'criterion': ['gini']
}



# Step 2: Train Model with Grid Search
def train_model(X, y, param_grid):
    #calling  perform_grid_search from decisiontree_gridsearch_modelutils.py
    # best_model = perform_grid_search(X, y, param_grid)
    best_model = perform_grid_search_balanced(X, y, param_grid) 
    return best_model

def save_trained_model(model_name):
    output_dir_model = '/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models'
    # Define model filename
    model_outputfilename = f"{model_specs.replace(' ', '_').replace(',', '').lower()}.pkl"

    # Assuming best_rf_model is your final or best model
    try:
        # Save the model
        with open(os.path.join(output_dir_model, model_outputfilename), 'wb') as model_file:
            pickle.dump(model_name, model_file)

        print(f"Model saved to {os.path.join(output_dir_model, model_outputfilename)}")

    except NameError:
        print(f"Error: {model_specs} is not defined. Please ensure the model is assigned before saving.")

# Load the trained model
def load_trained_model(model_specs, dataset_type, output_dir_model='/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/models'):
    """
    Loads a trained model from the specified directory.
    
    Args:
        model_specs (str): Specifications or name for the model.
        dataset_type (str): Indicates if it’s for 'Train' or 'Test'.
        output_dir_model (str): Directory where the model is saved.
        
    Returns:
        model: The loaded model.
    """
    model_outputfilename = f"{model_specs.replace(' ', '_').replace(',', '').lower()}.pkl"
    model_path = os.path.join(output_dir_model, model_outputfilename)

    # Load the model if it exists
    if os.path.exists(model_path):
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        print(f"Model loaded from {model_path}")
        return model
    else:
        print(f"Model file not found at {model_path}")
        return None
    

# Step 4: Main Execution
def main(df, features_by_importance, param_grid, model_specs, reports_output_dir, use_test_data=False, sample_fraction=1.0):
    
    # Preprocess the data
    X, y, imp_feature_names = preprocessdata_parallel_onehot_impute(df, features_by_importance)
    dataset_type = 'Test' if use_test_data else 'Train'
    
    if use_test_data:
        # Load pre-trained model for test predictions
        loaded_model = load_trained_model(model_specs, dataset_type)
        if loaded_model:
            # Generate report and ROC for loaded model on test data
            generate_classification_report_and_roc(loaded_model, X, y, dataset_type, model_specs, reports_output_dir)
            return None, imp_feature_names
        else:
            print("No pre-trained model found.")
            return None, imp_feature_names
    else:
        # Perform training with grid search on train data
        best_model = train_model(X, y, param_grid)
        # Generate report and ROC for best model
        generate_classification_report_and_roc(best_model, X, y, dataset_type, model_specs, reports_output_dir)
        
        # Feature importances
        feature_importances = best_model.feature_importances_
        importance_df = pd.DataFrame({'Feature': imp_feature_names, 'Importance': feature_importances})
        print(importance_df.sort_values(by='Importance', ascending=False))

        # Save the trained model
        save_trained_model(best_model)

        return best_model, imp_feature_names


In [96]:
print(reports_output_dir)

/Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees


In [102]:
# Example Usage

# Assuming `df` is your data DataFrame and includes the 'is_fraud' target column

features_by_importance = [
    'TransactionAmount', 'LogTransactionAmount', 'HighValueTransactionFlag',
    'TransactionCountLast7Days', 'TransactionCountLast14Days', 'TransactionCountLast30Days',
    'AverageTransactionAmountLast7Days', 'AverageTransactionAmountLast14Days', 'AverageTransactionAmountLast30Days',
    'Hour', 'HighRiskHour', 'DayOfWeek', 'IsWeekend', 'TransactionFrequency', 'RapidTransactionFlag',
    'lat', 'long', 'merch_lat', 'merch_long', 'distance', 'city_pop',
    'Age', 'AgeGroup', 'gender', 'state', 'city',
    'degree', 'betweenness_centrality', 'community'
]

if use_test_data:
    _, imp_feature_names = main(df, features_by_importance, param_grid, model_specs, reports_output_dir, use_test_data=True)
else:
    best_model, imp_feature_names = main(df, features_by_importance, param_grid, model_specs, reports_output_dir, use_test_data=False)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=50; total time=   3.3s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=20; total time=   3.2s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=50; total time=   3.7s




[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=50; total time=   3.4s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=50; total time=   3.3s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=50; total time=   3.5s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=50; total time=   3.7s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=20; total time=   3.4s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=5, min_samples_split=20; total time=   3.5s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=20; total time=   3.4s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=50; total time=   3.6s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=5, min_samples_split=20; total time=   3.6s
[CV] END criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=20; total time=   3.5s


KeyboardInterrupt: 

In [98]:
log_time(f"{model_specs}_{dataset_type} END Model ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train END Model ....   completed at Tue Nov  5 09:47:27 2024. Elapsed time: 0 minutes and 20.72 seconds



In [99]:

# Generate a unique timestamp for the filename
timestamp = time.strftime("%Y%m%d_%H%M%S")
decisiontree_gridsearch_viz_filename = f"{model_specs}_{dataset_type}_decision_tree_balanced_gridsearch_viz_{timestamp}.png"
decisiontree_gridsearch_viz_path = os.path.join(reports_output_dir, decisiontree_gridsearch_viz_filename)

class_names = ["Non-Fraud", "Fraud"]

if not use_test_data:
    # Use the feature names returned from `main` to ensure consistency with the model
    save_decision_tree_viz(model=best_model, feature_names=imp_feature_names, class_names=class_names, output_path=decisiontree_gridsearch_viz_path)


Decision Tree visualization saved to /Users/sadhvichandragiri/desktop/coding/ZHAW_Project/ML_BigData_Repo_1/reports/DecisionTrees/DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train_decision_tree_balanced_gridsearch_viz_20241105_094727.png


1. Load and Preprocess the Data for the Decision Tree model

In [100]:
log_time(f"{model_specs}_{dataset_type} saved the descision tree ....  ", start_time)
start_time = time.time()

DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train saved the descision tree ....   completed at Tue Nov  5 09:47:50 2024. Elapsed time: 0 minutes and 23.48 seconds



In [101]:
import os
import time

# Assuming start_time is defined earlier in the notebook
end_time_notebook = time.time()
elapsed_time = end_time_notebook - start_time_notebook

# Print and format the notebook end time and total execution time
print(f"Notebook ended at: {time.ctime(end_time_notebook)}")
print(f"Total execution time: {elapsed_time // 60:.0f} minutes and {elapsed_time % 60:.2f} seconds")


log_time(f"{model_specs}_{dataset_type} Notebook Ended at... ", start_time_notebook)


Notebook ended at: Tue Nov  5 09:47:50 2024
Total execution time: 10 minutes and 0.93 seconds
DecisionTrees_Parallelized_GridSearch_Balanced_by_Top10Features_Train Notebook Ended at...  completed at Tue Nov  5 09:47:50 2024. Elapsed time: 10 minutes and 0.93 seconds



# Results of Top 10 features using Grid Search

# Results of Top 10 features using Grid Search BALANCED