In [292]:
## importing the important libraries
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import recall_score, classification_report, accuracy_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [293]:
df = pd.read_csv('creditcard.csv')
print("Initial data shape:", df.shape)

Initial data shape: (1048575, 23)


In [294]:
print("\nFirst few rows:")
print(df.head())


First few rows:
   Index trans_date_trans_time        cc_num  \
0      0         1/1/2019 0:00  2.703190e+15   
1      1         1/1/2019 0:00  6.304230e+11   
2      2         1/1/2019 0:00  3.885950e+13   
3      3         1/1/2019 0:01  3.534090e+15   
4      4         1/1/2019 0:03  3.755340e+14   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48.8878 -118.2105   
2  Sanchez      M 

In [295]:
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Index                  1048575 non-null  int64  
 1   trans_date_trans_time  1048575 non-null  object 
 2   cc_num                 1048575 non-null  float64
 3   merchant               1048575 non-null  object 
 4   category               1048575 non-null  object 
 5   amt                    1048575 non-null  float64
 6   first                  1048575 non-null  object 
 7   last                   1048575 non-null  object 
 8   gender                 1048575 non-null  object 
 9   street                 1048575 non-null  object 
 10  city                   1048575 non-null  object 
 11  state                  1048575 non-null  object 
 12  zip                    1048575 non-null  int64  
 13  lat                    1048575 non-null  float64
 14  lon

In [296]:
obj = (df.dtypes == 'object')
obj_cols = list(obj[obj].index)
print('\nCategorical values: ', len(obj_cols))

int = (df.dtypes == 'int64')
int_cols = list(int[int].index)
print('Integer values: ', len(int_cols))

float = (df.dtypes == 'float')
float_cols = list(float[float].index)
print('Float values: ', len(float_cols))


Categorical values:  12
Integer values:  5
Float values:  6


In [297]:
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
Index                    0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [298]:
df.describe()

Unnamed: 0,Index,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,524287.0,4.171565e+17,70.2791,48801.59,38.53336,-90.22626,89057.76,1344906000.0,38.53346,-90.22648,0.005727773
std,302697.7,1.308811e+18,159.9518,26898.04,5.076852,13.75858,302435.1,10197000.0,5.111233,13.77093,0.07546503
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,262143.5,180040000000000.0,9.64,26237.0,34.6205,-96.798,743.0,1336682000.0,34.72954,-96.89864,0.0
50%,524287.0,3520550000000000.0,47.45,48174.0,39.3543,-87.4769,2456.0,1344902000.0,39.36295,-87.43923,0.0
75%,786430.5,4642260000000000.0,83.05,72042.0,41.9404,-80.158,20328.0,1354366000.0,41.95602,-80.23228,0.0
max,1048574.0,4.99235e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1362932000.0,67.51027,-66.9509,1.0


In [299]:
df.drop(['Index'], axis=1, inplace=True)

In [300]:
print(df.columns)

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


In [301]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
print(df.dtypes['trans_date_trans_time'])
df.head()

datetime64[ns]


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",3/9/1988,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,6/21/1978,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:00,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1/19/1962,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:00,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1/12/1967,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:00,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,3/28/1986,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [302]:
# First ensure date column is datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Extract features
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df['trans_day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + 1  # Adding 1 to make Monday=1
df['trans_year_month'] = df['trans_date_trans_time'].dt.to_period('M')

# Display results
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,trans_year_month
0,2019-01-01 00:00:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,"Psychologist, counselling",3/9/1988,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,0,2,2019-01
1,2019-01-01 00:00:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,Special educational needs teacher,6/21/1978,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,0,2,2019-01
2,2019-01-01 00:00:00,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,Nature conservation officer,1/19/1962,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,0,2,2019-01
3,2019-01-01 00:01:00,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,Patent attorney,1/12/1967,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,0,2,2019-01
4,2019-01-01 00:03:00,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,Dance movement psychotherapist,3/28/1986,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,0,2,2019-01


In [303]:
df.nunique()

trans_date_trans_time     476595
cc_num                       943
merchant                     693
category                      14
amt                        48602
first                        348
last                         479
gender                         2
street                       965
city                         879
state                         51
zip                          952
lat                          950
long                         951
city_pop                     865
job                          493
dob                          950
trans_num                1048575
unix_time                1030650
merch_lat                1016437
merch_long               1034825
is_fraud                       2
trans_hour                    24
trans_day_of_week              7
trans_year_month              15
dtype: int64

In [304]:
# First check if columns exist
print("Columns in dataframe:", df.columns.tolist())

# Convert dates ensuring proper format
df['dob'] = pd.to_datetime(df['dob'])
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Calculate age using timedelta
df['age'] = (df['trans_date_trans_time'].dt.year - df['dob'].dt.year)

# Adjust age for cases where birthday hasn't occurred yet this year
df['age'] = df['age'].apply(lambda x: max(0, x))

# Verify results
print("\nAge calculation preview:")
print(df[['dob', 'trans_date_trans_time', 'age']].head())

Columns in dataframe: ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'trans_hour', 'trans_day_of_week', 'trans_year_month']

Age calculation preview:
         dob trans_date_trans_time  age
0 1988-03-09   2019-01-01 00:00:00   31
1 1978-06-21   2019-01-01 00:00:00   41
2 1962-01-19   2019-01-01 00:00:00   57
3 1967-01-12   2019-01-01 00:01:00   52
4 1986-03-28   2019-01-01 00:03:00   33


In [305]:
df.drop(['trans_date_trans_time','first', 'last', 'dob'] , axis=1, inplace=True)

In [306]:
df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,street,city,state,zip,lat,...,job,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,trans_year_month,age
0,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,...,"Psychologist, counselling",0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,0,2,2019-01,31
1,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,...,Special educational needs teacher,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,0,2,2019-01,41
2,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,...,Nature conservation officer,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,0,2,2019-01,57
3,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,...,Patent attorney,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,0,2,2019-01,52
4,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,...,Dance movement psychotherapist,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,0,2,2019-01,33


In [307]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 22 columns):
 #   Column             Non-Null Count    Dtype    
---  ------             --------------    -----    
 0   cc_num             1048575 non-null  float64  
 1   merchant           1048575 non-null  object   
 2   category           1048575 non-null  object   
 3   amt                1048575 non-null  float64  
 4   gender             1048575 non-null  object   
 5   street             1048575 non-null  object   
 6   city               1048575 non-null  object   
 7   state              1048575 non-null  object   
 8   zip                1048575 non-null  int64    
 9   lat                1048575 non-null  float64  
 10  long               1048575 non-null  float64  
 11  city_pop           1048575 non-null  int64    
 12  job                1048575 non-null  object   
 13  trans_num          1048575 non-null  object   
 14  unix_time          1048575 non-null  int64    
 15

In [308]:
df['category'] = df['category'].astype('category')
df['gender'] = df['gender'].astype('category')
df['is_fraud'] = df['is_fraud'].astype('category')

In [309]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 22 columns):
 #   Column             Non-Null Count    Dtype    
---  ------             --------------    -----    
 0   cc_num             1048575 non-null  float64  
 1   merchant           1048575 non-null  object   
 2   category           1048575 non-null  category 
 3   amt                1048575 non-null  float64  
 4   gender             1048575 non-null  category 
 5   street             1048575 non-null  object   
 6   city               1048575 non-null  object   
 7   state              1048575 non-null  object   
 8   zip                1048575 non-null  int64    
 9   lat                1048575 non-null  float64  
 10  long               1048575 non-null  float64  
 11  city_pop           1048575 non-null  int64    
 12  job                1048575 non-null  object   
 13  trans_num          1048575 non-null  object   
 14  unix_time          1048575 non-null  int64    
 15

In [310]:
df.shape

(1048575, 22)

In [311]:
df.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,trans_hour,trans_day_of_week,age
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,4.171565e+17,70.2791,48801.59,38.53336,-90.22626,89057.76,1344906000.0,38.53346,-90.22648,12.80077,4.135594,45.88899
std,1.308811e+18,159.9518,26898.04,5.076852,13.75858,302435.1,10197000.0,5.111233,13.77093,6.816861,2.200499,17.36625
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0,1.0,14.0
25%,180040000000000.0,9.64,26237.0,34.6205,-96.798,743.0,1336682000.0,34.72954,-96.89864,7.0,2.0,32.0
50%,3520550000000000.0,47.45,48174.0,39.3543,-87.4769,2456.0,1344902000.0,39.36295,-87.43923,14.0,4.0,44.0
75%,4642260000000000.0,83.05,72042.0,41.9404,-80.158,20328.0,1354366000.0,41.95602,-80.23228,19.0,6.0,57.0
max,4.99235e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1362932000.0,67.51027,-66.9509,23.0,7.0,96.0


In [312]:
100 * df.groupby('is_fraud').size() / len(df)

is_fraud
0    99.427223
1     0.572777
dtype: float64

In [313]:
# Basic statistics for all transactions
all_transactions = df['amt'].describe()

# Statistics for non-fraud transactions
non_fraud = df[df['is_fraud']==0]['amt'].describe()

# Statistics for fraud transactions
fraud = df[df['is_fraud']==1]['amt'].describe()

# Combine all statistics in a readable format
comparison = pd.DataFrame({
    'All Transactions': all_transactions,
    'Non-Fraud': non_fraud,
    'Fraud': fraud
})

# Display results
comparison.round(2)  # Round to 2 decimal places for clarity

Unnamed: 0,All Transactions,Non-Fraud,Fraud
count,1048575.0,1042569.0,6006.0
mean,70.28,67.63,530.57
std,159.95,153.7,391.33
min,1.0,1.0,1.18
25%,9.64,9.6,241.58
50%,47.45,47.22,391.16
75%,83.05,82.47,901.95
max,28948.9,28948.9,1371.81


In [314]:
df_timeline01 = df.groupby(df['trans_year_month'])[['trans_num','cc_num']].nunique().reset_index()
df_timeline01.columns = ['year_month','num_of_transactions','customers']
df_timeline01

Unnamed: 0,year_month,num_of_transactions,customers
0,2019-01,52525,893
1,2019-02,49866,898
2,2019-03,70939,896
3,2019-04,68078,893
4,2019-05,72532,890
5,2019-06,86064,888
6,2019-07,86596,890
7,2019-08,87359,891
8,2019-09,70652,893
9,2019-10,68758,891


In [315]:
df_fraud_transactions = df[df['is_fraud']==1]

df_timeline02 = df_fraud_transactions.groupby(df_fraud_transactions['trans_year_month'])[['trans_num','cc_num']].nunique().reset_index()
df_timeline02.columns = ['year_month','num_of_fraud_transactions','fraud_customers']
df_timeline02

Unnamed: 0,year_month,num_of_fraud_transactions,fraud_customers
0,2019-01,506,50
1,2019-02,517,53
2,2019-03,494,49
3,2019-04,376,41
4,2019-05,408,42
5,2019-06,354,35
6,2019-07,331,36
7,2019-08,382,39
8,2019-09,418,44
9,2019-10,454,50


In [316]:
# Ensure no naming conflicts
if 'float' in globals():
    del float

# Define age bins and labels
custom_bins = [13, 19, 32, 42, 50, 62, float('inf')]
custom_labels = ['Teenagers', 'Young Adults', 'Adults', 'Middle-aged', 'Seniors', 'Retired']

# Apply the binning to create a new 'age_category' column
df['age_category'] = pd.cut(df['age'], bins=custom_bins, labels=custom_labels, right=False)

# Display the result
print(df[['age', 'age_category']].tail())

         age  age_category
1048570   77       Retired
1048571   21  Young Adults
1048572   68       Retired
1048573   30  Young Adults
1048574   23  Young Adults


In [317]:
df.sample(5)

Unnamed: 0,cc_num,merchant,category,amt,gender,street,city,state,zip,lat,...,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,trans_year_month,age,age_category
902732,4642260000000000.0,fraud_Wisozk and Sons,misc_pos,7.44,F,320 Nicholson Orchard,Thompson,UT,84540,38.9999,...,276b45cd5eac516256d5cde993b51464,1356627745,38.13455,-109.13387,0,17,5,2019-12,32,Adults
587117,4727240000000000.0,fraud_Crooks and Sons,personal_care,64.95,F,118 Justin Extension,Bay Minette,AL,36507,30.8635,...,f9927917c7418ba38b85b1e99f303eff,1346936602,30.345244,-86.816426,0,13,5,2019-09,90,Retired
762756,3559720000000000.0,fraud_Dickinson Ltd,misc_pos,41.95,M,622 Williams Trafficway,Ringwood,IL,60072,42.4048,...,a024fa86758135cbabfd1b2dca5855c0,1353635346,41.774705,-88.871728,0,1,6,2019-11,34,Adults
479086,3592330000000000.0,fraud_Stamm-Rodriguez,misc_pos,116.47,F,1007 Colton Forks,Hopewell,VA,23860,37.2876,...,4384ab27067e0a334df01e8ff8042000,1343529244,37.065684,-77.106097,0,2,1,2019-07,84,Retired
435256,6011720000000000.0,fraud_Kuhn Group,food_dining,55.59,F,144 Evans Islands Apt. 683,Burbank,WA,99323,46.1966,...,d270462e0a2c39933d5a02c0619881bc,1342217083,46.222382,-119.267012,0,22,6,2019-07,38,Adults


In [318]:
high_trans_states = df.state.value_counts().head(25).index.tolist()
print(high_trans_states)

['TX', 'NY', 'PA', 'CA', 'OH', 'MI', 'IL', 'FL', 'AL', 'MO', 'MN', 'AR', 'NC', 'WI', 'VA', 'SC', 'KY', 'IN', 'IA', 'OK', 'MD', 'GA', 'WV', 'NJ', 'NE']


In [319]:
state_df = 100 * df[df.state.isin(high_trans_states)].state.value_counts(normalize=True)
print(state_df)

state
TX    9.586414
NY    8.416395
PA    8.055206
CA    5.710920
OH    4.675331
MI    4.649969
IL    4.380983
FL    4.334632
AL    4.151102
MO    3.879867
MN    3.210089
AR    3.129505
NC    3.085528
WI    2.966590
VA    2.952347
SC    2.947100
KY    2.874388
IN    2.780186
IA    2.708848
OK    2.684236
MD    2.649879
GA    2.628890
WV    2.596656
NJ    2.491586
NE    2.453355
Name: proportion, dtype: float64


In [320]:
fraud_ratios = []

for state in df['state'].unique():
    total_count = df[df['state'] == state].shape[0]
    fraud_count = df[(df['state'] == state) & (df['is_fraud'] == 1)].shape[0]
    ratio = fraud_count / total_count if total_count > 0 else 0
    fraud_ratios.append({'State': state, 'Fraud Ratio': ratio, 'Total Transactions': total_count})

# Create a DataFrame from the list of dictionaries
fraud_ratio_df = pd.DataFrame(fraud_ratios)

# Print the DataFrame
print(fraud_ratio_df)

   State  Fraud Ratio  Total Transactions
0     NC     0.004778               24697
1     WA     0.004524               15251
2     ID     0.002440                4509
3     MT     0.003385                9454
4     VA     0.006178               23631
5     PA     0.005351               64475
6     KS     0.004407               18606
7     TN     0.007996               14132
8     IA     0.005488               21682
9     WV     0.005100               20784
10    FL     0.007580               34695
11    CA     0.005754               45711
12    NM     0.005258               13314
13    NJ     0.004362               19943
14    OK     0.005259               21485
15    IN     0.004089               22253
16    MA     0.004915                9970
17    TX     0.005096               76731
18    WI     0.005643               23745
19    MI     0.005615               37219
20    WY     0.006076               15636
21    HI     0.003433                2039
22    NE     0.007639             

In [321]:
df_state = df[['state','trans_num']].groupby(['state']).count().reset_index()
df_state.columns = ['state', 'state_count']

#creating the state-fraud distribution
df_fraud_state = df[['state', 'trans_num', 'is_fraud']].groupby(['state','is_fraud']).count().reset_index()
df_fraud_state.columns = ['state', 'is_fraud', 'Transaction count']

df_fraud_state = df_fraud_state.merge(df_state[['state', 'state_count']], how='inner', on='state')

df_fraud_state['Transaction percentage'] = (df_fraud_state['Transaction count']/df_fraud_state['state_count'])*100
#viewing the top 20 states with high fraudulent transactions
df_fraud_state[df_fraud_state['is_fraud'] == 1].sort_values(by = ['Transaction percentage'], ascending=False).head(20)
#states with more than 75% fraudulent transactions
print('state with more than 75% fraudulent transactions:\n')
print(df_fraud_state.loc[(df_fraud_state.is_fraud == 1) & (df_fraud_state['Transaction percentage'] >= 75)].state)

state with more than 75% fraudulent transactions:

17    DE
Name: state, dtype: object


In [322]:
df.category.value_counts(normalize=True)

category
gas_transport     0.101500
grocery_pos       0.095278
home              0.094965
shopping_pos      0.089982
kids_pets         0.087170
shopping_net      0.075244
entertainment     0.072461
food_dining       0.070611
personal_care     0.070093
health_fitness    0.066149
misc_pos          0.061504
misc_net          0.048716
grocery_net       0.035018
travel            0.031309
Name: proportion, dtype: float64

In [323]:
#constructing the category-transaction count distribution
df_category = df[['category','trans_num']].groupby(['category']).count().reset_index()
df_category.columns = ['category', 'category_count']

#creating the zip-fraud distribution
df_fraud_category = df[['category', 'trans_num', 'is_fraud']].groupby(['category','is_fraud']).count().reset_index()
df_fraud_category.columns = ['category', 'is_fraud', 'Transaction count']

df_fraud_category = df_fraud_category.merge(df_category[['category', 'category_count']], how='inner', on='category')

df_fraud_category['Transaction percentage'] = (df_fraud_category['Transaction count']/df_fraud_category['category_count'])*100

#viewing the top categories with high fraudulent transaction volumes
df_fraud_category[df_fraud_category['is_fraud'] == 1].sort_values(by = ['Transaction percentage'], ascending=False)

Unnamed: 0,category,is_fraud,Transaction count,category_count,Transaction percentage
23,shopping_net,1,1375,78899,1.742734
17,misc_net,1,742,51082,1.452566
9,grocery_pos,1,1396,99906,1.397313
25,shopping_pos,1,662,94353,0.701621
5,gas_transport,1,498,106430,0.467913
27,travel,1,100,32830,0.304599
19,misc_pos,1,194,64492,0.300813
7,grocery_net,1,110,36719,0.299572
1,entertainment,1,185,75981,0.243482
21,personal_care,1,172,73498,0.23402


In [324]:
#categories with more than one percent fraudulent transactions
df_fraud_category.loc[(df_fraud_category.is_fraud == 1) & (df_fraud_category['Transaction percentage'] >= 0.1)]

Unnamed: 0,category,is_fraud,Transaction count,category_count,Transaction percentage
1,entertainment,1,185,75981,0.243482
3,food_dining,1,121,74041,0.163423
5,gas_transport,1,498,106430,0.467913
7,grocery_net,1,110,36719,0.299572
9,grocery_pos,1,1396,99906,1.397313
11,health_fitness,1,104,69362,0.149938
13,home,1,153,99578,0.153648
15,kids_pets,1,194,91404,0.212245
17,misc_net,1,742,51082,1.452566
19,misc_pos,1,194,64492,0.300813


In [325]:
df.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'trans_num',
       'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'trans_hour',
       'trans_day_of_week', 'trans_year_month', 'age', 'age_category'],
      dtype='object')

In [326]:
df

Unnamed: 0,cc_num,merchant,category,amt,gender,street,city,state,zip,lat,...,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,trans_year_month,age,age_category
0,2.703190e+15,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,...,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,0,2,2019-01,31,Young Adults
1,6.304230e+11,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,...,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,0,2,2019-01,41,Adults
2,3.885950e+13,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,...,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,0,2,2019-01,57,Seniors
3,3.534090e+15,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,...,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,0,2,2019-01,52,Seniors
4,3.755340e+14,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,...,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,0,2,2019-01,33,Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,6.011980e+15,fraud_Fadel Inc,health_fitness,77.00,F,05561 Farrell Crescent,Annapolis,MD,21405,39.0305,...,45ecd198c65e81e597db22e8d2ef7361,1362931649,38.779464,-76.317042,0,16,2,2020-03,77,Retired
1048571,4.839040e+15,"fraud_Cremin, Hamill and Reichel",misc_pos,116.94,F,043 Hanson Turnpike,Hedrick,IA,52563,41.1826,...,c00ce51c6ebb7657474a77b9e0b51f34,1362931670,41.400318,-92.726724,0,16,2,2020-03,21,Young Adults
1048572,5.718440e+11,"fraud_O'Connell, Botsford and Hand",home,21.27,F,005 Cody Estates,Louisville,KY,40202,38.2507,...,17c9dc8b2a6449ca2473726346e58e6c,1362931711,37.293339,-84.798122,0,16,2,2020-03,68,Retired
1048573,4.646850e+18,fraud_Thompson-Gleason,health_fitness,9.52,F,576 House Crossroad,West Sayville,NY,11796,40.7320,...,5ca650881b48a6a38754f841c23b77ab,1362931718,39.773077,-72.213209,0,16,2,2020-03,30,Young Adults


In [327]:
x =  [ 'amt', 'trans_hour', 'age','is_fraud', 'zip']
df = pd.get_dummies(df[x], drop_first=True)
df.columns

Index(['amt', 'trans_hour', 'age', 'zip', 'is_fraud_1'], dtype='object')

In [328]:
# Define predictors without city_tier
predictors = ['amt', 'trans_hour', 'age', 'zip']

# partition data
X = df.loc[:, predictors]
y = df.loc[:, 'is_fraud_1']

In [329]:
RANDOM_STATE = 42
TEST_SIZE = 0.4
PROBABILITY_CUTOFF = 0.7

In [330]:
X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=TEST_SIZE, 
            random_state=RANDOM_STATE,
            stratify=y
        )

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
        
# Apply undersampling
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

In [331]:
# Create a copy of X to avoid modifying the original data
X_encoded = X.copy()

# Encode the categorical 'city_tier' column
label_encoder = LabelEncoder()
# X_encoded = label_encoder.fit_transform(X_encoded)
# Use the created 'city_tier' column for encoding
X_encoded = X_encoded[['amt', 'trans_hour', 'age', 'zip']]

# Scale numerical features
scaler = StandardScaler()
numerical_features = ['amt', 'trans_hour', 'age', 'zip']
X_encoded[numerical_features] = scaler.fit_transform(X_encoded[numerical_features])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, 
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)

# Apply undersampling to balance the classes
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Define XGBoost parameters
xgb_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}

# Split the data
xgb_classifier = GridSearchCV(
    XGBClassifier(
        random_state=RANDOM_STATE, 
        use_label_encoder=False, 
        eval_metric='logloss',
        enable_categorical=True
    ),
    xgb_params,
    cv=5,
    scoring='f1'
)
xgb_classifier = GridSearchCV(
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    xgb_params,
    cv=5,
    scoring='f1'
)
xgb_classifier = GridSearchCV(
    XGBClassifier(random_state=42),
    xgb_params,
    cv=5,
    scoring='f1'
)

rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
ranfor_classifier = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='f1'
)

dt_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
dt_classifier = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_params,
    cv=5,
    scoring='f1'
)

# Train models on resampled data
classifiers = {
    'XGBoost': xgb_classifier,
    'Random Forest': ranfor_classifier,
    'Decision Tree': dt_classifier
}

results = {}
feature_importances = {}
predictions = {}
probabilities = {}

for name, clf in classifiers.items():
    # Train and get best model
    clf.fit(X_train_resampled, y_train_resampled)
    best_model = clf.best_estimator_
    
    # Make predictions
    predictions[name] = best_model.predict(X_test)
    probabilities[name] = best_model.predict_proba(X_test)
    
    # Get feature importances
    feature_importances[name] = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    
    # Calculate metrics
    results[name] = {
        'Accuracy': accuracy_score(y_test, predictions[name]),
        'Precision': precision_score(y_test, predictions[name]),
        'Recall': recall_score(y_test, predictions[name]),
        'F1': f1_score(y_test, predictions[name]),
        'ROC AUC': roc_auc_score(y_test, probabilities[name][:, 1]),
        'Cross Val Score': np.mean(cross_val_score(best_model, X_train_resampled, y_train_resampled, cv=5)),
        'Best Parameters': clf.best_params_
    }
    
    # Print detailed results
    print(f"\n{name} Classifier:")
    print(f"Best Parameters: {clf.best_params_}")
    print(f"Cross-validation score: {results[name]['Cross Val Score']:.4f}")
    print("\nFeature Importance:")
    print(feature_importances[name].head())
    print("\nClassification Report:")
    print(classification_report(y_test, predictions[name]))

# Create predictions DataFrame with probability cutoff
cutoff = 0.7
for name in classifiers.keys():
    df_name = f"{name.lower().replace(' ', '_')}_results"
    globals()[df_name] = pd.DataFrame({
        'actual': y_test,
        'p(0)': probabilities[name][:, 0],
        'p(1)': probabilities[name][:, 1],
        'predicted': predictions[name],
        'predicted_with_cutoff': (probabilities[name][:, 1] > cutoff).astype(int)
    })
    
    # Show random samples
    random_indices = np.random.choice(globals()[df_name].index, size=5, replace=False)
    print(f"\n{name} - Random sample of predictions (cutoff = {cutoff}):")
    print(globals()[df_name].loc[random_indices])

# Create final comparison DataFrame
final_data = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy (%)': [results[model]['Accuracy'] * 100 for model in results],
    'Precision (%)': [results[model]['Precision'] * 100 for model in results],
    'Recall (%)': [results[model]['Recall'] * 100 for model in results],
    'F1 Score': [results[model]['F1'] for model in results],
    'ROC AUC': [results[model]['ROC AUC'] for model in results],
    'CV Score': [results[model]['Cross Val Score'] for model in results]
}).sort_values('F1 Score', ascending=False)

print("\nFinal Model Comparison:")
print(final_data)

# Find the best model based on F1 score
best_model_name = final_data.iloc[0]['Model']
print(f"\nBest performing model: {best_model_name}")
print("Best parameters:", results[best_model_name]['Best Parameters'])


XGBoost Classifier:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Cross-validation score: 0.9345

Feature Importance:
      Feature  Importance
0         amt    0.656317
1  trans_hour    0.248136
2         age    0.057647
3         zip    0.037900

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.94      0.97    417045
        True       0.09      0.93      0.16      2385

    accuracy                           0.94    419430
   macro avg       0.54      0.94      0.56    419430
weighted avg       0.99      0.94      0.97    419430


Random Forest Classifier:
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Cross-validation score: 0.9358

Feature Importance:
      Feature  Importance
0         amt    0.682039
1  trans_hour    0.222393
2         age    0.049015
3         zip    0.046552

Classification Report:
              precision    recall  f1-score   support

 

In [332]:
def train_models(X, y):
    """
    Train multiple classification models with proper error handling and data preparation.
    
    Parameters:
    -----------
    X : pd.DataFrame or np.array
        Feature matrix
    y : pd.Series or np.array
        Target variable
        
    Returns:
    --------
    tuple:
        - results: dict
            Dictionary containing evaluation metrics for each model
        - predictions: dict
            Dictionary containing predictions for each model
        - feature_importance: dict
            Dictionary containing feature importance for each model
        - X_test_scaled: np.array
            Scaled test features
        - y_test: np.array
            Test target values
        - scaler: StandardScaler
            Fitted scaler object
        - classifiers: dict
            Dictionary containing trained classifier objects
    
    Raises:
    -------
    ValueError
        If input data is empty or contains invalid values
    """
    # Input validation
    if X is None or y is None:
        raise ValueError("Input data cannot be None")
    if len(X) == 0 or len(y) == 0:
        raise ValueError("Input data cannot be empty")
    if X.shape[0] != len(y):
        raise ValueError("X and y must have the same number of samples")
    
    try:
        # Initial train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=TEST_SIZE, 
            random_state=RANDOM_STATE,
            stratify=y
        )

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Apply undersampling
        undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=RANDOM_STATE)
        X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

        # Initialize classifiers with basic parameters
        classifiers = {
            'xgb': XGBClassifier(
                random_state=RANDOM_STATE,
                n_estimators=100,
                learning_rate=0.1
            ),
            'ranfor': RandomForestClassifier(
                random_state=RANDOM_STATE,
                n_estimators=100,
                n_jobs=-1
            ),
            'dt': DecisionTreeClassifier(
                random_state=RANDOM_STATE,
                max_depth=None
            )
        }

        # Initialize result dictionaries
        results = {}
        predictions = {}
        probabilities = {}
        feature_importance = {}

        # Train and evaluate each model
        for name, clf in classifiers.items():
            try:
                # Train model
                clf.fit(X_train_resampled, y_train_resampled)
                
                # Generate predictions and probabilities
                predictions[name] = clf.predict(X_test_scaled)
                probabilities[name] = clf.predict_proba(X_test_scaled)
                
                # Calculate feature importance
                feature_importance[name] = pd.DataFrame({
                    'Feature': X.columns,
                    'Importance': clf.feature_importances_
                }).sort_values(by='Importance', ascending=False)

                # Calculate comprehensive metrics
                results[name] = {
                    'accuracy': accuracy_score(y_test, predictions[name]),
                    'precision': precision_score(y_test, predictions[name]),
                    'recall': recall_score(y_test, predictions[name]),
                    'f1': f1_score(y_test, predictions[name]),
                    'roc_auc': roc_auc_score(y_test, probabilities[name][:, 1]),
                    'confusion_matrix': confusion_matrix(y_test, predictions[name]).tolist(),
                    'report': classification_report(y_test, predictions[name]),
                    'cv_scores': cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5).tolist()
                }
                
            except Exception as model_error:
                print(f"Error training {name} classifier: {str(model_error)}")
                results[name] = None
                predictions[name] = None
                feature_importance[name] = None

        return {
            'results': results,
            'predictions': predictions,
            'feature_importance': feature_importance,
            'X_test_scaled': X_test_scaled,
            'y_test': y_test,
            'scaler': scaler,
            'classifiers': classifiers
        }

    except Exception as e:
        print(f"Fatal error in model training pipeline: {str(e)}")
        raise RuntimeError(f"Model training failed: {str(e)}")

In [333]:
def load_and_prepare_data(data, predictors):
    """
    Prepare data with error handling and validation.
    
    Parameters:
    -----------
    data : pd.DataFrame
        Input DataFrame containing all features
    predictors : list
        List of feature names to use
        
    Returns:
    --------
    pd.DataFrame or None
        Prepared feature matrix
    """
    try:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame")
            
        if not isinstance(predictors, list):
            raise ValueError("Predictors must be a list")
            
        if not all(feature in data.columns for feature in predictors):
            missing_features = [f for f in predictors if f not in data.columns]
            raise ValueError(f"Missing features in dataset: {missing_features}")
        
        prepared_data = data[predictors].copy()
        
        # Check for missing values
        if prepared_data.isnull().any().any():
            raise ValueError("Dataset contains missing values")
            
        return prepared_data
    except Exception as e:
        print(f"Error in data preparation: {str(e)}")
        return None

In [334]:
def get_probability_predictions(classifier, X_test, y_test, cutoff=PROBABILITY_CUTOFF):
    """
    Get probability predictions with proper error handling.
    
    Parameters:
    -----------
    classifier : sklearn classifier object
        Trained classifier
    X_test : np.array
        Test features
    y_test : np.array
        Test labels
    cutoff : float
        Probability threshold for classification
        Returns:
    --------
    pd.DataFrame or None
        DataFrame containing predictions and probabilities
    """
    try:
        if not hasattr(classifier, 'predict_proba'):
            raise ValueError("Classifier must support probability predictions")
            
        probabilities = classifier.predict_proba(X_test)
        predictions = classifier.predict(X_test)
        
        results_df = pd.DataFrame({
            'actual': y_test,
            'p(0)': probabilities[:, 0],
            'p(1)': probabilities[:, 1],
            'predicted': predictions,
            'predicted_with_cutoff': (probabilities[:, 1] > cutoff).astype(int)
        })
        
        # Add evaluation metrics
        results_df['correct'] = results_df['actual'] == results_df['predicted_with_cutoff']
        
        return results_df
    
    except Exception as e:
        print(f"Error in probability prediction: {str(e)}")
        return None

In [335]:
def predict_transaction(transaction_data, model_path="models/credit_card_model.pkl"):
    """
    Predict if a transaction is fraudulent.
    
    Parameters:
    -----------
    transaction_data : list or np.ndarray
        Features of the transaction to predict
    model_path : str
        Path to the saved model file
        
    Returns:
    --------
    dict or None
        Prediction results including classification and probability
    """
    try:
        if not isinstance(transaction_data, (list, np.ndarray)):
            raise ValueError("Transaction data must be a list or numpy array")
            
        model_dict = joblib.load(model_path)
        
        # Validate loaded model
        required_keys = ['model', 'scaler', 'features']
        if not all(key in model_dict for key in required_keys):
            raise ValueError("Invalid model file format")
            
        model = model_dict['model']
        scaler = model_dict['scaler']
        expected_features = len(model_dict['features'])
        
        # Validate input dimensions
        if len(transaction_data) != expected_features:
            raise ValueError(f"Expected {expected_features} features, got {len(transaction_data)}")
        
        # Scale the input data
        scaled_data = scaler.transform([transaction_data])
        
        # Get predictions
        prediction = model.predict(scaled_data)
        probability = model.predict_proba(scaled_data)[0][1]
        
        return {
            'prediction': "Fraud Transaction" if prediction[0] == 1 else "Normal Transaction",
            'probability': float(probability),  # Convert to native Python float
            'is_fraud': bool(prediction[0]),
            'confidence': "High" if abs(probability - 0.5) > 0.3 else "Low"
        }
        
    except FileNotFoundError:
        print(f"Model file not found at {model_path}")
        return None
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None

In [336]:
def check_transaction(transaction_data, model=None, scaler=None):
    """
    Quick transaction check returning simple safe/fraud response.
    
    Parameters:
    -----------
    transaction_data : list or np.ndarray
        Features of the transaction to check
    model : sklearn classifier object, optional
        Pre-trained model (if None, loads from file)
    scaler : StandardScaler object, optional
        Fitted scaler (if None, loads from file)
        
    Returns:
    --------
    str
     "SAFE" or "FRAUD"
    """
    try:
        if model is None or scaler is None:
            model_dict = joblib.load('models/credit_card_model.pkl')
            model = model_dict['model']
            scaler = model_dict['scaler']
        
        # Scale the input data
        scaled_data = scaler.transform([transaction_data])
        
        # Get prediction
        prediction = model.predict(scaled_data)[0]
        
        return "SAFE" if prediction == 0 else "FRAUD"
        
    except Exception as e:
        print(f"Error in transaction check: {str(e)}")
        return "ERROR"

In [337]:
def main():
    """
    Main execution function with proper error handling and execution flow.
    """
    try:
        # Define predictors (customize based on your dataset)
        predictors = [
            'amt', 'zip', 'trans_hour', 'age'
        ]
        
        # Load and prepare data (assuming df is your dataset)
        X = load_and_prepare_data(df, predictors)
        if X is None:
            raise ValueError("Data preparation failed")
            
        # Get target variable (assuming 'fraud' is your target column)
        y = df['fraud']
        # Train models and get results
        results, predictors, feature_importance, X_test, y_test, scaler, classifiers = train_models(X, y)
        if results is None:
            raise ValueError("Model training failed")
            
        # Get best model based on accuracy and recall
        model_scores = {
            name: (metrics['accuracy'] + metrics['recall']) / 2  # Combined metric
            for name, metrics in results.items()
        }
        best_model_name = max(model_scores, key=model_scores.get)
        best_model = classifiers[best_model_name]
        
        # Get detailed predictions for best model
        best_model_predictions = get_probability_predictions(
            best_model, 
            X_test, 
            y_test, 
            cutoff=PROBABILITY_CUTOFF
        )
        
        # Save best model with scaler and metadata
        joblib.dump({
            'model': best_model,
            'scaler': scaler,
            'features': predictors,
            'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
            'model_type': best_model_name,
            'metrics': results[best_model_name]
        }, 'credit_card_model.pkl')

        # Print results
        print("\nModel Performance Summary:")
        for name, metrics in results.items():
            print(f"\n{name} Classifier:")
            print(f"Accuracy: {metrics['accuracy']:.4f}")
            print(f"Recall: {metrics['recall']:.4f}")
            print("\nFeature Importance:")
            print(feature_importance[name].head())
            print("\nClassification Report:")
            print(metrics['report'])
            
        print(f"\nBest Model: {best_model_name}")
        
        return best_model, scaler
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        return None, None

In [338]:
def check_transaction(transaction_data, model_path="models/credit_card_model.pkl"):
    """
    Simply checks if a transaction is safe or fraudulent.
    
    Parameters:
    -----------
    transaction_data : list or array-like
        Transaction features in order: [amt, city_tier, trans_hour, age]
    model_path : str
        Path to saved model file
        
    Returns:
    --------
    str
        "SAFE" or "FRAUD"
    """
    try:
        # Load the saved model and scaler
        model_dict = joblib.load(model_path)
        model = model_dict['model']
        scaler = model_dict['scaler']
        
        # Validate input
        if len(transaction_data) != len(model_dict['features']):
            raise ValueError(f"Expected {len(model_dict['features'])} features, got {len(transaction_data)}")
        
        # Scale the input data using the loaded scaler
        scaled_data = scaler.transform([transaction_data])
        
        # Get prediction
        prediction = model.predict(scaled_data)[0]
        
        return "SAFE" if prediction == 0 else "FRAUD"
        
    except Exception as e:
        print(f"Error in check_transaction: {str(e)}")
        return "ERROR"

In [339]:
def test_transaction(model_path="models/credit_card_model.pkl"):
    """
    Test multiple transactions with proper feature formatting.
    Features must match the order: ['amt', 'zip', 'trans_hour', 'age']
    """
    try:
        # Load the saved model and scaler
        model_dict = joblib.load(model_path)
        model = model_dict['model']
        scaler = model_dict['scaler']
        features = model_dict['features']
        
        # Test transactions
        transactions = [
            # [amount, zip, transaction_hour, age]
            [100.0, 19238, 14, 35],      # Normal transaction
            [15000.0, 263972, 3, 25],      # Suspicious transaction
            [500.0, 234321, 12, 45],     # Normal transaction
            [25000.0, 31397, 2, 22]       # Suspicious transaction
        ]
        
        # Create a DataFrame with proper column names
        test_df = pd.DataFrame(transactions, columns=['amt', 'zip', 'trans_hour', 'age'])
        
        # Scale the data using the loaded scaler
        test_data_scaled = scaler.transform(test_df)
        
        # Process each transaction
        results = []
        for i, trans in enumerate(test_data_scaled, 1):
            # Get prediction probability
            prob = model.predict_proba([trans])[0]
            prediction = "SUSPICIOUS" if prob[1] > PROBABILITY_CUTOFF else "SAFE"
            
            results.append({
                'transaction_id': i,
                'features': dict(zip(features, transactions[i-1])),
                'prediction': prediction,
                'fraud_probability': prob[1]
            })
        
        # Print results
        print("\nTransaction Test Results:")
        print("-" * 50)
        for result in results:
            print(f"\nTransaction {result['transaction_id']}:")
            print(f"Amount: ${result['features']['amt']:,.2f}")
            print(f"City zip code: {result['features']['zip']:,}")
            print(f"Hour: {result['features']['trans_hour']:02d}:00")
            print(f"Age: {result['features']['age']}")
            print(f"Prediction: {result['prediction']}")
            print(f"Fraud Probability: {result['fraud_probability']:.2%}")
            
        return results
    
    except FileNotFoundError:
        print(f"Model file not found at {model_path}")
        return None
    except Exception as e:
        print(f"Error in test_transaction: {str(e)}")
        return None


In [340]:
# Example usage:
if __name__ == "__main__":
    # Test a single transaction
    transaction = [100.0,12323, 14, 35]
    result = check_transaction(transaction)
    print(f"\nSingle Transaction Test:")
    print(f"Result: {result}")
    
    # Test multiple transactions
    print("\nMultiple Transactions Test:")
    test_results = test_transaction()


Single Transaction Test:
Result: SAFE

Multiple Transactions Test:
Error in test_transaction: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- zip
Feature names seen at fit time, yet now missing:
- city_pop

