In [116]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [117]:
train_data = pd.read_csv('fraudTrain.csv')

In [118]:
train_data.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [90]:
train_data["state"].unique()

array(['NC', 'WA', 'ID', 'MT', 'VA', 'PA', 'KS', 'TN', 'IA', 'WV', 'FL',
       'CA', 'NM', 'NJ', 'OK', 'IN', 'MA', 'TX', 'WI', 'MI', 'WY', 'HI',
       'NE', 'OR', 'LA', 'DC', 'KY', 'NY', 'MS', 'UT', 'AL', 'AR', 'MD',
       'GA', 'ME', 'AZ', 'MN', 'OH', 'CO', 'VT', 'MO', 'SC', 'NV', 'IL',
       'NH', 'SD', 'AK', 'ND', 'CT', 'RI', 'DE'], dtype=object)

In [119]:
cols = train_data.columns

In [120]:
'''Step-1: Segragating data-time, categorical, numerical, and output features and removing  
Nominal, meaning less, complicated ones '''
date_features = [cols[1], cols[19]]
cat_features = [cols[2], cols[4], cols[8], cols[10], cols[11], cols[12], cols[16]]
num_features = [cols[5], cols[13], cols[14], cols[15], cols[17], cols[20], cols[21]  ]
output_features = [cols[22]]

# Concatenate the feature lists into a single list
all_features = date_features + cat_features + num_features + output_features
df =  train_data[all_features]
print(all_features)

['trans_date_trans_time', 'unix_time', 'cc_num', 'category', 'gender', 'city', 'state', 'zip', 'job', 'amt', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud']


In [121]:
train_dt =  train_data[all_features]

In [122]:
train_dt.columns

Index(['trans_date_trans_time', 'unix_time', 'cc_num', 'category', 'gender',
       'city', 'state', 'zip', 'job', 'amt', 'lat', 'long', 'city_pop', 'dob',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [123]:
(train_dt).isna().sum()

trans_date_trans_time    0
unix_time                0
cc_num                   0
category                 0
gender                   0
city                     0
state                    0
zip                      0
job                      0
amt                      0
lat                      0
long                     0
city_pop                 0
dob                      0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [124]:
train_dt.dtypes

trans_date_trans_time     object
unix_time                  int64
cc_num                     int64
category                  object
gender                    object
city                      object
state                     object
zip                        int64
job                       object
amt                      float64
lat                      float64
long                     float64
city_pop                   int64
dob                       object
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [125]:
from math import sin, cos, sqrt, atan2, radians

def haversine(lat1, lon1, lat2, lon2):
    R = 3958.8  # Earth radius in miles

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Calculate the distance using the Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c

    return distance

In [126]:
def new_features(data):
    df = data.copy()
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date'] = df['trans_date_trans_time'].dt.date
    df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
    df['trans_hour'] = df['trans_date_trans_time'].dt.hour
    df['avg_category_amt'] = df.groupby('category')['amt'].transform('mean')
    df['amt_above_avg_category_amt'] = (df['amt'] > df['avg_category_amt']).astype(int)
    df = df.sort_values(['cc_num', 'trans_date_trans_time'], ascending=[True, True])
    df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds()
    df['trans_freq'] = df.groupby('cc_num')['trans_date_trans_time'].transform('count')
    df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].transform(lambda x: x.max() - x)
    df = df.sort_index()
    df['distance'] = df.apply(lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)
    df['time_since_last_trans'] = df['time_since_last_trans'].dt.total_seconds()
    df['days_since_last_trans'], df['hours_since_last_trans'] = divmod(df['time_since_last_trans'], 86400)
    df['hours_since_last_trans'] /= 3600
    df['days_since_last_trans'] = df['days_since_last_trans'].astype(int)
    df['hours_since_last_trans'] = df['hours_since_last_trans'].astype(int)
    train_dt['dob'] = train_dt['dob'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    date  = pd.to_datetime(df['trans_date'])
    df['age']    = ( date- train_dt['dob']).dt.days.astype(int)
    return df


In [127]:
engineered_data_train = pd.DataFrame()
engineered_data_train = new_features(train_dt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dt['dob'] = train_dt['dob'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))


In [128]:
engineered_data_train.head(10)

Unnamed: 0,trans_date_trans_time,unix_time,cc_num,category,gender,city,state,zip,job,amt,...,day_of_week,trans_hour,avg_category_amt,amt_above_avg_category_amt,time_since_last_trans,trans_freq,distance,days_since_last_trans,hours_since_last_trans,age
0,2019-01-01 00:00:18,1325376018,2703186189652095,misc_net,F,Moravian Falls,NC,28654,"Psychologist, counselling",4.97,...,Tuesday,0,80.865095,0,46418877.0,2028,48.838809,537,6,11255
1,2019-01-01 00:00:44,1325376044,630423337322,grocery_pos,F,Orient,WA,99160,Special educational needs teacher,107.23,...,Tuesday,0,116.960986,0,46425509.0,3030,18.773185,537,7,14804
2,2019-01-01 00:00:51,1325376051,38859492057661,entertainment,M,Malad City,ID,83252,Nature conservation officer,220.11,...,Tuesday,0,64.210421,1,46336689.0,503,67.236892,536,7,20801
3,2019-01-01 00:01:16,1325376076,3534093764340240,gas_transport,M,Boulder,MT,59632,Patent attorney,45.0,...,Tuesday,0,63.434572,0,46230446.0,493,59.449252,535,1,18982
4,2019-01-01 00:03:06,1325376186,375534208663984,misc_pos,M,Doe Hill,VA,24433,Dance movement psychotherapist,41.96,...,Tuesday,0,62.890999,0,46409545.0,2017,48.192064,537,3,11967
5,2019-01-01 00:04:08,1325376248,4767265376804500,gas_transport,F,Dublin,PA,18917,Transport planner,94.63,...,Tuesday,0,63.434572,1,46359999.0,552,53.39045,536,13,21015
6,2019-01-01 00:04:42,1325376282,30074693890476,grocery_net,F,Holcomb,KS,67851,Arboriculturist,44.54,...,Tuesday,0,53.670084,0,46432714.0,2057,73.397044,537,9,9269
7,2019-01-01 00:05:08,1325376308,6011360759745864,gas_transport,M,Edinburg,VA,22824,"Designer, multimedia",71.65,...,Tuesday,0,63.434572,1,46427402.0,1044,7.933086,537,8,26066
8,2019-01-01 00:05:18,1325376318,4922710831011201,misc_pos,F,Manor,PA,15665,Public affairs consultant,4.27,...,Tuesday,0,62.890999,0,46316220.0,1009,15.702532,536,1,28424
9,2019-01-01 00:06:01,1325376361,2720830304681674,grocery_pos,F,Clarksville,TN,37040,Pathologist,198.39,...,Tuesday,0,116.960986,1,46383751.0,1042,46.030293,536,20,16350


In [129]:
engineered_data_train.columns

Index(['trans_date_trans_time', 'unix_time', 'cc_num', 'category', 'gender',
       'city', 'state', 'zip', 'job', 'amt', 'lat', 'long', 'city_pop', 'dob',
       'merch_lat', 'merch_long', 'is_fraud', 'trans_date', 'day_of_week',
       'trans_hour', 'avg_category_amt', 'amt_above_avg_category_amt',
       'time_since_last_trans', 'trans_freq', 'distance',
       'days_since_last_trans', 'hours_since_last_trans', 'age'],
      dtype='object')

In [130]:
# Drop irrelevant columns
engineered_data_train = engineered_data_train.drop(['trans_date_trans_time', 'unix_time', 'cc_num','city', 'zip', 'lat', 'long', 'merch_lat', 'merch_long','avg_category_amt','dob'], axis=1)


In [131]:
# define a function to encode time difference into categories
def encode_time_diff(time_diff):
    if time_diff <= pd.Timedelta(weeks=8):
        return 'recent'
    elif time_diff <= pd.Timedelta(weeks=20):
        return 'moderate'
    else:
        return 'long ago'

In [132]:
def encode_features(data):
    df = data.copy()
    region_dict = {'AL': 'South', 'AK': 'West', 'AZ': 'West', 'AR': 'South',
                     'CA': 'West', 'CO': 'West', 'CT': 'Northeast', 'DE': 'South',
                     'FL': 'South', 'GA': 'South', 'HI': 'Pacific','DC':'Northeast', 'ID': 'West',
                     'IL': 'Midwest', 'IN': 'Midwest', 'IA': 'Midwest', 'KS': 'Midwest',
                     'KY': 'South', 'LA': 'South', 'ME': 'Northeast', 'MD': 'South',
                     'MA': 'Northeast', 'MI': 'Midwest', 'MN': 'Midwest', 'MS': 'South',
                     'MO': 'Midwest', 'MT': 'West', 'NE': 'Midwest', 'NV': 'West',
                     'NH': 'Northeast', 'NJ': 'Northeast', 'NM': 'West', 'NY': 'Northeast',
                     'NC': 'South', 'ND': 'Midwest', 'OH': 'Midwest', 'OK': 'South',
                     'OR': 'West', 'PA': 'Northeast', 'RI': 'Northeast', 'SC': 'South',
                     'SD': 'Midwest', 'TN': 'South', 'TX': 'South', 'UT': 'West',
                     'VT': 'Northeast', 'VA': 'South', 'WA': 'West', 'WV': 'South',
                     'WI': 'Midwest', 'WY': 'West'}
    
    week_dict = {'Monday': 'weekday', 
                 'Tuesday': 'weekday', 
                 'Wednesday': 'weekday', 
                 'Thursday': 'weekday', 
                 'Friday': 'weekday',
               'Saturday': 'weekend',
                'Sunday': 'weekend'}
    df['day_of_week'] = df['day_of_week'].map(week_dict)
    
    # Use the dictionary to map the states to their regions
    df['region'] = df['state'].map({state: region for region, states in region_dict.items() for state in states})

    df['region'] = df['state'].map(region_dict)
    df['gender'] = df['gender'].map({'F': 0, 'M': 1})
    df['category'] = pd.factorize(df['category'])[0]
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    RN_date = datetime.now()
    df['time_diff'] = (RN_date - df['trans_date'])
    df['trans_date_encoded'] = df['time_diff'].apply(encode_time_diff)
    # create bins for the different times of day
    bins = [0, 12, 18, 24]
    labels = ['morning', 'afternoon', 'evening']
    # use the cut() function to encode the 'trans_time_of_day' column
    df['trans_hour'] = pd.cut(pd.to_datetime(df['trans_hour']).dt.hour, bins=bins, labels=labels, include_lowest=True)
    return df

In [133]:
encoded_engineered_data_train = pd.DataFrame()
encoded_engineered_data_train = encode_features(engineered_data_train)
# drop: state, job, time_diff, trans_date
#encoded_engineered_data = encoded_engineered_data.drop(['state', 'job', 'time_diff','trans_date'],axis=1)

In [135]:
encoded_engineered_data_train = encoded_engineered_data_train.drop(['state', 'job', 'time_diff','trans_date'],axis=1)

In [136]:
(encoded_engineered_data_train).isna().sum()

category                      0
gender                        0
amt                           0
city_pop                      0
is_fraud                      0
day_of_week                   0
trans_hour                    0
amt_above_avg_category_amt    0
time_since_last_trans         0
trans_freq                    0
distance                      0
days_since_last_trans         0
hours_since_last_trans        0
age                           0
region                        0
trans_date_encoded            0
dtype: int64

In [137]:
encoded_engineered_data_train["region"].unique()

array(['South', 'West', 'Northeast', 'Midwest', 'Pacific'], dtype=object)

In [138]:
encoded_engineered_data_train.head(10)

Unnamed: 0,category,gender,amt,city_pop,is_fraud,day_of_week,trans_hour,amt_above_avg_category_amt,time_since_last_trans,trans_freq,distance,days_since_last_trans,hours_since_last_trans,age,region,trans_date_encoded
0,0,0,4.97,3495,0,weekday,morning,0,46418877.0,2028,48.838809,537,6,11255,South,long ago
1,1,0,107.23,149,0,weekday,morning,0,46425509.0,3030,18.773185,537,7,14804,West,long ago
2,2,1,220.11,4154,0,weekday,morning,1,46336689.0,503,67.236892,536,7,20801,West,long ago
3,3,1,45.0,1939,0,weekday,morning,0,46230446.0,493,59.449252,535,1,18982,West,long ago
4,4,1,41.96,99,0,weekday,morning,0,46409545.0,2017,48.192064,537,3,11967,South,long ago
5,3,0,94.63,2158,0,weekday,morning,1,46359999.0,552,53.39045,536,13,21015,Northeast,long ago
6,5,0,44.54,2691,0,weekday,morning,0,46432714.0,2057,73.397044,537,9,9269,Midwest,long ago
7,3,1,71.65,6018,0,weekday,morning,1,46427402.0,1044,7.933086,537,8,26066,South,long ago
8,4,0,4.27,1472,0,weekday,morning,0,46316220.0,1009,15.702532,536,1,28424,Northeast,long ago
9,1,0,198.39,151785,0,weekday,morning,1,46383751.0,1042,46.030293,536,20,16350,South,long ago


In [139]:
encoded_engineered_data_train.dtypes

category                         int64
gender                           int64
amt                            float64
city_pop                         int64
is_fraud                         int64
day_of_week                     object
trans_hour                    category
amt_above_avg_category_amt       int64
time_since_last_trans          float64
trans_freq                       int64
distance                       float64
days_since_last_trans            int64
hours_since_last_trans           int64
age                              int64
region                          object
trans_date_encoded              object
dtype: object

category                         : To which category payment is made

gender                           : Male=>1, Female=>0

amt                              : Amount of transaction made

city_pop                         : Population of city in which transaction made

is_fraud                         : Final outcome to be predicted

day_of_week                      : Weekday or weekend in which tranaction made

trans_hour                       : Morning, Afternoon or evening the tranaction made
amt_above_avg_category_amt       : Is the transaction amount is greater than average amount of that particular card

time_since_last_trans            : For that particular card, how much days has beed last transaction made

trans_freq                       : How many tranasctions made for that particular card

distance                         : Distance B/W 

days_since_last_trans            : Days since last transaction made

hours_since_last_trans           : Hours since last transaction made 

age                              : Age of that card holder

region                           : In which region, the transaction made

trans_date_encoded               : WRT to today, what is the status of transaction

In [140]:
dummy = encoded_engineered_data_train.copy()

In [141]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

dummy['day_of_week'] = le.fit_transform(dummy['day_of_week'])
dummy['trans_hour'] = le.fit_transform(dummy['trans_hour'])
dummy['region'] = le.fit_transform(dummy['region'])
dummy['time_since_last_trans'] = le.fit_transform(dummy['time_since_last_trans'])
dummy['days_since_last_trans'] = le.fit_transform(dummy['days_since_last_trans'])
dummy['hours_since_last_trans'] = le.fit_transform(dummy['hours_since_last_trans'])
dummy['trans_date_encoded'] = le.fit_transform(dummy['trans_date_encoded'])

In [142]:
print(dummy.dtypes)

category                        int64
gender                          int64
amt                           float64
city_pop                        int64
is_fraud                        int64
day_of_week                     int64
trans_hour                      int64
amt_above_avg_category_amt      int64
time_since_last_trans           int64
trans_freq                      int64
distance                      float64
days_since_last_trans           int64
hours_since_last_trans          int64
age                             int64
region                          int64
trans_date_encoded              int64
dtype: object


In [143]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [149]:
X_train = dummy.drop('is_fraud', axis=1)

In [150]:
X_train['time_since_last_trans'].unique()

array([1274980, 1275065, 1273151, ...,      49,       6,      17])

In [151]:
X_train

Unnamed: 0,category,gender,amt,city_pop,day_of_week,trans_hour,amt_above_avg_category_amt,time_since_last_trans,trans_freq,distance,days_since_last_trans,hours_since_last_trans,age,region,trans_date_encoded
0,0,0,4.97,3495,0,0,0,1274980,2028,48.838809,537,6,11255,3,0
1,1,0,107.23,149,0,0,0,1275065,3030,18.773185,537,7,14804,4,0
2,2,1,220.11,4154,0,0,1,1273151,503,67.236892,536,7,20801,4,0
3,3,1,45.00,1939,0,0,0,1271597,493,59.449252,535,1,18982,4,0
4,4,1,41.96,99,0,0,0,1274854,2017,48.192064,537,3,11967,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2,1,15.56,258,1,0,0,0,1513,74.411357,0,0,21394,4,0
1296671,8,1,51.70,100,1,0,1,0,531,46.668035,0,0,14803,3,0
1296672,8,1,105.93,899,1,0,1,0,2070,61.546094,0,0,19289,4,0
1296673,8,1,74.90,1126,1,0,1,0,2024,52.585771,0,0,14552,0,0


In [152]:
nb_model = GaussianNB()

# fit the model on the training data
nb_model.fit(X_train, dummy['is_fraud'])

GaussianNB()

In [148]:
print(encoded_engineered_data_train.dtypes)

category                         int64
gender                           int64
amt                            float64
city_pop                         int64
is_fraud                         int64
day_of_week                     object
trans_hour                    category
amt_above_avg_category_amt       int64
time_since_last_trans          float64
trans_freq                       int64
distance                       float64
days_since_last_trans            int64
hours_since_last_trans           int64
age                              int64
region                          object
trans_date_encoded              object
dtype: object


### Testing data Feature Engineering

In [154]:
test_data = pd.read_csv('fraudTest.csv')


In [160]:
cols = test_data.columns

In [161]:
cols

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [162]:
'''Step-1: Segragating data-time, categorical, numerical, and output features and removing  
Nominal, meaning less, complicated ones '''
date_features = [cols[1], cols[19]]
cat_features = [cols[2], cols[4], cols[8], cols[10], cols[11], cols[12], cols[16]]
num_features = [cols[5], cols[13], cols[14], cols[15], cols[17], cols[20], cols[21]  ]
output_features = [cols[22]]

# Concatenate the feature lists into a single list
all_features = date_features + cat_features + num_features + output_features
df =  test_data[all_features]
print(all_features)

['trans_date_trans_time', 'unix_time', 'cc_num', 'category', 'gender', 'city', 'state', 'zip', 'job', 'amt', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud']


In [163]:
test_dt =  train_data[all_features]

In [164]:
(train_dt).isna().sum()

trans_date_trans_time    0
unix_time                0
cc_num                   0
category                 0
gender                   0
city                     0
state                    0
zip                      0
job                      0
amt                      0
lat                      0
long                     0
city_pop                 0
dob                      0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [165]:
from math import sin, cos, sqrt, atan2, radians

def haversine(lat1, lon1, lat2, lon2):
    R = 3958.8  # Earth radius in miles

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Calculate the distance using the Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c

    return distance

In [167]:
def new_features(data):
    df = data.copy()
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date'] = df['trans_date_trans_time'].dt.date
    df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
    df['trans_hour'] = df['trans_date_trans_time'].dt.hour
    df['avg_category_amt'] = df.groupby('category')['amt'].transform('mean')
    df['amt_above_avg_category_amt'] = (df['amt'] > df['avg_category_amt']).astype(int)
    df = df.sort_values(['cc_num', 'trans_date_trans_time'], ascending=[True, True])
    df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds()
    df['trans_freq'] = df.groupby('cc_num')['trans_date_trans_time'].transform('count')
    df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].transform(lambda x: x.max() - x)
    df = df.sort_index()
    df['distance'] = df.apply(lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)
    df['time_since_last_trans'] = df['time_since_last_trans'].dt.total_seconds()
    df['days_since_last_trans'], df['hours_since_last_trans'] = divmod(df['time_since_last_trans'], 86400)
    df['hours_since_last_trans'] /= 3600
    df['days_since_last_trans'] = df['days_since_last_trans'].astype(int)
    df['hours_since_last_trans'] = df['hours_since_last_trans'].astype(int)
    test_dt['dob'] = test_dt['dob'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    date  = pd.to_datetime(df['trans_date'])
    df['age']    = ( date- test_dt['dob']).dt.days.astype(int)
    return df

In [168]:
engineered_data_test = pd.DataFrame()
engineered_data_test = new_features(test_dt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dt['dob'] = test_dt['dob'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))


In [169]:
# Drop irrelevant columns
engineered_data_test = engineered_data_test.drop(['trans_date_trans_time', 'unix_time', 'cc_num','city', 'zip', 'lat', 'long', 'merch_lat', 'merch_long','avg_category_amt','dob'], axis=1)


In [170]:
encoded_engineered_data_test = pd.DataFrame()
encoded_engineered_data_test = encode_features(engineered_data_test)

In [171]:
encoded_engineered_data_test.head(5)

Unnamed: 0,category,gender,state,job,amt,city_pop,is_fraud,trans_date,day_of_week,trans_hour,amt_above_avg_category_amt,time_since_last_trans,trans_freq,distance,days_since_last_trans,hours_since_last_trans,age,region,time_diff,trans_date_encoded
0,0,0,NC,"Psychologist, counselling",4.97,3495,0,2019-01-01,weekday,morning,0,46418877.0,2028,48.838809,537,6,11255,South,1575 days 22:38:21.316777,long ago
1,1,0,WA,Special educational needs teacher,107.23,149,0,2019-01-01,weekday,morning,0,46425509.0,3030,18.773185,537,7,14804,West,1575 days 22:38:21.316777,long ago
2,2,1,ID,Nature conservation officer,220.11,4154,0,2019-01-01,weekday,morning,1,46336689.0,503,67.236892,536,7,20801,West,1575 days 22:38:21.316777,long ago
3,3,1,MT,Patent attorney,45.0,1939,0,2019-01-01,weekday,morning,0,46230446.0,493,59.449252,535,1,18982,West,1575 days 22:38:21.316777,long ago
4,4,1,VA,Dance movement psychotherapist,41.96,99,0,2019-01-01,weekday,morning,0,46409545.0,2017,48.192064,537,3,11967,South,1575 days 22:38:21.316777,long ago


In [172]:
encoded_engineered_data_test = encoded_engineered_data_test.drop(['state', 'job', 'time_diff','trans_date'],axis=1)

In [173]:
(encoded_engineered_data_test).isna().sum()

category                      0
gender                        0
amt                           0
city_pop                      0
is_fraud                      0
day_of_week                   0
trans_hour                    0
amt_above_avg_category_amt    0
time_since_last_trans         0
trans_freq                    0
distance                      0
days_since_last_trans         0
hours_since_last_trans        0
age                           0
region                        0
trans_date_encoded            0
dtype: int64

In [174]:
encoded_engineered_data_train.head(5)

Unnamed: 0,category,gender,amt,city_pop,is_fraud,day_of_week,trans_hour,amt_above_avg_category_amt,time_since_last_trans,trans_freq,distance,days_since_last_trans,hours_since_last_trans,age,region,trans_date_encoded
0,0,0,4.97,3495,0,weekday,morning,0,46418877.0,2028,48.838809,537,6,11255,South,long ago
1,1,0,107.23,149,0,weekday,morning,0,46425509.0,3030,18.773185,537,7,14804,West,long ago
2,2,1,220.11,4154,0,weekday,morning,1,46336689.0,503,67.236892,536,7,20801,West,long ago
3,3,1,45.0,1939,0,weekday,morning,0,46230446.0,493,59.449252,535,1,18982,West,long ago
4,4,1,41.96,99,0,weekday,morning,0,46409545.0,2017,48.192064,537,3,11967,South,long ago


In [175]:
dummy_test = encoded_engineered_data_test.copy()

In [176]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

dummy_test['day_of_week'] = le.fit_transform(dummy_test['day_of_week'])
dummy_test['trans_hour'] = le.fit_transform(dummy_test['trans_hour'])
dummy_test['region'] = le.fit_transform(dummy_test['region'])
dummy_test['time_since_last_trans'] = le.fit_transform(dummy_test['time_since_last_trans'])
dummy_test['days_since_last_trans'] = le.fit_transform(dummy_test['days_since_last_trans'])
dummy_test['hours_since_last_trans'] = le.fit_transform(dummy_test['hours_since_last_trans'])
dummy_test['trans_date_encoded'] = le.fit_transform(dummy_test['trans_date_encoded'])


In [177]:
X_test = dummy_test.drop('is_fraud', axis=1)

In [179]:
y_test = dummy_test['is_fraud']

In [178]:
y_pred = nb_model.predict(X_test)

In [180]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

print(cm)

[[1281642    7527]
 [   3926    3580]]


In [181]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)

# classification report
report = classification_report(y_test, y_pred)

# results
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9911674089498139
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00   1289169
           1       0.32      0.48      0.38      7506

    accuracy                           0.99   1296675
   macro avg       0.66      0.74      0.69   1296675
weighted avg       0.99      0.99      0.99   1296675

