In [1]:
import pandas as pd

#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from joblib import dump, load
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

import pickle

import warnings 
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
df_train = pd.read_csv('../datasets/fraudTrain.csv', low_memory=False, index_col=0)
df_test = pd.read_csv('../datasets/fraudTest.csv', low_memory=False, index_col=0)

In [3]:
df_train.shape, df_test.shape

((1296675, 22), (555719, 22))

In [4]:
df_train.head(2)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0


In [5]:
df_train.job.nunique()

494

In [6]:
df_train.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [7]:
list(df_train.columns) == list(df_test.columns)

True

### Next steps
- Merge both train and test (done)
- Look into each of the columns
- over_sampling/under_sampling for imbalance of class (is_fraud)
- feature selection
- split data into train and test
- model building and evaluation
- wrap the complete model into a dash app

In [8]:
df = pd.concat([df_train, df_test],ignore_index=True)

In [9]:
df.to_csv('../data/combined.csv', index=False)

In [10]:
df.shape

(1852394, 22)

In [11]:
df.is_fraud.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

### Looking into columns and cleaning

In [12]:
def clean_df(df):
    return df.drop(['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time', 'merch_lat', 'merch_long', 'unix_time', 'lat', 'long', 'city_pop'],axis=1)

df = clean_df(df)

In [13]:
df.head(5)

Unnamed: 0,merchant,category,amt,gender,job,is_fraud
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,"Psychologist, counselling",0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Special educational needs teacher,0
2,fraud_Lind-Buckridge,entertainment,220.11,M,Nature conservation officer,0
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Patent attorney,0
4,fraud_Keeling-Crist,misc_pos,41.96,M,Dance movement psychotherapist,0


In [14]:
df['job'] = df['job'].str.split(',')
df = df.explode('job')
df['job'] = df['job'].str.lower()
df['job'] = df['job'].str.strip()

In [15]:
df = df.drop_duplicates().reset_index(drop=True)

In [16]:
with open('job_list.pkl', 'wb') as f:
    pickle.dump(list(df['job'].unique()),f)

with open('merchant_lst.pkl', 'wb') as f:
    pickle.dump(list(df['merchant'].unique()),f)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296292 entries, 0 to 2296291
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   merchant  object 
 1   category  object 
 2   amt       float64
 3   gender    object 
 4   job       object 
 5   is_fraud  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 105.1+ MB


In [18]:
df.category.value_counts()

category
gas_transport     233545
grocery_pos       217280
home              217035
shopping_pos      206615
kids_pets         200302
shopping_net      172179
entertainment     166488
food_dining       162449
personal_care     161852
health_fitness    151734
misc_pos          141690
misc_net          112837
grocery_net        80634
travel             71652
Name: count, dtype: int64

### Sampling and handling the minority class using SMOTE

In [19]:
no_fraud = df[df['is_fraud'] == 0].reset_index(drop=True)
no_fraud = no_fraud[:9650]
yes_fraud = df[df['is_fraud'] == 1].reset_index(drop=True)
yes_fraud = yes_fraud[:9650]
dummy = pd.concat([no_fraud,yes_fraud])

In [20]:
#dummy.to_csv('pos_neg_dummy_data.csv', index=False)

In [21]:
#df = dummy.copy()

In [22]:
df.head(2)

Unnamed: 0,merchant,category,amt,gender,job,is_fraud
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,psychologist,0
1,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,counselling,0


### Splitting data into train and test

In [23]:
# train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# train.shape, test.shape

In [24]:

# Define the columns to be encoded
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
numerical_columns = [col for col in df.columns if not df[col].dtype == 'object']
numerical_columns.remove('is_fraud')
print(categorical_columns, numerical_columns)
# Define preprocessing steps for categorical and numerical columns
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer([
    ('categorical', categorical_pipeline, categorical_columns),
    ('numerical', numerical_pipeline, numerical_columns)
])

# Define the XGBoost classifier
classifier = XGBClassifier()

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Fit the pipeline on the training data
pipeline.fit(df.drop(columns=["is_fraud"]), df["is_fraud"])

# Save the trained pipeline to a file
dump(pipeline, 'pipeline.joblib')

['merchant', 'category', 'gender', 'job'] ['amt']


['pipeline.joblib']

In [25]:
lst_50_rows = dummy

y_pred = pipeline.predict(lst_50_rows)
y = lst_50_rows['is_fraud']
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81      9650
           1       1.00      0.52      0.69      9650

    accuracy                           0.76     19300
   macro avg       0.84      0.76      0.75     19300
weighted avg       0.84      0.76      0.75     19300



In [27]:
df[df["is_fraud"] ==1][:2]

Unnamed: 0,merchant,category,amt,gender,job,is_fraud
3050,fraud_Rutherford-Mertz,grocery_pos,281.06,M,soil scientist,1
3078,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,F,horticultural consultant,1


### Encoding object columns

In [None]:
def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in df_obj.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(list(df[col].values))
        encoders[col] = encoder
    with open('LE_mdl_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

train = encode(train)

In [None]:
train.head(2)

### Modelling

In [None]:
x = train.drop(columns=['is_fraud'])
y = train['is_fraud']

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

In [None]:
x_train.shape

In [None]:
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = DecisionTreeClassifier()
model4 = xgb.XGBClassifier()

In [None]:
def model_train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print('Accuracy Score: ',accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    with open(str(model)[:3]+'_mdl.pkl', 'wb') as f:
        pickle.dump(model,f)

In [None]:
#model_train(model1, x_train, y_train, x_val, y_val)

In [None]:
#model_train(model2, x_train, y_train, x_val, y_val)

In [None]:
#model_train(model3, x_train, y_train, x_val, y_val)

In [None]:
model_train(model4, x_train, y_train, x_val, y_val)