## Feature Engineering and Data Balancing

### Overview
In this file, we perform basic feature engineering and balance the dataset using SMOTE (Synthetic Minority Over-sampling Technique).

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [4]:
df = pd.read_csv("data/fraudTrain.csv")
df.sample()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
122581,122581,2019-03-09 22:43:37,571365235126,"fraud_Crist, Jakubowski and Littel",home,41.67,Barbara,Taylor,F,0069 Robin Brooks Apt. 695,...,44.5995,-86.2141,372,"Exhibitions officer, museum/gallery",1995-07-12,2aba6eb4b69895a330a0e3a294591055,1331333017,43.811303,-86.432905,0


### Removing Unwanted columns

In [5]:
columns_to_remove = ["Unnamed: 0" , "cc_num" , "first" , "last" , "street" , "lat" , "long", "zip" ,
                     "job" , "trans_num" , "unix_time" , "merch_lat" , "merch_long" ]
df.drop(columns=columns_to_remove, inplace=True)

In [6]:
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,city_pop,dob,is_fraud
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,3495,1988-03-09,0
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,149,1978-06-21,0
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,4154,1962-01-19,0
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,1939,1967-01-12,0
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,99,1986-03-28,0


In [7]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'])

## Extracting Time of Day

In [8]:
df['trans_hour'] = df['trans_date_trans_time'].dt.hour

In [9]:
def time_of_day(hour):
    if  0 <= hour <= 7:
        return "night"
    elif 8 <= hour <= 15:
        return "morning"
    else:
        return "midday"

In [10]:
df['time_of_day'] = df['trans_hour'].apply(time_of_day)

## Exctracting age

In [11]:
# extracting year from dob variable
df['year_birth']= df['dob'].dt.year

In [12]:
# create the function
def age(age):
    if  2024 - age <= 29 :
        return "Young"
    elif 30 <= 2024 - age <=59 :
        return "Middle-Aged"
    else:
        return "Old"

In [13]:
df['age'] = df['year_birth'].apply(age)

In [14]:
df.sample(5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,city_pop,dob,is_fraud,trans_hour,time_of_day,year_birth,age
187437,2019-04-07 12:49:17,fraud_Friesen Inc,shopping_pos,6.56,F,Georgetown,MN,346,1976-12-14,0,12,morning,1976,Middle-Aged
336650,2019-06-08 21:51:20,"fraud_Langosh, Wintheiser and Hyatt",food_dining,6.23,F,Centerview,MO,2368,1989-07-17,0,21,midday,1989,Middle-Aged
922720,2019-12-31 08:18:11,fraud_Kilback LLC,grocery_pos,84.88,F,Moro,IL,2401,1972-01-03,0,8,morning,1972,Middle-Aged
1106980,2020-04-05 21:12:44,fraud_Dietrich-Fadel,health_fitness,59.94,M,Ruckersville,VA,9815,1975-07-07,0,21,midday,1975,Middle-Aged
1076337,2020-03-23 07:26:28,"fraud_Lind, Huel and McClure",gas_transport,82.16,F,Greendale,WI,13973,1981-05-06,0,7,night,1981,Middle-Aged


In [15]:
df.drop(columns=['trans_date_trans_time' , 'dob' , 'trans_hour' , 'year_birth'] ,inplace = True)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
X_train = df.drop(columns=['is_fraud'])
y_train = df['is_fraud']

## Scaling and Encoding

In [18]:
categorical_columns = X_train.select_dtypes(include=['object']).columns
numerical_columns = X_train.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), numerical_columns),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns)
    ],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('encoder', preprocessor),
])

X_train_scaled = pipeline.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)

## Balancing Data: SMOTE

In [19]:
# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [20]:
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_resampled, y_train_resampled)

In [21]:
X_train_resampled.sample(5)

Unnamed: 0,merchant,category,amt,gender,city,state,city_pop,time_of_day,age
751042,0.569522,-0.293721,403.0,12.0,0.0,677.0,23.0,1.0,2.0
1439398,4.622187,0.105026,65.520534,8.312321,0.895893,835.230184,22.312321,0.0,0.208214
1733952,3.056362,0.239219,278.649906,8.48743,0.418762,552.324953,11.093809,0.0,1.418762
262371,0.075603,-0.292611,507.0,5.0,1.0,273.0,50.0,0.0,0.0
893792,0.160054,-0.165301,107.0,2.0,1.0,770.0,34.0,2.0,1.0


In [22]:
df = X_train_resampled.copy()
df['is_fraud'] = y_train_resampled

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

temp = df.sample(10000)
X = temp.drop('is_fraud', axis=1)
y = temp['is_fraud']

### Baseline model

In [24]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [25]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.983
Confusion Matrix:
[[995  12]
 [ 22 971]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1007
           1       0.99      0.98      0.98       993

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000



In [28]:
df.to_csv('data/credit_card_fraud_detection_feature_engineering.csv', index=False)

In [26]:
import pickle

In [30]:
with open('data/scaler_encoder_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [31]:
pipeline