In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

In [2]:
# Data Preparation

data = pd.read_csv('data/card_transdata.csv')
data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [3]:
x = data.iloc[:,:-1].values
y = data.iloc[:,-1:].values

In [4]:
x.shape,y.shape

((1000000, 7), (1000000, 1))

In [5]:
# spliting train and Test dataset
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=2)

In [6]:
log_reg = LogisticRegression(solver='liblinear')
log_reg.fit(xtrain,ytrain)
log_pred = log_reg.predict(xtest)
log_pred

  y = column_or_1d(y, warn=True)


array([0., 0., 0., ..., 0., 0., 0.])

In [7]:
# initial Logistic Regression Model

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

print("Accuracy Score \t",accuracy_score(ytest,log_pred))
print("Confusion Matrix \n",confusion_matrix(ytest,log_pred))
print("classification Report \n", classification_report(ytest,log_pred))

Accuracy Score 	 0.958936
Confusion Matrix 
 [[226491   1549]
 [  8717  13243]]
classification Report 
               precision    recall  f1-score   support

         0.0       0.96      0.99      0.98    228040
         1.0       0.90      0.60      0.72     21960

    accuracy                           0.96    250000
   macro avg       0.93      0.80      0.85    250000
weighted avg       0.96      0.96      0.96    250000



In [8]:
# Feature Engineering

# Creating new features
# If the target variable is imbalanced (i.e., there are many more 0s than 1s, or vice versa), 
# then encoding the target variable can help to improve the performance.

data['fraud_encoded'] = np.log(data['fraud'] / (1 - data['fraud']))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# Handling missing values & Scaling or normalizing features
data.isna().sum() # no missing values

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [17]:
# Upsampling

# Separate the majority and minority classes
from sklearn.utils import resample

majority_class = data[data['fraud'] == 0]
minority_class = data[data['fraud'] == 1]

# Upsample the minority class to match the number of samples in the majority class
upsampled_minority = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=2)

# Combine the upsampled minority class with the majority class
upsampled_data = pd.concat([majority_class, upsampled_minority])

In [18]:
upsampled_data

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
851922,11.007025,0.386596,4.666583,1.0,0.0,0.0,1.0,1.0
495625,35.813249,1.360039,5.386531,1.0,1.0,0.0,1.0,1.0
930081,286.643708,0.037750,5.021484,1.0,0.0,1.0,1.0,1.0
827484,2.089931,0.198649,5.847176,1.0,1.0,0.0,1.0,1.0


In [19]:
upsa_x = upsampled_data.drop('fraud', axis=1)
upsa_y = upsampled_data.iloc[:,-1:].values

In [20]:
upsa_x,upsa_y

(        distance_from_home  distance_from_last_transaction  \
 0                57.877857                        0.311140   
 1                10.829943                        0.175592   
 2                 5.091079                        0.805153   
 3                 2.247564                        5.600044   
 4                44.190936                        0.566486   
 ...                    ...                             ...   
 851922           11.007025                        0.386596   
 495625           35.813249                        1.360039   
 930081          286.643708                        0.037750   
 827484            2.089931                        0.198649   
 349415           22.251491                        8.169915   
 
         ratio_to_median_purchase_price  repeat_retailer  used_chip  \
 0                             1.945940              1.0        1.0   
 1                             1.294219              1.0        0.0   
 2                           

In [21]:
# spliting train and Test dataset
xtrain,xtest,ytrain,ytest = train_test_split(upsa_x,upsa_y,test_size=0.25,random_state=2)

In [22]:
upsample_log_reg = LogisticRegression(solver='liblinear')
upsample_log_reg.fit(xtrain,ytrain)
upsample_log_pred = upsample_log_reg.predict(xtest)
upsample_log_pred

  y = column_or_1d(y, warn=True)


array([1., 0., 0., ..., 0., 1., 1.])

In [23]:
print("Accuracy Score: \t",accuracy_score(ytest,upsample_log_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(ytest,upsample_log_pred))
print()
print("classification Report :\n", classification_report(ytest,upsample_log_pred))

Accuracy Score: 	 0.9411109820534342

Confusion Matrix: 
 [[212733  15259]
 [ 11612 216695]]

classification Report :
               precision    recall  f1-score   support

         0.0       0.95      0.93      0.94    227992
         1.0       0.93      0.95      0.94    228307

    accuracy                           0.94    456299
   macro avg       0.94      0.94      0.94    456299
weighted avg       0.94      0.94      0.94    456299



In [24]:
# Downsampling
downsampled_majority = majority_class.sample(n=len(minority_class), random_state=2)

# Combine the downsampled majority class with the minority class
downsampled_data = pd.concat([downsampled_majority, minority_class])

In [25]:
downsa_x = downsampled_data.iloc[:,:-1].values
downsa_y = downsampled_data.iloc[:,-1:].values

In [26]:
# spliting train and Test dataset
xtrain,xtest,ytrain,ytest = train_test_split(downsa_x,downsa_y,test_size=0.25,random_state=2)

In [27]:
downsample_log_reg = LogisticRegression(solver='liblinear')
downsample_log_reg.fit(xtrain,ytrain)
downsample_log_pred = downsample_log_reg.predict(xtest)
downsample_log_pred

  y = column_or_1d(y, warn=True)


array([0., 1., 0., ..., 0., 0., 0.])

In [28]:
print("Accuracy Score: \t",accuracy_score(ytest,downsample_log_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(ytest,downsample_log_pred))
print()
print("classification Report :\n", classification_report(ytest,downsample_log_pred))

Accuracy Score: 	 0.9400256281177063

Confusion Matrix: 
 [[20327  1489]
 [ 1132 20754]]

classification Report :
               precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     21816
         1.0       0.93      0.95      0.94     21886

    accuracy                           0.94     43702
   macro avg       0.94      0.94      0.94     43702
weighted avg       0.94      0.94      0.94     43702



In [35]:
# Using synthetic data generation techniques (e.g., SMOTE)

from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Scale the training and test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(xtrain)
X_test_scaled = scaler.transform(xtest)

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, ytrain)



(43702, 7)

In [40]:
X_test_scaled.shape,X_train_scaled.shape,X_train_resampled.shape

((43702, 7), (131104, 7), (131174, 7))

In [41]:
# spliting train and Test dataset
xtrain,xtest,ytrain,ytest = train_test_split(X_train_resampled,y_train_resampled,test_size=0.25,random_state=2)

In [44]:
smote_log_reg = LogisticRegression(solver='liblinear')
smote_log_reg.fit(xtrain,ytrain)
smote_log_pred = smote_log_reg.predict(xtest)
smote_log_pred

array([1., 0., 1., ..., 0., 1., 0.])

In [45]:
print("Accuracy Score: \t",accuracy_score(ytest,smote_log_pred))
print()
print("Confusion Matrix: \n",confusion_matrix(ytest,smote_log_pred))
print()
print("classification Report :\n", classification_report(ytest,smote_log_pred))

Accuracy Score: 	 0.9401719826797585

Confusion Matrix: 
 [[15329  1134]
 [  828 15503]]

classification Report :
               precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     16463
         1.0       0.93      0.95      0.94     16331

    accuracy                           0.94     32794
   macro avg       0.94      0.94      0.94     32794
weighted avg       0.94      0.94      0.94     32794



In [29]:
#coefficient
log_reg.coef_

array([[  0.0151646 ,   0.02449207,   0.86096227,  -0.62260492,
         -1.04048923, -13.35022274,   6.67488996]])

In [30]:
# interpret
log_reg.intercept_

array([-10.37644757])

In [None]:
# model comparison

Among all the methods like upsampling,downsampling,SMOTE and 
Feature Engineering of Logistic Regression model has Accuracy less than
the normal Logistic Regression model
