In [1]:
# Show plots inline
%matplotlib inline

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import statsmodels.formula.api as sm
import statsmodels.api as sma

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
df = pd.read_csv('/content/drive/MyDrive/SQL/my files/learning/ML/my projects 29apr24/fraud account detection/Variant V.csv')

In [4]:
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,intended_balcon_amount,payment_type,employment_status,credit_risk_score,...,phone_home_valid,phone_mobile_valid,vintage_months_count,has_other_cards,sanctioned_limit,foreign_country_tranfer,source,session_length_in_minutes,device_os,device_fraud_count
0,0,0.1,0.054656,-1,55,50,-0.779957,AB,CC,74,...,1,1,15,0,20000.0,0,INTERNET,5.063178,linux,0
1,0,0.2,0.516874,-1,178,50,-0.471011,AB,CC,164,...,1,1,31,0,20000.0,0,INTERNET,5.403648,linux,0
2,0,0.1,0.870572,-1,26,30,51.068616,AA,CA,169,...,0,1,28,1,20000.0,0,INTERNET,6.701977,linux,0
3,0,0.1,0.234248,29,9,20,16.341333,AA,CA,75,...,1,1,1,0,20000.0,0,INTERNET,12.787474,linux,0
4,0,0.2,0.909504,11,5,30,-1.151589,AB,CB,129,...,0,1,31,1,20000.0,0,INTERNET,4.670977,linux,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 22 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   fraud_bool                    1000000 non-null  int64  
 1   income                        1000000 non-null  float64
 2   name_email_similarity         1000000 non-null  float64
 3   prev_address_months_count     1000000 non-null  int64  
 4   current_address_months_count  1000000 non-null  int64  
 5   customer_age                  1000000 non-null  int64  
 6   intended_balcon_amount        1000000 non-null  float64
 7   payment_type                  1000000 non-null  object 
 8   employment_status             999978 non-null   object 
 9   credit_risk_score             1000000 non-null  int64  
 10  email_is_free                 1000000 non-null  int64  
 11  housing_status                999983 non-null   object 
 12  phone_home_valid             

In [6]:
df["fraud_bool"].unique()

array([0, 1])

# Development-Validation-Holdout

In [7]:
dev, val, holdout = np.split(
    df.sample(frac=1,random_state=1219),
    [
        int(.5*len(df)),
        int(.8*len(df))
    ]
)

In [8]:
print("Population data set shape: \t" , df.shape)
print("Development data set shape: \t" , dev.shape)
print("Validation data set shape: \t" , val.shape)
print("Holdout data set shape: \t" , holdout.shape)

Population data set shape: 	 (1000000, 22)
Development data set shape: 	 (500000, 22)
Validation data set shape: 	 (300000, 22)
Holdout data set shape: 	 (200000, 22)


In [9]:
print("Population fraud rate:",
      round(sum(df.fraud_bool)*100/len(df),2),"%")
print("Development sample fraud rate:",
      round(sum(dev.fraud_bool)*100/len(dev),2),"%")
print("validation fraud rate:",
      round(sum(val.fraud_bool)*100/len(val),2),"%")
print("Holdout fraud rate:",
      round(sum(holdout.fraud_bool)*100/len(holdout),2),"%")

Population fraud rate: 1.1 %
Development sample fraud rate: 1.11 %
validation fraud rate: 1.09 %
Holdout fraud rate: 1.11 %


In [10]:
dev.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,intended_balcon_amount,payment_type,employment_status,credit_risk_score,...,phone_home_valid,phone_mobile_valid,vintage_months_count,has_other_cards,sanctioned_limit,foreign_country_tranfer,source,session_length_in_minutes,device_os,device_fraud_count
120074,0,0.7,0.50495,12,5,20,-1.280685,AC,CA,29,...,0,1,-1,0,20000.0,0,INTERNET,4.744053,linux,0
265124,0,0.1,0.124078,-1,88,50,-0.688306,AC,CA,43,...,1,1,-1,0,20000.0,0,INTERNET,4.890658,linux,0
796184,0,0.6,0.988943,-1,159,50,49.075395,AA,CA,244,...,0,1,6,1,150000.0,0,INTERNET,5.027425,linux,0
795254,0,0.9,0.041099,-1,102,40,24.745517,AA,CA,174,...,1,1,31,1,50000.0,0,INTERNET,28.511181,other,0
37176,0,0.4,0.617012,-1,358,30,10.001946,AA,CB,90,...,1,1,15,1,20000.0,0,INTERNET,1.833216,other,0


# Missing value Imputation

## Why Imputation?

Because statistical packages discard the record having missing value in the data in any column.

In [11]:
def null_values(data):
  # Calculate the count of null values in each column
  print("null values count:")
  missing_data=data.isnull().sum(axis = 0) #axis =0 means row
  print(missing_data[missing_data != 0],"\n")

  # Calculate the percentage of null values in each column
  print("null values percentage:")
  null_percentage_data = (data.isnull().mean()) * 100
  print(null_percentage_data[null_percentage_data != 0])

In [12]:
print("null values of dev:")
print(null_values(dev),"\n")

print("null value of val")
print(null_values(val),"\n")

print("null value of holdout")
print(null_values(holdout))

null values of dev:
null values count:
employment_status    13
housing_status        7
sanctioned_limit     23
dtype: int64 

null values percentage:
employment_status    0.0026
housing_status       0.0014
sanctioned_limit     0.0046
dtype: float64
None 

null value of val
null values count:
employment_status     6
housing_status        6
sanctioned_limit     13
dtype: int64 

null values percentage:
employment_status    0.002000
housing_status       0.002000
sanctioned_limit     0.004333
dtype: float64
None 

null value of holdout
null values count:
employment_status    3
housing_status       4
sanctioned_limit     6
dtype: int64 

null values percentage:
employment_status    0.0015
housing_status       0.0020
sanctioned_limit     0.0030
dtype: float64
None


# How to use impute values


*   Do nothing
*   Mean median mode value imputation
*   KNN(Non parametric Technique)
*   Regression Techniques
*   surrogates  or logistic imputation of missing values
*   Multivariate Imputation by Chained Equation (MICE)



In [13]:
dev.dtypes

fraud_bool                        int64
income                          float64
name_email_similarity           float64
prev_address_months_count         int64
current_address_months_count      int64
customer_age                      int64
intended_balcon_amount          float64
payment_type                     object
employment_status                object
credit_risk_score                 int64
email_is_free                     int64
housing_status                   object
phone_home_valid                  int64
phone_mobile_valid                int64
vintage_months_count              int64
has_other_cards                   int64
sanctioned_limit                float64
foreign_country_tranfer           int64
source                           object
session_length_in_minutes       float64
device_os                        object
device_fraud_count                int64
dtype: object

In [14]:
def fill_missing_values(data):
    # Identify numerical columns
    numerical_columns = data.select_dtypes(include=['number']).columns

    # Fill NaN values for numerical columns with the mean of each column
    data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())
    # Identify categorical columns
    categorical_columns = data.select_dtypes(include=['object']).columns

    # Fill NaN values for categorical columns with a placeholder value, such as 'unknown'
    data[categorical_columns] = data[categorical_columns].fillna('unknown')

    return data

# Apply the function to your dev, test val and holdout
dev_filled = fill_missing_values(dev)
val_filled = fill_missing_values(val)
holdout_filled = fill_missing_values(holdout)

In [15]:
print("null values of dev:")
print(null_values(dev),"\n")

print("null value of val")
print(null_values(val),"\n")

print("null value of holdout")
print(null_values(holdout))

null values of dev:
null values count:
Series([], dtype: int64) 

null values percentage:
Series([], dtype: float64)
None 

null value of val
null values count:
Series([], dtype: int64) 

null values percentage:
Series([], dtype: float64)
None 

null value of holdout
null values count:
Series([], dtype: int64) 

null values percentage:
Series([], dtype: float64)
None


In [16]:
# Identify categorical columns
dev.select_dtypes(include=['object']).columns

Index(['payment_type', 'employment_status', 'housing_status', 'source',
       'device_os'],
      dtype='object')

In [17]:
dev['payment_type'].unique()

array(['AC', 'AA', 'AB', 'AD', 'AE'], dtype=object)

In [18]:
dev['employment_status'].unique()

array(['CA', 'CB', 'CE', 'CC', 'CF', 'CD', 'CG', 'unknown'], dtype=object)

In [19]:
dev['foreign_country_tranfer'].unique()

array([0, 1])

In [20]:
dev['source'].unique()

array(['INTERNET', 'TELEAPP'], dtype=object)

# Convert categorical values to numerical

In [21]:
from sklearn.preprocessing import LabelEncoder
# Convert categorical values to numerical using Label Encoding
def cate_to_numeric(data,cate_label_columns,cate_one_hot_columns):

  data_encoded = data.copy()
  for feature in cate_one_hot_columns:
    one_hot_encoded = pd.get_dummies(data_encoded[feature], prefix=feature)
    one_hot_encoded = one_hot_encoded.astype(int)
    data_encoded = pd.concat([data_encoded, one_hot_encoded], axis=1)

  label_encoder = LabelEncoder()
  for feature in cate_label_columns:
    data_encoded[feature] = label_encoder.fit_transform(data_encoded[feature])

  return data_encoded

In [23]:
# Apply the function to your dev, val and holdout
cate_label_columns=[]
cate_one_hot_columns=['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
new_dev = cate_to_numeric(dev_filled,cate_label_columns,cate_one_hot_columns)
new_val = cate_to_numeric(val_filled,cate_label_columns,cate_one_hot_columns)
new_holdout = cate_to_numeric(holdout_filled,cate_label_columns,cate_one_hot_columns)

In [24]:
new_dev.dtypes

fraud_bool                        int64
income                          float64
name_email_similarity           float64
prev_address_months_count         int64
current_address_months_count      int64
customer_age                      int64
intended_balcon_amount          float64
payment_type                     object
employment_status                object
credit_risk_score                 int64
email_is_free                     int64
housing_status                   object
phone_home_valid                  int64
phone_mobile_valid                int64
vintage_months_count              int64
has_other_cards                   int64
sanctioned_limit                float64
foreign_country_tranfer           int64
source                           object
session_length_in_minutes       float64
device_os                        object
device_fraud_count                int64
payment_type_AA                   int64
payment_type_AB                   int64
payment_type_AC                   int64


In [48]:
new_dev.shape

(500000, 50)

In [25]:
new_dev.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,intended_balcon_amount,payment_type,employment_status,credit_risk_score,...,housing_status_BF,housing_status_BG,housing_status_unknown,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
120074,0,0.7,0.50495,12,5,20,-1.280685,AC,CA,29,...,0,0,0,1,0,1,0,0,0,0
265124,0,0.1,0.124078,-1,88,50,-0.688306,AC,CA,43,...,0,0,0,1,0,1,0,0,0,0
796184,0,0.6,0.988943,-1,159,50,49.075395,AA,CA,244,...,0,0,0,1,0,1,0,0,0,0
795254,0,0.9,0.041099,-1,102,40,24.745517,AA,CA,174,...,0,0,0,1,0,0,0,1,0,0
37176,0,0.4,0.617012,-1,358,30,10.001946,AA,CB,90,...,0,0,0,1,0,0,0,1,0,0


In [27]:

columns = new_dev.columns.tolist()
dependent_var = 'fraud_bool'
features = [col for col in columns if col != dependent_var and col != 'payment_type' and col != 'employment_status' and col != 'housing_status'and col != 'source'
       and col != 'device_os'  ]

# Create the formula
formula = dependent_var + ' ~ ' + ' + '.join(features)

# Define the model
mylogit = sm.glm(
    formula,
    data=new_dev,
    family=sma.families.Binomial()
).fit()

# Print the model summary
print(mylogit.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             fraud_bool   No. Observations:               500000
Model:                            GLM   Df Residuals:                   499961
Model Family:                Binomial   Df Model:                           38
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -24701.
Date:                Wed, 15 May 2024   Deviance:                       49403.
Time:                        14:18:28   Pearson chi2:                 5.14e+05
No. Iterations:                    25   Pseudo R-squ. (CS):            0.02273
Covariance Type:            nonrobust                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       

In [49]:
# Extract the coefficients without the intercept (assuming the intercept is the first element)
coefficients_without_intercept = mylogit.params.values[1:]

# Print the coefficients without the intercept
print(coefficients_without_intercept)

[ 9.21629775e-01 -1.21279170e+00 -8.92670088e-03  1.60419931e-04
 -5.28304478e-03 -7.79293357e-03  2.80290786e-03  6.11262706e-01
 -1.03645584e+00 -3.36647240e-01  1.49448969e-02 -1.09375249e+00
  3.24320280e-06  4.78272692e-01  4.88826841e-03  4.29099985e-09
 -1.11968097e+00 -1.11468724e+00 -2.93012515e-01 -1.02643259e+00
 -1.58696548e+00  2.49933065e+00  1.95889094e+00  2.80044000e+00
  1.64872091e+00  1.83490116e+00  1.37714624e+00  2.75567933e+00
 -2.00158880e+01  3.47452401e+00  2.29934257e+00  2.41518943e+00
  2.75576925e+00  1.93686785e+00  2.09938927e+00  3.03795778e+00
 -2.31598189e+01 -2.84617272e+00 -2.29460607e+00 -1.65868033e+00
 -6.46229781e-01 -1.48409625e+00 -4.15085382e-01 -9.36687037e-01]


In [86]:

def calculate_logistic_regression_score(feature, coefficients, intercept):
    # Calculate linear combination
    linear_combination = np.dot(feature, coefficients) + intercept

    # Apply link function (sigmoid)
    probabilities = 1 / (1 + np.exp(-linear_combination))

    return probabilities


In [54]:
feature=new_dev[features]

In [69]:
coefficients_array = np.array(coefficients_without_intercept, dtype=float)
feature = np.array(feature, dtype=float)

In [87]:
intercept = -5.1408  # Intercept obtained from GLM regression
coefficients_array = np.array(coefficients_without_intercept, dtype=float)
feature = np.array(feature, dtype=float)

predicted_probabilities = calculate_logistic_regression_score(feature, coefficients_array, intercept)
print(predicted_probabilities)

[0.00845594 0.00246882 0.00203381 ... 0.00297432 0.01607328 0.00079572]


In [122]:
a=[col for col in mylogit.model.data.orig_exog.columns if col != 'Intercept']
val_features = new_val[a]

predicted_probabilities = mylogit.predict(val_features)

predicted_classes = (predicted_probabilities >= 0.5).astype(int)

In [124]:
from sklearn.metrics import accuracy_score, roc_auc_score


actuals = new_val['fraud_bool']


accuracy = accuracy_score(actuals, predicted_classes)
auc = roc_auc_score(actuals, predicted_probabilities)

print("Accuracy:", accuracy)
print("ROC AUC:", auc)


Accuracy: 0.9890833333333333
ROC AUC: 0.848246789661101


In [119]:
b=[col for col in mylogit.model.data.orig_exog.columns if col != 'Intercept']
holdout_features = new_holdout[b]

predicted_probabilities = mylogit.predict(holdout_features)

predicted_classes = (predicted_probabilities >= 0.5).astype(int)

In [118]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

actuals = new_holdout['fraud_bool']

accuracy = accuracy_score(actuals, predicted_classes)
auc = roc_auc_score(actuals, predicted_probabilities)

print("Accuracy:", accuracy)
print("ROC AUC:", auc)

Accuracy: 0.988915
ROC AUC: 0.851482951029072
