In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("..\\data\\processed\\train_data.csv")

In [3]:
test = pd.read_csv("..\\data\\processed\\test_data.csv")

In [4]:
train.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,12.94198,2.935784,7.386764,1.0,0.0,0.0,1.0,1.0
1,1.951739,25.327314,6.951437,1.0,1.0,0.0,1.0,1.0
2,9.613066,0.519752,0.300678,1.0,0.0,0.0,1.0,0.0
3,13.338463,1.698976,2.220091,1.0,0.0,0.0,0.0,0.0
4,11.45742,0.040196,0.296429,1.0,0.0,0.0,1.0,0.0


In [5]:
test.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,127.333572,0.146608,6.81747,1.0,1.0,0.0,1.0,1.0
1,5.717586,0.286118,2.111874,1.0,0.0,0.0,1.0,0.0
2,4.546093,0.126864,2.688452,1.0,1.0,0.0,1.0,0.0
3,4.115535,2.427539,3.978698,1.0,0.0,0.0,1.0,0.0
4,1.502405,9.498426,1.459913,0.0,1.0,0.0,1.0,0.0


In [6]:
## splitting independent and dependent features of train and test data
train_y = train["fraud"]
test_y = test["fraud"]

In [7]:
train_y

0         1.0
1         1.0
2         0.0
3         0.0
4         0.0
         ... 
699995    0.0
699996    1.0
699997    1.0
699998    0.0
699999    0.0
Name: fraud, Length: 700000, dtype: float64

In [8]:
test_y

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
299995    0.0
299996    0.0
299997    0.0
299998    0.0
299999    1.0
Name: fraud, Length: 300000, dtype: float64

In [9]:
train_X = train.drop(columns="fraud", axis=1)
test_X = test.drop(columns="fraud",axis=1)

In [10]:
train_X.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
0,12.94198,2.935784,7.386764,1.0,0.0,0.0,1.0
1,1.951739,25.327314,6.951437,1.0,1.0,0.0,1.0
2,9.613066,0.519752,0.300678,1.0,0.0,0.0,1.0
3,13.338463,1.698976,2.220091,1.0,0.0,0.0,0.0
4,11.45742,0.040196,0.296429,1.0,0.0,0.0,1.0


In [11]:
test_X.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
0,127.333572,0.146608,6.81747,1.0,1.0,0.0,1.0
1,5.717586,0.286118,2.111874,1.0,0.0,0.0,1.0
2,4.546093,0.126864,2.688452,1.0,1.0,0.0,1.0
3,4.115535,2.427539,3.978698,1.0,0.0,0.0,1.0
4,1.502405,9.498426,1.459913,0.0,1.0,0.0,1.0


**Possible simple classification model that we can use:**
1. logistic regression classifier
2. nearest neighbors classifiers
3. random forest classifier
4. xgboost classifier

**Since it is okay if we predict non-fraud transactions as fraud but we should try to avoid predicting fraud transactions as non-fraud as much as possible, we should have low precision (high false positive) and high recall (low false negative)**

#### Logistic Regression

In [12]:
## creating logistic model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=2000)
lr.fit(train_X, train_y)

LogisticRegression(max_iter=2000)

In [13]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [14]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
scv = StratifiedKFold(n_splits=10, random_state=2234, shuffle=True)
lr_precision_score = cross_val_score(lr, test_X, test_y, cv=scv, scoring="precision")
lr_roc_auc_score = cross_val_score(lr, test_X, test_y, cv=scv, scoring="roc_auc")

In [15]:
print(f"The precision score for the trained logistic regression model is {np.round(lr_precision_score.mean(),3)} +/- {np.round(lr_precision_score.std(),3)} .")

The precision score for the trained logistic regression model is 0.894 +/- 0.011 .


In [16]:
print(f"The roc_auc score for the trained logistic regression model is {np.round(lr_roc_auc_score.mean(),3)} +/- {np.around(lr_roc_auc_score.std(),3)} .")

The roc_auc score for the trained logistic regression model is 0.966 +/- 0.003 .


#### Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=2, random_state=234)
rfc.fit(train_X,train_y)

RandomForestClassifier(max_depth=2, random_state=234)

In [23]:
scv = StratifiedKFold(n_splits=10, random_state=22734, shuffle=True)
rfc_precision_score = cross_val_score(rfc, test_X, test_y, cv=scv, scoring="precision")
rfc_roc_auc_score = cross_val_score(rfc, test_X, test_y, cv=scv, scoring="roc_auc")

In [25]:
print(f"The precision score for the trained random forest classification model is {rfc_precision_score.mean()} +/- {rfc_precision_score.std()} .")
print(f"The roc_auc score for the trained random forest classification model is {rfc_roc_auc_score.mean()} +/- {rfc_roc_auc_score.std()} .")

The precision score for the trained random forest classification model is 1.0 +/- 0.0 .
The roc_auc score for the trained random forest classification model is 0.9900639037286958 +/- 0.0003042664278205408 .


##### Since random forest classifier with only depth=2 is giving us quite a good model, we do not need to check for other algorithms