### **1. Importing Liabraries**

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
import time
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


### **2. Loading Datasets**

In [2]:
train_data=pd.read_csv("../data/train_data.csv",index_col=0)
test_data=pd.read_csv("../data/test_data.csv",index_col=0)

In [3]:
#convert category to dummy variables in both Train and Test datasets

train_data=pd.get_dummies(train_data, drop_first=True)
test_data=pd.get_dummies(test_data, drop_first=True)

### **3. Split the Train and Test Datasets into X_train,y_train,X_test,y_test**

In [4]:
# Since our data is already splitted in Train and Test data thats why we are not using splitting function to split the data instaed we will mnaully do that  
X_train = train_data.drop('is_fraud', axis = 1)
y_train = train_data['is_fraud'] 

X_test = test_data.drop('is_fraud', axis = 1) 
y_test = test_data['is_fraud'] 

In [5]:
X_train

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,age,trans_month,...,state_WI,state_WV,state_WY,label_Train,trans_week_days_Monday,trans_week_days_Saturday,trans_week_days_Sunday,trans_week_days_Thursday,trans_week_days_Tuesday,trans_week_days_Wednesday
0,2703186189652095,4.97,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,30.0,1,...,0,0,0,1,0,0,0,0,1,0
1,630423337322,107.23,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,40.0,1,...,0,0,0,1,0,0,0,0,1,0
2,38859492057661,220.11,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,56.0,1,...,0,0,0,1,0,0,0,0,1,0
3,3534093764340240,45.00,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,51.0,1,...,0,0,0,1,0,0,0,0,1,0
4,375534208663984,41.96,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,32.0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,30560609640617,43.77,40.4931,-91.8912,519,1388534347,39.946837,-91.333331,54.0,12,...,0,0,0,0,0,0,0,1,0,0
1852390,3556613125071656,111.84,29.0393,-95.4401,28739,1388534349,29.661049,-96.186633,21.0,12,...,0,0,0,0,0,0,0,1,0,0
1852391,6011724471098086,86.88,46.1966,-118.9017,3684,1388534355,46.658340,-119.715054,39.0,12,...,0,0,0,0,0,0,0,1,0,0
1852392,4079773899158,7.99,44.6255,-116.4493,129,1388534364,44.470525,-117.080888,55.0,12,...,0,0,0,0,0,0,0,1,0,0


### **4. Resampling Datasets**

In [6]:
method= SMOTE()
X_resampled, y_resampled = method.fit_resample(X_train, y_train)

In [7]:
print(X_resampled.shape,y_resampled.shape)

(3685486, 82) (3685486,)


### **Modeling**
#### **Model 1 : Random Forest Classifier**

Random Forest is an ensemble method that constructs multiple decision trees, typically trained through the "bagging" technique, known as Bootstrap Aggregation. In the context of a classification problem, each tree is built from a bootstrap sample, providing diversity in the training data. This method is particularly useful for investigating feature importance in large datasets.

The foundational learner in Random Forest is the Decision Tree. Each tree in the ensemble is trained on a subset of the data, incorporating both raw sampling and feature sampling. The training process is conducted in parallel, enhancing computational efficiency.

To arrive at a final prediction, unweighted voting is employed across the ensemble. Notably, Random Forest is considered easier to tune and less prone to overfitting compared to individual decision trees.

Unlike some algorithms, Random Forest classifiers typically do not require feature scaling due to their Decision Tree-based nature. However, for our initial case, we are experimenting with predicting values using scaled data to explore potential performance enhancements.

In [8]:
# Cross Validation
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_resampled, y_resampled, cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.00978974, 0.42804407, 0.49634037, 0.99989147, 0.99989689])

**Grid Search CV**

In [None]:
#rf = RandomForestClassifier()
#param = {'n_estimators': [10, 100, 150],
#        'max_depth': [10, 20, 30, None]}

#gs = GridSearchCV(rf, param,cv=5, n_jobs=-1)
#gs_fit = gs.fit(X_train_scaled, y_train)
#pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

**Build our own Grid-search**

In [10]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_resampled, y_resampled)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [11]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 0.116 / Recall: 0.782 / Accuracy: 0.968
Est: 10 / Depth: 20 ---- Precision: 0.284 / Recall: 0.95 / Accuracy: 0.987
Est: 10 / Depth: 30 ---- Precision: 0.407 / Recall: 0.986 / Accuracy: 0.992
Est: 10 / Depth: None ---- Precision: 0.994 / Recall: 0.99 / Accuracy: 1.0
Est: 50 / Depth: 10 ---- Precision: 0.146 / Recall: 0.787 / Accuracy: 0.975
Est: 50 / Depth: 20 ---- Precision: 0.309 / Recall: 0.953 / Accuracy: 0.989
Est: 50 / Depth: 30 ---- Precision: 0.493 / Recall: 0.997 / Accuracy: 0.995
Est: 50 / Depth: None ---- Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0
Est: 100 / Depth: 10 ---- Precision: 0.143 / Recall: 0.782 / Accuracy: 0.974
Est: 100 / Depth: 20 ---- Precision: 0.332 / Recall: 0.951 / Accuracy: 0.99
Est: 100 / Depth: 30 ---- Precision: 0.521 / Recall: 0.999 / Accuracy: 0.995
Est: 100 / Depth: None ---- Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0


**Evaluation**

In [12]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_resampled, y_resampled)
end = time.time()
fit_time = (end - start)


start = time.time()
y_pred = rf_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 527.48 / Predict time: 19.41 ---- Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0


In [13]:
print('Classification report:\n', classification_report(y_test, y_pred))
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', conf_mat)

print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1842743
           1       1.00      1.00      1.00      9651

    accuracy                           1.00   1852394
   macro avg       1.00      1.00      1.00   1852394
weighted avg       1.00      1.00      1.00   1852394

Confusion matrix:
 [[1842743       0]
 [      0    9651]]
Share of Non-Fraud in Test Data: 0.9948


In [14]:
from sklearn.metrics import confusion_matrix
TP, FP, FN, TN = confusion_matrix(y_test, y_pred).ravel()
c_matrix = pd.DataFrame({'Predicted = Fraud': [TP, FP],
                                 'Predicted = Not Fraud': [FN, TN]
                                }, index=['Fraud', 'Not Fraud'])
c_matrix

Unnamed: 0,Predicted = Fraud,Predicted = Not Fraud
Fraud,1842743,0
Not Fraud,0,9651


**Hyperparameter tuning**


Manual Hyperparameter Tuning
* Increasing the predictive power
  * n_estimators : number of trees the algorithm builds before taking the maximum voting or taking the averages of predictions.
  * max_features : maximum number of features random forest considers to split a node.
* Increasing the model's speed
  * n_jobs : The n_jobs hyperparameter tells the engine how many processors it is allowed to use. If it has a value of one, it can only use one processor. A value of “-1” means that there is no limit.
  * random_state : hyperparameter makes the model’s output replicable
  * The minimum number of samples required to be at a leaf node

In [15]:
### Manual Hyperparameter Tuning
model=RandomForestClassifier(n_estimators=150,criterion='entropy',max_depth=None,
                             max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_resampled,y_resampled)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

[[1836742    6001]
 [    645    9006]]
0.9964122103612947
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1842743
           1       0.60      0.93      0.73      9651

    accuracy                           1.00   1852394
   macro avg       0.80      0.96      0.86   1852394
weighted avg       1.00      1.00      1.00   1852394



#### **Model 2 : XGBoost model**

**Build our own Grid-search**

In [18]:
# hyperparameter tuning with XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [25]:

# creating a KFold object 
folds = 3

# specify range of hyperparameters
param_grid = {'learning_rate': [0.2, 0.6], 
             'subsample': [0.3, 0.6, 0.9]}          


# specify model
xgb_model = XGBClassifier(max_depth=2, n_estimators=200)

# set up GridSearchCV()
XgB_model_cv = GridSearchCV(estimator = xgb_model, 
                        param_grid = param_grid, 
                        scoring= 'roc_auc', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
XgB_model_cv.fit(X_resampled, y_resampled)  



Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [12]:
XgB_model_cv.best_params_

NameError: name 'XgB_model_cv' is not defined

In [19]:
# Applying the hyperparameters

params = {'learning_rate': 0.2,
          'max_depth': 2, 
          'n_estimators':200,
          'subsample':0.9,
         'objective':'binary:logistic'}

# fit model on training data
XgB_df_model = XGBClassifier(params = params)
XgB_df_model.fit(X_resampled, y_resampled)



Parameters: { "params" } are not used.



In [22]:
# Predictions on the test set
XgB_test_pred = XgB_df_model.predict(X_test)

precision, recall, fscore, train_support = score(y_test, XgB_test_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}/ Fscore'.format(
    round(precision, 3), round(recall, 3), round((XgB_test_pred==y_test).sum()/len(XgB_test_pred), 3)))

Precision: 0.26 / Recall: 0.963 / Accuracy: 0.986


In [23]:
# classification_report
print(classification_report(y_test, XgB_test_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1842743
           1       0.26      0.96      0.41      9651

    accuracy                           0.99   1852394
   macro avg       0.63      0.97      0.70   1852394
weighted avg       1.00      0.99      0.99   1852394



In [24]:
conf_mat = confusion_matrix(y_true=y_test, y_pred=XgB_test_pred)
print('Confusion matrix:\n', conf_mat)

Confusion matrix:
 [[1816307   26436]
 [    361    9290]]


In [17]:
from sklearn.metrics import confusion_matrix
TP, FP, FN, TN = confusion_matrix(y_test, XgB_test_pred).ravel()
c_matrix = pd.DataFrame({'Predicted = Fraud': [TP, FP],
                                 'Predicted = Not Fraud': [FN, TN]
                                }, index=['Fraud', 'Not Fraud'])
c_matrix

Unnamed: 0,Predicted = Fraud,Predicted = Not Fraud
Fraud,1816307,361
Not Fraud,26436,9290


#### **Model 2 : LOGISTIC REGRESSION**

Logistic Regression is a statistical method used for binary classification problems. It's a type of regression analysis that is well-suited for predicting the probability of an event occurring, which is then converted into a binary outcome (usually 0 or 1). Despite its name, logistic regression is a classification algorithm, not a regression algorithm.

The C hyperparameter is a regularization parameter in logistic regression that controls how closely Model fits to the training data.

In [17]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [18]:
lr = LogisticRegression()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_resampled,y_resampled)

print_results(cv)