# Table of Contents
1. [Import Library](#1)
1. [Load Data](#2)
1. [Data Analysis](#3)
    * [Missing Value](#4)
    * [Distribution](#5)
1. [Feature Engineering](#6)
    * [Analyzing](#7)
    * [Feature Extraction](#8)
1. [Modelling](#9)
    * [Scaling](#10)
    * [Parameters](#11)
    * [Classifiers](#12)
    * [Grid Search Cross Validation and K-Fold](#13)
    * [Visualization](#14)
    * [Prediction](#15)




<a id='1'></a>
# Import Library

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno
import matplotlib.pyplot as plt
import seaborn as sns

import sys

# Silence every warning for the rest of the notebook, but only when no
# warning options were supplied to the interpreter (sys.warnoptions is empty).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

<a id='2'></a>
# Load Data

In [None]:
# Path of the dataset CSV inside the Kaggle input directory.
source = '/kaggle/input/fraud-detection-bank-dataset-20k-records-binary/fraud_detection_bank_dataset.csv'
# Read the whole file into the DataFrame that the later cells work on.
data = pd.read_csv(source)

<a id='3'></a>
# Data Analysis

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Summary statistics for the columns of the raw data.
data.describe()

<a id='4'></a>
### Missing Value

In [None]:
# Count the columns that contain at least one null entry.
has_null_per_column = data.isna().any(axis=0)
missing_count = has_null_per_column.sum()
print(f'Count of features with missing values: {missing_count}')

<a id='5'></a>
### Distribution

In [None]:
# Preview the last rows of the raw data.
data.tail()

In [None]:
# One small density plot per feature on a 7 x 16 grid (112 panels),
# panel k showing column 'col_k'.
fig, axes = plt.subplots(nrows=7, ncols=16, figsize=(28, 14))
axes = axes.flatten()

for col, ax in enumerate(axes):
    feature = f'col_{col}'
    sns.kdeplot(data=data, x=feature, fill=True, ax=ax)

    # Strip ticks and axis labels; the bold title is the only annotation.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')
    ax.set_title(feature, loc='center', weight='bold', fontsize=10)

plt.show()

#### Column List:
   * 8-9-10-11-12-18-19-20-21-35-51-52-53-70-71 
   
   
Each feature in the column list above holds a single constant value, so we drop these columns from the dataset.

In [None]:
# Feature numbers of the columns to remove; the names are built as col_<n>.
constant_ids = (8, 9, 10, 11, 12, 18, 19, 20, 21, 35, 51, 52, 53, 70, 71)
drop_list = [f'col_{n}' for n in constant_ids]

# Remove them from the frame in place.
data.drop(columns=drop_list, inplace=True)

In [None]:
# Relabel the columns so feature names are consecutive again:
# first column -> 'index', last column -> 'targets', and everything in
# between -> col_0, col_1, ... in positional order.
last_one = len(data.columns) - 1
col_list = ['index', *(f'col_{i}' for i in range(last_one - 1)), 'targets']

data = data.set_axis(col_list, axis=1)

In [None]:
# Preview the frame after the columns were dropped and relabelled.
data.head()

<a id='6'></a>
# Feature Engineering

<a id='7'></a>
### Analyzing

In [None]:
# Bar chart of every feature against the target class, one panel per feature,
# on a 14 x 7 grid (98 panels).
fig, axes = plt.subplots(14, 7, figsize=(20, 30))
axes = axes.flatten()

# Feature columns are the ones labelled col_<n>; taking them from the frame
# avoids hard-coding how many there are.
feature_cols = [name for name in data.columns if str(name).startswith('col_')]

# zip stops at the shorter sequence, so a surplus of panels or of features
# never raises.
for ax, y in zip(axes, feature_cols):
    sns.barplot(data=data,
                x='targets',
                y=y,
                palette='rocket',
                ax=ax)
    ax.set_xlabel('')

# Hide grid cells that have no feature to show instead of leaving empty frames.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

# plt.title() would only title the current (last) axes; suptitle labels the
# whole figure.  The rect keeps tight_layout from running into that title.
fig.suptitle('Features vs Targets')
fig.tight_layout(rect=(0, 0, 1, 0.98))
plt.show()


<a id='8'></a>
### Feature Extraction

In [None]:
# Binarise selected features against hand-picked cut-offs.
# Each rule is (feature number, cut-off, flag given when value > cut-off);
# values that are not above the cut-off get the opposite flag.  The result is
# stored in a new column named c<feature number>.  The order of the rules is
# the order in which the new columns are appended to the frame.
threshold_rules = [
    (0, 3, 0), (1, 200, 0), (2, 0.4, 0), (3, 2, 0), (5, 1, 1),
    (7, 3, 0), (9, 0.2, 0),

    (11, 0.4, 0), (13, 4, 0), (14, 30, 0), (15, 3, 0), (16, 2, 0),
    (17, 1, 0),

    (20, 1.5, 0), (21, 5, 0), (22, 0.2, 0), (23, 0.4, 0), (24, 0.05, 0),
    (25, 2, 0), (28, 150, 0),

    (31, 0.5, 0), (32, 5, 0), (33, 0.3, 0), (35, 0.05, 0),

    (41, 150, 0), (46, 0.2, 0), (47, 4, 0), (27, 0.4, 0), (48, 0.2, 0),

    (54, 30000, 0), (55, 30, 0), (57, 0.005, 1), (58, 0.05, 1),
    (59, 0.05, 1),

    (61, 0.1, 1), (62, 0.02, 1), (66, 0.01, 1), (67, 0.04, 1),
    (68, 0.1, 1), (69, 0.025, 1),

    (70, 0.05, 1), (71, 0.05, 1), (72, 0.025, 1), (73, 0.01, 1),
    (74, 0.1, 1), (78, 0.05, 1), (79, 0.02, 1),

    (83, 0.2, 0), (88, 0.005, 0),

    (91, 0.3, 0), (92, 0.2, 0), (94, 0.05, 1), (95, 0.04, 1),
]

for feat_no, cutoff, flag_above in threshold_rules:
    # A strict '>' comparison: anything that does not compare greater
    # (including NaN) falls into the "not above" branch.
    is_above = (data[f'col_{feat_no}'] > cutoff).to_numpy()
    data[f'c{feat_no}'] = np.where(is_above, flag_above, 1 - flag_above).astype('int64')



In [None]:
# Preview the frame with the new binary c<n> columns appended.
data.head()

<a id='9'></a>
# Modelling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the label and the 'index' column.
# 'index' is the first CSV column as relabelled earlier in the notebook --
# presumably the exported row number (TODO confirm).  An identifier like that
# must not be a predictor: it lets the models memorise row order instead of
# learning from the features.
X = data.drop(['index', 'targets'], axis=1)
# Label vector.
y = data.targets

In [None]:
# Hold out 30% of the rows for the final evaluation.  stratify=y keeps the
# class proportions equal in both parts, matching the stratified folds used
# during cross-validation; random_state fixes the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y)

<a id='10'></a>
## Scaling

In [None]:
#from sklearn.preprocessing import StandardScaler

In [None]:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

<a id='11'></a>
## Parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

        
# Hyper-parameter grids, one dict per classifier.

# k-NN: number of neighbours, 1 through 49.
knn_params = {'n_neighbors': np.arange(1, 50)}

# Gaussian naive Bayes: 100 smoothing values, log-spaced from 1 down to 1e-9.
nb_params = {'var_smoothing': np.logspace(0, -9, num=100)}

# Random forest.
rf_params = {'max_features': [1, 3, 10],
             'min_samples_split': [2, 3, 10],
             'min_samples_leaf': [1, 3, 10],
             'bootstrap': [False],
             'n_estimators': [100, 300],
             'criterion': ['gini']}

# Gradient boosting.  n_estimators used to list 100 twice, which made the
# grid search fit every such candidate a second time without adding any
# new configuration.
gb_params = {'learning_rate': [0.001, 0.01, 0.1, 0.05],
             'n_estimators': [100, 500],
             'max_depth': [3, 5, 10],
             'min_samples_split': [2, 5, 10]}

# Order: random forest, k-NN, naive Bayes, gradient boosting.
param = [rf_params, knn_params, nb_params, gb_params]

<a id='12'></a>
## Classifiers

In [None]:
# Base estimators to tune, in the same order as the parameter grids and the
# display names below.  The two randomised ensembles get a fixed seed so that
# repeated runs give the same scores (the train/test split is seeded with 42
# as well).
classifier = [RandomForestClassifier(random_state=42),
              KNeighborsClassifier(),
              GaussianNB(),
              GradientBoostingClassifier(random_state=42)
             ]

# Display names used when reporting results.
ml_list = ['Random Forest','KNN','Naive Bayes','GradientBoosting']

<a id='13'></a>
## Grid Search Cross Validation and K-Fold

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score


# Tune every classifier with an exhaustive grid search (5 stratified folds,
# ROC-AUC scoring) and remember its best score and best fitted estimator.
cv_results = []
best_estimators = []

for i, (estimator, grid) in enumerate(zip(classifier, param)):
    clf = GridSearchCV(estimator,
                       param_grid=[grid],
                       cv=StratifiedKFold(n_splits=5),
                       scoring='roc_auc',
                       n_jobs=-1,
                       verbose=1)
    clf.fit(X_train, y_train)

    cv_results.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print('Method: {}  Score: {} Best: {}' .format(estimator, cv_results[i], clf.best_estimator_))

# One row per model: best cross-validated score next to its display name.
results = pd.DataFrame({'CV Means': cv_results,
                        'ML Models': ml_list})

<a id='14'></a>
## Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bars: best cross-validated score per model.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them positionally.
g = sns.barplot(x='CV Means', y='ML Models', data=results)
g.set_title('ROC-AUC Score')
plt.show()

<a id='15'></a>
## Prediction

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of the four tuned models: class probabilities of the
# members are averaged and the most probable class is predicted.
voting_c = VotingClassifier(estimators=[('rf', best_estimators[0]),
                                        ('knn', best_estimators[1]),
                                        ('nb', best_estimators[2]),
                                        ('gb', best_estimators[3])
                                       ],
                            voting='soft',
                            n_jobs=-1)

voting_c = voting_c.fit(X_train, y_train)

# sklearn metrics take (y_true, y_pred) in that order.
my_score = accuracy_score(y_test, voting_c.predict(X_test))
print(my_score)

# Report ROC-AUC too, since that is the metric the models were selected on.
# Assumes a binary target whose positive class is the second entry of
# voting_c.classes_ -- TODO confirm.
my_auc = roc_auc_score(y_test, voting_c.predict_proba(X_test)[:, 1])
print(f'ROC-AUC: {my_auc}')