In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import Libraries

In [None]:
import matplotlib 
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)

# 2. Import Data

In [None]:
# Read the fraud-detection CSV from the Kaggle input directory and preview its first rows.
DATA_PATH = '/kaggle/input/fraud-detection-bank-dataset-20k-records-binary/fraud_detection_bank_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# 3. Data cleaning and visualisation

In [None]:
# Report the number of missing values per column, one "name: count" line each.
for column, n_missing in df.isna().sum().items():
    print(column + ": " + str(n_missing))

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index written out with the CSV — confirm).
# `errors='ignore'` keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
# Summary statistics of the remaining columns.
df.describe()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heat map.
corr = df.corr()
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(corr, annot=True, cmap='rocket', ax=ax)

In [None]:
# Split the frame into the feature matrix and the label vector.
x = df.drop(columns='targets')
# Select the label by name, matching the drop above, instead of by position:
# `iloc[:, -1]` silently returns a feature if 'targets' is not the last column.
y = df['targets']

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column scaling statistics from the features, then apply them to get
# the standardised feature matrix.
std = StandardScaler().fit(x)
x_std = std.transform(x)

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components.
# PCA is driven by variance, so it has to be fit on `x_std`: fitting on the raw
# `x` lets the largest-scale columns dominate the components and left the
# scaler output from the previous cell unused.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_std)
x_pca.shape


In [None]:
# Scatter plot of the two PCA components, coloured by the class label.
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0], x_pca[:,1], c=df['targets'])
# Axis labels corrected from "principle" to "principal".
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
# A title lets the figure stand on its own; the trailing ';' hides the Text repr.
plt.title('PCA projection coloured by target');

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the PCA-projected samples as a test split (fixed seed).
# NOTE(review): the split is not stratified on `y`; consider `stratify=y` if the
# classes are imbalanced — confirm against the class counts.
x_train, x_test, y_train, y_test = train_test_split(
    x_pca, y, test_size=0.20, random_state=42
)

print(f'Shape of Training Xs:{x_train.shape}')
print(f'shape of Test:{x_test.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: fit a default logistic regression on the training split, then
# record its mean accuracy and its predicted labels on the held-out split.
clf = LogisticRegression()
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
y_predicted = clf.predict(x_test)

In [None]:
# Show the logistic-regression score computed on the test split in the previous cell.
print(score)

In [None]:
from sklearn.metrics import confusion_matrix

# Print arrays with two decimals, then tabulate true vs. predicted test labels.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, y_predicted)
cnf_matrix

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map on the
    current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`, in the same order.
    normalize : bool, default False
        If True, every row is divided by its sum so cells show per-row
        fractions instead of raw counts.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row sum; leave that row at 0
        # instead of dividing by zero and filling the plot with NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Class labels used as tick labels for the confusion-matrix plots.
# `confusion_matrix` orders its rows/columns by sorted label value, whereas
# `value_counts()` orders by frequency; sort by label so the tick labels are
# guaranteed to line up with the matrix regardless of which class is larger.
classes = df["targets"].value_counts().sort_index()
classes.index = [str(label) for label in classes.index]

In [None]:
# Draw the confusion matrix twice, each on its own figure:
# raw counts first, then the row-normalised version.
for use_norm, fig_title in (
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=classes.index,
                          normalize=use_norm, title=fig_title)

plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
#Randomized Search CV


# Candidate values for the decision-tree hyper-parameter search.

# Split-quality criterion
criterion = ['gini', 'entropy']
# Strategy used to choose the split at each node
splitter = ["best", "random"]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3 (every candidate using it fails to fit), so list distinct,
# still-supported options instead.
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in the tree: 5, 10, ..., 40.
# Built with range() so the values are exact ints rather than truncated floats.
max_depth = list(range(5, 41, 5))
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 25, 50, 75, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 6, 10, 15]

In [None]:
# Bundle the candidate lists into the parameter mapping handed to the search,
# then echo it for reference.
random_grid = dict(
    criterion=criterion,
    splitter=splitter,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Randomised hyper-parameter search for a decision tree: 50 sampled candidates,
# each evaluated with 3-fold cross-validation on the training split.
# The estimator is seeded so repeated runs build the same trees.
clf = DecisionTreeClassifier(random_state=42)
# Score with a classification metric. 'neg_mean_squared_error' is a regression
# scorer; 'balanced_accuracy' averages per-class recall, so it is not dominated
# by the majority class.
rf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid,
                               scoring='balanced_accuracy', n_iter=50, cv=3,
                               verbose=2, random_state=42, n_jobs=1)
rf_random.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score in the search fitted above.
rf_random.best_params_

In [None]:
# Fit a decision tree with a fixed set of hyper-parameters and evaluate it on
# the test split.
# NOTE(review): these values are hard-coded — presumably copied from an earlier
# `rf_random.best_params_` output; confirm they still match the current search.
# `random_state` is pinned because max_features='sqrt' makes the tree consider a
# random subset of features at each split, so an unseeded fit varies run to run.
rf = DecisionTreeClassifier(splitter='best', min_samples_split=25, min_samples_leaf=1,
                            max_features='sqrt', max_depth=5, criterion='entropy',
                            random_state=42)

rf.fit(x_train, y_train)
score = rf.score(x_test, y_test)
y_predicted = rf.predict(x_test)
print(score)

In [None]:
# Print arrays with two decimals, then tabulate true vs. predicted test labels
# for the most recent predictions.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, y_predicted)
cnf_matrix

In [None]:
import itertools

# NOTE(review): this cell re-defines `plot_confusion_matrix`, which an earlier
# cell already defines; the later definition shadows the earlier one. Consider
# removing one of the two so there is a single source of truth.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map on the
    current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`, in the same order.
    normalize : bool, default False
        If True, every row is divided by its sum so cells show per-row
        fractions instead of raw counts.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row sum; leave that row at 0
        # instead of dividing by zero and filling the plot with NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Plot the current confusion matrix on two separate figures:
# raw counts first, then the row-normalised version.
for use_norm, fig_title in (
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=classes.index,
                          normalize=use_norm, title=fig_title)

plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
#Randomized Search CV

# Candidate values for the random-forest hyper-parameter search.

# Number of trees in the forest: 100, 200, ..., 1500.
# Built with range() so the values are exact ints rather than truncated floats.
n_estimators = list(range(100, 1501, 100))
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3 (every candidate using it fails to fit), so list distinct,
# still-supported options instead.
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in each tree: 5, 10, ..., 50
max_depth = list(range(5, 51, 5))
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 25, 50, 75, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 7, 10, 14]

In [None]:
# Bundle the candidate lists into the parameter mapping handed to the search,
# then echo it for reference.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Randomised hyper-parameter search for a random forest: 50 sampled candidates,
# each evaluated with 3-fold cross-validation on the training split.
# The estimator is seeded so repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)
# Score with a classification metric. 'neg_mean_squared_error' is a regression
# scorer; 'balanced_accuracy' averages per-class recall, so it is not dominated
# by the majority class.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               scoring='balanced_accuracy', n_iter=50, cv=3,
                               verbose=2, random_state=42)

rf_random.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score in the search fitted above.
rf_random.best_params_

In [None]:
# Fit a random forest with a fixed set of hyper-parameters and report its
# accuracy on the held-out test split.
# NOTE(review): these values are hard-coded — presumably copied from an earlier
# `rf_random.best_params_` output; confirm they still match the current search.
# max_features='sqrt' replaces 'auto', which meant the same thing for
# classifiers but was removed in scikit-learn 1.3. `random_state` makes the
# bootstrap and feature sampling repeatable.
rf_best_params = RandomForestClassifier(n_estimators=1000, min_samples_split=25,
                                        min_samples_leaf=2, max_features='sqrt',
                                        max_depth=5, random_state=42)

rf_best_params.fit(x_train, y_train)
# Evaluate on the test split, like the earlier models; scoring on the training
# data the model was fit on overstates performance.
score = rf_best_params.score(x_test, y_test)
print(score)