In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input mount and echo the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://www.debt.org/wp-content/uploads/2012/12/Credit-Card.gif)

# Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset

In [1]:
# Read the credit-card transactions CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("../input/creditcardfraud/creditcard.csv")

In [1]:
data.head()  # Display the first 5 rows as a quick sanity check of the load

In [1]:
data['Class'].value_counts()  # Count how many rows carry each distinct value of the Class column

In [1]:
data.columns  # List every column name in the dataset

In [1]:
data.info()  # Summarise the dataset: non-null count and data type of every column

In [1]:
data.isnull().sum()  # Total number of null values in each column

In [1]:
data[data.duplicated()]  # Display every row that repeats an earlier row exactly

In [1]:
data.duplicated().sum()  # Total number of duplicate rows in the dataset

In [1]:
data = data.drop_duplicates(keep='first')  # Keep the first occurrence of each row and drop the later duplicates

In [1]:
data.duplicated().sum()  # Re-check whether any duplicate rows are still present

In [1]:
# Drop the "Time" column, which this model does not use as a feature.
# errors="ignore" keeps the cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
data = data.drop(columns="Time", errors="ignore")

In [1]:
data.describe()  # Show count, mean, std, min, max and quartiles for each numeric column

# Feature selection

In [1]:
# Pairwise correlation between the feature columns only (the Class label is excluded),
# drawn as an annotated heatmap on a wide canvas so the cell values stay readable.
cor = data.drop(columns="Class").corr()
fig, ax = plt.subplots(figsize=(29, 13))
sns.heatmap(cor, annot=True, ax=ax)
plt.show()

In [1]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is flagged when the absolute correlation
    between it and any column positioned before it is strictly greater than
    ``threshold``. The first column can therefore never be flagged.

    Parameters
    ----------
    dataset : DataFrame-like object exposing ``.corr()``.
    threshold : number compared against the absolute correlation.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for row, name in enumerate(corr_matrix.columns):
        # Only the lower triangle is inspected: columns located before `name`.
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row)):
            flagged.add(name)
    return flagged

In [1]:
# Flag feature columns (label excluded) whose absolute correlation with an
# earlier column exceeds 0.8.
corr_fea = correlation(dataset=data.drop(columns="Class"), threshold=0.8)
print(corr_fea)

**We use a correlation threshold of 0.8 to check whether any feature columns are strongly correlated with each other; if two columns are, one of them can be dropped.**

# Define the features and the target: x holds the independent variables and y is the dependent variable

In [1]:
# Target vector: the Class label. Feature matrix: every remaining column.
y = data["Class"]
x = data.drop(columns="Class")

# Splitting the dataset into the Training set and Test set

In [1]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible. stratify=y keeps the share of each Class value the same
# in the training and test sets, which matters when one class is rare (as fraud
# labels typically are) and a purely random split could skew it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

In [1]:
# Print the shape of each split, in the order: train features, test features,
# train labels, test labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

# Logistic Regression

**Training the Logistic Regression model on the Training set**

In [1]:
from sklearn.linear_model import LogisticRegression

In [1]:
# Logistic-regression classifier using the liblinear solver; random_state is fixed at 0.
classifier = LogisticRegression(solver='liblinear',random_state = 0)

In [1]:
# Fit the logistic-regression model on the training features and labels.
classifier.fit(x_train,y_train)

# Predicting the Test set results

In [1]:
# Predict a class label for every row of the held-out test set.
log_y_pred = classifier.predict(x_test)

In [1]:
# Show the predicted test-set labels produced by the logistic-regression model.
print(log_y_pred)

# Making the Confusion Matrix

In [1]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for logistic regression: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=log_y_pred)
print(cm)

# Computing the accuracy with k-Fold Cross Validation

In [1]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current `classifier` on the training split only;
# the mean of the ten fold scores is reported as a percentage.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print(f"Accuracy: {accuracies.mean()*100:.2f} %")

In [1]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose logistic-regression prediction matches the true label.
log_acc_score = accuracy_score(y_true=y_test, y_pred=log_y_pred)
print(log_acc_score)

# Training the K-NN model on the Training set

In [1]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with k=5; the Minkowski metric with p=2 is the Euclidean distance.
# Rebinding `classifier` replaces the logistic-regression model fitted earlier.
# NOTE(review): no feature-scaling step appears before this cell, so columns with a
# larger numeric range will dominate the distance — confirm this is intended.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(x_train, y_train)

In [1]:
# Predict test-set labels with the fitted K-NN model.
knn_y_pred = classifier.predict(x_test)

# Making the Confusion Matrix

In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix for the K-NN predictions (true labels first, predictions second).
cm = confusion_matrix(y_true=y_test, y_pred=knn_y_pred)
print(cm)


In [1]:
# Draw the most recently computed confusion matrix `cm` as an annotated heatmap
# with integer cell labels and no colour bar.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Greens, cbar=False, ax=ax)
ax.set_title('Confusion matrix on test data')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [1]:
# Test-set accuracy of the K-NN model.
acc_score = accuracy_score(y_true=y_test, y_pred=knn_y_pred)
print(f"Accuracy score by KNN model is: {acc_score}")

# Training the Kernel SVM model on the Training set

In [1]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel; random_state is fixed at 0.
# Rebinding `classifier` replaces the K-NN model fitted earlier.
# NOTE(review): kernel-SVM training time grows quickly with the number of rows —
# consider timing this cell (%%time) if the full dataset is used.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results

In [1]:
# Predict test-set labels with the fitted kernel-SVM model.
svm_y_pred = classifier.predict(x_test)

# confusion matrix

In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix for the kernel-SVM predictions on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=svm_y_pred)
print(cm)

In [1]:
# Test-set accuracy of the kernel-SVM model.
acc_score_svm = accuracy_score(y_test, svm_y_pred)
# The message previously said "KNN" (copy-paste slip); this score belongs to the SVM.
print(f"Accuracy score by SVM model is: {acc_score_svm}")

# Training the Naive Bayes model on the Training set

In [1]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the training split.
# Rebinding `classifier` replaces the SVM model fitted earlier.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predicting the Test set results

In [1]:
# Predict test-set labels with the fitted Naive Bayes model and show them.
naive_y_pred = classifier.predict(x_test)
print(naive_y_pred)

# Confusion matrix and accuracy

In [1]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix for the Naive Bayes predictions on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=naive_y_pred)
print(cm)

In [1]:
# Test-set accuracy of the Naive Bayes model.
acc_score_naive = accuracy_score(y_test, naive_y_pred)
# The message previously said "KNN" (copy-paste slip); this score belongs to Naive Bayes.
print(f"Accuracy score by Naive Bayes model is: {acc_score_naive}")

# Training the Decision Tree Regression model on the Training set

In [1]:
from sklearn.tree import DecisionTreeRegressor
# Decision-tree regressor with a fixed random_state, fitted on the training split.
# NOTE(review): the target passed here is the Class label, so a regression model is
# being fitted to class labels; its predictions are real-valued rather than class
# labels. Consider DecisionTreeClassifier if hard labels are intended.
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(x_train,y_train)

In [1]:
# A regressor returns real-valued outputs, while the metrics computed on this
# variable downstream (confusion matrix, accuracy) require discrete class labels
# and raise on continuous values. Threshold at 0.5 to obtain hard labels.
# Assumes Class is encoded as 0/1 — TODO confirm against the dataset.
dtr_y_pred = (regressor.predict(x_test) >= 0.5).astype(int)

In [1]:
# Confusion matrix for the decision-tree predictions. This previously used
# naive_y_pred (copy-paste slip), which simply repeated the Naive Bayes matrix.
dtr_cm = confusion_matrix(y_test, dtr_y_pred)
print(dtr_cm)

In [1]:
# Test-set accuracy computed from the decision-tree predictions.
dtr_acc = accuracy_score(y_true=y_test, y_pred=dtr_y_pred)
print(f"Accuracy score by DTR is: {dtr_acc}")

# Bar chart comparing the accuracy of all the models applied to this dataset

In [1]:
# Accuracy scores (mylist) and their model names (mylist2), kept in matching order
# so that position i in one list corresponds to position i in the other.
mylist = [log_acc_score, acc_score, acc_score_svm, acc_score_naive, dtr_acc]
mylist2 = ["Logistic Regression", "KNN", "SVM", "Naive Bayes", "DTR"]

In [1]:
# Bar chart comparing the test-set accuracy of every model, one bar per model,
# with each bar annotated by its accuracy as a percentage.
plt.rcParams['figure.figsize']=8,6
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette = "rocket", saturation =1.5)
plt.xlabel("Regressor Models", fontsize = 20 )
plt.ylabel("Accuracy", fontsize = 20)
# Title typo fixed ("Regreesor" -> "Regressor").
plt.title("Accuracy of different Regressor Models", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    # Use dedicated names for the bar origin: the original unpacked into `x` and `y`,
    # which overwrote the notebook-level feature matrix and target, so re-running
    # the train/test split cell after this one would fail.
    bar_x, bar_y = bar.get_xy()
    # Place the label centred just above the top of the bar.
    ax.annotate(f'{bar_height:.2%}', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

![](https://c.tenor.com/eds_JFXceWoAAAAC/aww-sweet.gif)