In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# in the order os.walk yields them, then print one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams.update({'figure.figsize': (14, 8)})

In [None]:
# Load the credit-card transactions CSV from the Kaggle input directory.
csv_path = '../input/creditcardfraud/creditcard.csv'
dataset = pd.read_csv(csv_path)

In [None]:
#calling the first few rows
dataset.head()

In [None]:
#to understand the shape of the dataset
dataset.shape

In [None]:
#to gain additional information about the data
dataset.info()

In [None]:
#cheching for null values
dataset.isnull().values.any()

In [None]:
# Bar chart of the number of transactions per class (0 = normal, 1 = fraud).
# Series.value_counts() replaces the deprecated top-level pd.value_counts(),
# and sorting by class label (rather than by frequency) guarantees the bars
# line up with LABELS no matter which class happens to be larger.
LABELS = ["Normal", "Fraud"]
count_classes = dataset['Class'].value_counts().sort_index()
ax = count_classes.plot(kind='bar', rot=0)
ax.set_title("Transaction Class Distribution")
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel("Frequency");

In [None]:
## Get the Fraud and the normal dataset 
fraud = dataset[dataset['Class']==1]
normal = dataset[dataset['Class']==0]

In [None]:
print(fraud.shape,normal.shape)

In [None]:
## We need to analyze more amount of information from the transaction data
#How different are the amount of money used in different transaction classes?
fraud.Amount.describe()

In [None]:
normal.Amount.describe()

In [None]:
# Histogram of transaction amount per class, one panel per class.
# The log y-scale and the y-label are set on BOTH axes: plt.yscale()/plt.ylabel()
# only touch the current (bottom) axes, which previously left the fraud panel
# on a linear scale and made the two panels impossible to compare.
bins = 50
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')
for ax, amounts, title in ((ax1, fraud.Amount, 'Fraud'), (ax2, normal.Amount, 'Normal')):
    ax.hist(amounts, bins=bins)
    ax.set_title(title)
    ax.set_ylabel('Number of Transactions')
    ax.set_yscale('log')
# The x-axis is shared, so the label and limits set on the bottom panel apply to both.
ax2.set_xlabel('Amount ($)')
ax2.set_xlim((0, 20000))
plt.show()

In [None]:
# Do fraudulent transactions occur more often during certain time frames?
# Plot Amount against Time for each class, one panel per class, sharing the x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(fraud['Time'], fraud['Amount'])
ax1.set_title('Fraud')

ax2.scatter(normal['Time'], normal['Amount'])
ax2.set_title('Normal')

# Axis labels go on the bottom panel only (the axes pyplot treats as current).
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

In [None]:
# Ratio of fraud rows to normal rows.
# NOTE(review): this is relative to the normal count, not to the whole dataset.
outlier_fraction = fraud.shape[0] / normal.shape[0]
print(outlier_fraction)

In [None]:
## Correlation
import seaborn as sns
# Pairwise correlation between all columns, drawn as an annotated heat map
# on a 20x20-inch figure.
corr_matrix = dataset.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn", ax=ax)


In [None]:
#standardizing the dataset
from sklearn.preprocessing import StandardScaler

dataset['Amount'] = StandardScaler().fit_transform(dataset['Amount'].values.reshape(-1,1))
dataset['Time'] = StandardScaler().fit_transform(dataset['Time'].values.reshape(-1,1))
dataset.head()

In [None]:
#splitting the dataset into train and test 
from sklearn.model_selection import train_test_split
X = dataset.drop("Class", axis=1)
y = dataset["Class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [None]:
#correcting the imbalanced training set using SMOTE
from imblearn.over_sampling import SMOTE
# SMOTE generates synthetic samples at random; a fixed random_state makes the
# resampled data (and every model trained on it) reproducible across runs.
# Only the training partition is resampled; the test set keeps its original
# class distribution.
oversample = SMOTE(random_state=27)
X_smote, y_smote = oversample.fit_resample(X_train, y_train)

In [None]:
# Class counts before (y_train) and after (y_smote) SMOTE oversampling,
# printed in that order.
for label_series in (y_train, y_smote):
    print(label_series.value_counts())

In [None]:
# Logistic Regression

In [None]:
# Logistic regression trained on the SMOTE-balanced training data and
# evaluated on the untouched test split.
from sklearn.metrics import confusion_matrix, classification_report,f1_score,recall_score,accuracy_score

from sklearn.linear_model import LogisticRegression
# max_iter is raised above the library default of 100 so the solver has room
# to converge instead of stopping early with a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
#modelling
model.fit(X_smote, y_smote)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# accuracy_score returns a fraction in [0, 1]; multiply by 100 so the number
# printed actually matches the "%" suffix.
print("Accuracy: %.3f%%" % (accuracy * 100))

In [None]:
# Report, in order: confusion matrix, classification report, recall and F1
# for the logistic-regression predictions on the test split.
for metric_fn in (confusion_matrix, classification_report, recall_score, f1_score):
    print(metric_fn(y_test, y_pred))

In [None]:
#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees; the fixed random_state keeps the fitted forest
# reproducible and verbose=0 silences training progress output.
rf_params = dict(n_estimators=100, random_state=27, verbose=0)
model2 = RandomForestClassifier(**rf_params)

In [None]:
#modelling
# Fit the random forest on the SMOTE-balanced training data, then score the
# untouched test split.
model2.fit(X_smote, y_smote)
y_pred2 = model2.predict(X_test)

# confusion_matrix
print(confusion_matrix(y_test,y_pred2))
# classification_report
print(classification_report(y_test, y_pred2))
# recall_score 
print(recall_score(y_test, y_pred2))
# f1_score
print(f1_score(y_test, y_pred2))
accuracy = accuracy_score(y_test, y_pred2)
# accuracy_score returns a fraction in [0, 1]; multiply by 100 so the number
# printed actually matches the "%" suffix.
print("Accuracy: %.3f%%" % (accuracy * 100))

In [None]:
#Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model3=DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
model3.fit(X_smote, y_smote)

In [None]:
y_pred3 = model3.predict(X_test)

In [None]:
# Evaluation of the decision-tree predictions on the test split.
# confusion_matrix
print(confusion_matrix(y_test,y_pred3))
# classification_report
print(classification_report(y_test, y_pred3))
# recall_score 
print(recall_score(y_test, y_pred3))
# f1_score
print(f1_score(y_test, y_pred3))
accuracy = accuracy_score(y_test, y_pred3)
# accuracy_score returns a fraction in [0, 1]; multiply by 100 so the number
# printed actually matches the "%" suffix.
print("Accuracy: %.3f%%" % (accuracy * 100))

In [None]:
#Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian naive Bayes on the SMOTE-balanced training data (fit() returns
# the estimator), then predict class labels for the test split.
model4 = GaussianNB().fit(X_smote, y_smote)
y_nb_pred = model4.predict(X_test)

In [None]:
# Evaluation of the Gaussian naive Bayes predictions on the test split.
# confusion_matrix
print(confusion_matrix(y_test,y_nb_pred))
# classification_report
print(classification_report(y_test, y_nb_pred))
# recall_score 
print(recall_score(y_test, y_nb_pred))
# f1_score
print(f1_score(y_test, y_nb_pred))
accuracy = accuracy_score(y_test, y_nb_pred)
# accuracy_score returns a fraction in [0, 1]; multiply by 100 so the number
# printed actually matches the "%" suffix.
print("Accuracy: %.3f%%" % (accuracy * 100))

In [None]:
#Correcting imbalanced dataset using RandomUnderSampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Balance the training set by randomly discarding majority-class rows.
# A fixed random_state makes the retained subset, and the model trained on it,
# reproducible across runs. Only the training partition is resampled.
ranUnSample = RandomUnderSampler(random_state=27)
X_ranUnSample, y_ranUnSample = ranUnSample.fit_resample(X_train, y_train)

In [None]:
# Class counts before (y_train) and after (y_ranUnSample) random
# undersampling, printed in that order.
for label_series in (y_train, y_ranUnSample):
    print(label_series.value_counts())

In [None]:
#Support Vector Machine
from sklearn import svm
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X_ranUnSample, y_ranUnSample)

In [None]:
y_svm_pred = svm_classifier.predict(X_test) #And finally, we predict our data test.

In [None]:
# Evaluation of the SVM predictions on the test split.
# confusion_matrix
print(confusion_matrix(y_test,y_svm_pred))
# classification_report
print(classification_report(y_test,y_svm_pred))
# recall_score 
print(recall_score(y_test,y_svm_pred))
# f1_score
print(f1_score(y_test,y_svm_pred))
accuracy = accuracy_score(y_test, y_svm_pred)
# accuracy_score returns a fraction in [0, 1]; multiply by 100 so the number
# printed actually matches the "%" suffix.
print("Accuracy: %.3f%%" % (accuracy * 100))