# The Dataset contains transactions made by credit cards.

In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
# Default (width, height) in inches for every figure that does not set its own.
rcParams['figure.figsize'] = (14, 8)
# Seed value for the stochastic steps of the notebook.
RANDOM_SEED = 42
# Display names for Class == 0 and Class == 1, in that order.
LABELS = [
    "Nonfraudulent",
    "Fraudulent",
]

> Reading the data.

In [None]:
# Load the comma-separated transactions file into a DataFrame and preview
# the first rows.
csv_path = '../input/credit-card-anomaly-detection/creditcard.csv'
data = pd.read_csv(csv_path, sep=',')
data.head()

Checking to see the number of rows and columns.

In [None]:
# Report the size of the loaded frame.
n_rows, n_cols = data.shape
print("Credit Card Fraud Detection data -  rows:", n_rows, " columns:", n_cols)

Summary statistics (Mean,median,standard deviation) of the data.

In [None]:
# Per-column count, mean, standard deviation, min, quartiles and max.
data.describe()

> From the above, I can see that there are 284,807 transactions.

In [None]:
# Column names, non-null counts and dtypes, plus the frame's memory usage.
data.info()

> Checking to see if there are any null values in the dataset.

In [None]:
# True if at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

> There are no null or missing values in the dataset.

# Frequency of normal and fraudulent transactions using a bar chart.

In [None]:
# Number of transactions per class. The counts are ordered by class value
# (0, then 1) so that bar positions always line up with LABELS, whichever
# class happens to be more frequent. Series.value_counts is used because the
# top-level pd.value_counts function is deprecated.
count_classes = data['Class'].value_counts().sort_index()

# Draw on the Axes returned by pandas instead of relying on pyplot's
# implicit "current axes" state.
ax = count_classes.plot(kind='bar', rot=0)
ax.set_title("Transaction Class Distribution")
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency")
plt.show()

Checking to see the total number of Fraudulent and Nonfraudulent transactions.

In [None]:
# Split the transactions into the two classes: Class 1 rows are fraud,
# Class 0 rows are not.
class_column = data['Class']
fraudulent = data.loc[class_column == 1]
nonfraudulent = data.loc[class_column == 0]

# Show the (rows, columns) shape of each subset.
print(f'fraudulent{fraudulent.shape}  nonfraudulent{nonfraudulent.shape}')

> From the above, we can see that the fraudulent transactions are 492 and the non fraudulent are 284,315.

Checking the summary statistics of the Fraudulent transactions.

In [None]:
# Summary statistics of the Amount column for the fraudulent subset.
fraudulent['Amount'].describe()

Judging by the mean, fraudulent transactions involve larger amounts on average than nonfraudulent ones (compare with the summary that follows).

Checking the summary statistics of the NonFraudulent transactions.

In [None]:
# Summary statistics of the Amount column for the nonfraudulent subset.
nonfraudulent['Amount'].describe()

Checking to see how the transactions are occurring in respect to amount.

In [None]:
# Histogram of transaction amounts, one panel per class, on a shared x-axis.
#
# Each panel is configured through its own Axes object. The pyplot helpers
# (plt.ylabel, plt.yscale, ...) only act on the current axes, which after
# plt.subplots is the bottom panel; using them left the fraud panel without
# a y-label and on a linear count scale, so the two panels were not comparable.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')
bins = 50
for ax, amounts, title in ((ax1, fraudulent.Amount, 'Fraud'),
                           (ax2, nonfraudulent.Amount, 'Nonfraudulent')):
    ax.hist(amounts, bins=bins)
    ax.set_title(title)
    ax.set_ylabel('Number of Transactions')
    # Log scale keeps sparsely populated bins visible next to very full ones.
    ax.set_yscale('log')
# sharex=True propagates the x-limits to both panels; only the bottom panel
# carries the x-axis label.
ax2.set_xlabel('Amount ($)')
ax2.set_xlim((0, 20000))
plt.show()

# Scatter plot showing transaction occurrence versus Time

In [None]:
# Do fraudulent transactions occur more often during certain time frames?
# Plot Amount against Time for each class to find out visually.

fig, (ax_fraud, ax_normal) = plt.subplots(2, 1, sharex=True)
fig.suptitle('Time of transaction vs Amount by class')

ax_fraud.scatter(fraudulent['Time'], fraudulent['Amount'])
ax_fraud.set_title('Fraud')

ax_normal.scatter(nonfraudulent['Time'], nonfraudulent['Amount'])
ax_normal.set_title('Normal')

# The axis labels go on the bottom panel only (the x-axis is shared).
ax_normal.set_xlabel('Time (in Seconds)')
ax_normal.set_ylabel('Amount')
plt.show()

# Boxplots showing summary statistics for the Amount column

In [None]:
# Amount per class as side-by-side boxplots: the left panel hides the
# outlier points (fliers), the right panel shows them.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
for axis, with_fliers in ((ax1, False), (ax2, True)):
    s = sns.boxplot(
        ax=axis,
        x="Class",
        y="Amount",
        hue="Class",
        data=data,
        palette="PRGn",
        showfliers=with_fliers,
    )
plt.show()

# Correlation matrix that contains a heatmap showing the relationship between the different variables.

In [None]:
# Pairwise correlation between all columns, drawn as a heatmap.
corr = data.corr()
feature_names = corr.columns

plt.figure(figsize=(14, 14))
plt.title('Credit Card Transactions features correlation plot')
sns.heatmap(
    corr,
    xticklabels=feature_names,
    yticklabels=feature_names,
    linewidths=.1,
    cmap="Blues",
)
plt.show()

In the heatmap above, most features do not correlate with one another, but some pairs show either a positive or a negative correlation. For example, 'V20' and Amount are correlated to some degree.

# Scatterplot for Amount and V2 showing a line of best fit using the equation of a straight line y = mx + c, where m is the slope of the line and c is the y intercept.

In [None]:
# Amount against V2 with one fitted regression line per class; the small
# marker size keeps the dense scatter readable.
s = sns.lmplot(
    data=data,
    x='V2',
    y='Amount',
    hue='Class',
    fit_reg=True,
    scatter_kws={'s': 2},
)
plt.show()

# Building an outlier detection model for the data using the Isolation Forest and the Local Outlier Factor

In [None]:
## Checking Fraud and the normal dataset
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]

# Share of fraudulent rows among ALL transactions. This value is passed to the
# detectors as `contamination`, which scikit-learn defines as the proportion
# of outliers in the data set, so the denominator is the full frame and not
# only the valid rows (fraud / valid is a ratio, not a proportion).
outlier_fraction = len(Fraud) / len(data)

In [None]:
# Report the contamination estimate and the size of each class.
n_fraud_cases = len(Fraud)
n_valid_cases = len(Valid)

print(outlier_fraction)
print("Fraud Cases : {}".format(n_fraud_cases))
print("Valid Cases : {}".format(n_valid_cases))

In [None]:
# Create independent and dependent features.

# Name of the column we are predicting.
target = "Class"

# Every column except the target is used as a model input.
columns = [c for c in data.columns.tolist() if c != target]

# Shared random state, seeded from the notebook-wide RANDOM_SEED constant
# rather than a second hard-coded copy of the same number.
state = np.random.RandomState(RANDOM_SEED)
X = data[columns]
Y = data[target]
# NOTE(review): X_outliers is not read by any cell visible in this notebook,
# but drawing it advances `state`, which is later handed to IsolationForest as
# its random_state. Deleting this line would therefore change the forest's
# random stream and the reported numbers -- confirm before removing it.
X_outliers = state.uniform(low=0, high=1, size=X.shape)

# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

In [None]:
# The two unsupervised outlier detectors compared below, keyed by display
# name. Both are told to expect `outlier_fraction` of the rows to be anomalies.
isolation_forest = IsolationForest(
    n_estimators=100,
    max_samples=len(X),  # every tree is built from as many samples as there are rows
    contamination=outlier_fraction,
    random_state=state,
    verbose=0,
)
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,  # Minkowski distance with p=2
    metric_params=None,
    contamination=outlier_fraction,
)
classifiers = {
    "Isolation Forest": isolation_forest,
    "Local Outlier Factor": local_outlier_factor,
}
type(classifiers)

# Analyzing the models using Errors, Confusion Matrix, Accuracy Score and Classification Report

In [None]:
# Fit each detector on X, convert its predictions to the 0/1 encoding of the
# Class column, and report errors, accuracy, the classification report and a
# confusion-matrix heatmap.
n_outliers = len(Fraud)
for clf_name, clf in classifiers.items():
    # Fit the data and tag outliers. Both estimators label inliers as 1 and
    # outliers as -1.
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    elif clf_name == "Isolation Forest":
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    else:
        # Unknown detector: there are no predictions to score, so skip it
        # instead of falling through with an undefined (or stale) y_pred.
        print('No other model')
        continue

    # Reshape the prediction values to 0 for Valid transactions, 1 for Fraud
    # transactions, in a single pass.
    y_pred = np.where(y_pred == -1, 1, 0)
    n_errors = (y_pred != Y).sum()

    # Run classification metrics.
    print("{}: {}".format(clf_name, n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred))

    # Rows are the actual class, columns the predicted class.
    conf_matrix = confusion_matrix(Y, y_pred)
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt='d')
    plt.title('Confusion Matrix for ' + clf_name)
    plt.ylabel('Actual class')
    plt.xlabel('Predicted class')
    plt.show()

# Analysis

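* The Isolation Forest model made 675 errors, while the Local Outlier Factor made more: its 99.67% accuracy on 284,807 transactions corresponds to roughly 940 errors, not 675 (TODO: confirm the exact count from the output above).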
* Isolation Forest has a higher accuracy (99.76%) than Local Outlier Factor (99.67%).
* The Isolation Forest Method performed much better in determining the fraud cases.
* To improve on the accuracy, the sample size can be increased.
* We can also use complex anomaly detection models to get better accuracy in determining more fraudulent cases