In [4]:
import sys
import numpy
import pandas
import matplotlib
import seaborn
import scipy
import sklearn

## Report the interpreter and library versions this notebook runs against
version_report = (
    ('Python: {}', sys.version),
    ('Numpy: {}', numpy.__version__),
    ('Pandas: {}', pandas.__version__),
    ('Matplotlib: {}', matplotlib.__version__),
    ('Seaborn: {}', seaborn.__version__),
    ('Scipy: {}', scipy.__version__),
    ('Sklearn: {}', sklearn.__version__),
)
# One output line per entry, in the order listed above
for line_template, version_string in version_report:
    print(line_template.format(version_string))

Python: 3.7.4 (default, Sep  7 2019, 18:27:02) 
[Clang 10.0.1 (clang-1001.0.46.4)]
Numpy: 1.16.0
Pandas: 0.24.0
Matplotlib: 3.0.2
Seaborn: 0.9.0
Scipy: 1.2.0
Sklearn: 0.20.2


In [5]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the credit card transactions CSV into a DataFrame with pandas.
# NOTE(review): the recorded run of this cell ended with "HTTP Error 404: Not Found"
# for this URL — confirm where the dataset is hosted before relying on this cell.
dataset = pd.read_csv(
    filepath_or_buffer='https://s3-us-west-2.amazonaws.com/strikedataset/creditcard.csv'
)

HTTPError: HTTP Error 404: Not Found

In [None]:
# Report the dataset dimensions as a (total_records_count, total_columns_count) tuple
record_count, column_count = dataset.shape
print((record_count, column_count))

In [None]:
# List the column labels of the dataset
column_labels = dataset.columns
print(column_labels)

In [None]:
# Summary statistics of the dataset, as produced by DataFrame.describe()
summary_statistics = dataset.describe()
print(summary_statistics)

In [None]:
# The mean of the class column in the describe() output indicates that valid
# transactions far outnumber fraud transactions.
# Keep a 10% random sample of the rows (fixed seed) to save computation time/effort.
# NOTE: this rebinds `dataset`, so re-running the cell samples the sample again.
sampling_options = {'frac': 0.1, 'random_state': 1}
dataset = dataset.sample(**sampling_options)

In [None]:
# Dimensions of the dataset as it currently stands, as a (rows, columns) tuple
current_rows, current_columns = dataset.shape
print((current_rows, current_columns))

In [None]:
# Draw histograms of the dataset columns on a 30x30 figure
histogram_canvas_size = (30, 30)
dataset.hist(figsize=histogram_canvas_size)
plt.show()

In [None]:
# Split the sample by label: rows with Class == 1 are the fraud transactions and
# rows with Class == 0 the valid ones. Fraud rows are very few compared to valid rows.
Fraud = dataset[dataset['Class'] == 1]
Valid = dataset[dataset['Class'] == 0]

# Share of fraud rows among ALL rows of the sample. This value is later passed to the
# detectors as `contamination`, which scikit-learn defines as the proportion of
# outliers in the data set, so the denominator is the whole sample rather than the
# valid rows only (fraud/valid would slightly overstate the proportion).
outlier_fraction = len(Fraud) / float(len(dataset))
print("OUTLIER_FRACTION : %s " %(outlier_fraction))

In [None]:
# Report how many fraud and valid rows there are [counts come from the 10% sample only]
case_counts = (
    ('Fraud Cases: {}', len(Fraud)),
    ('Valid Cases: {}', len(Valid)),
)
for count_template, case_count in case_counts:
    print(count_template.format(case_count))

In [None]:
# Heat map of the pairwise correlations between the dataset columns.
# NOTE(review): vmax=0.10 caps the color scale at a correlation of 0.1 — confirm
# that this ceiling is intended.
fig = plt.figure(figsize=(12, 9))
corrmat = dataset.corr()
sns.heatmap(data=corrmat, square=True, vmax=0.10)
plt.show()

In [None]:
# Name of the column holding the label we will be predicting
target = "Class"

# Feature columns: every column of the dataframe except the label column
columns = [name for name in dataset.columns.tolist() if name != target]

# Feature matrix and label vector
X = dataset[columns]
Y = dataset[target]

# Show the dimensions of X, then of Y
for part in (X, Y):
    print(part.shape)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Random seed handed to the Isolation Forest
state = 1

# Outlier detection methods to compare, keyed by display name.
# Both receive the measured outlier fraction as their contamination setting.
classifiers = {}
classifiers["Isolation Forest"] = IsolationForest(
    random_state=state,
    contamination=outlier_fraction,
    max_samples=len(X),
)
classifiers["Local Outlier Factor"] = LocalOutlierFactor(
    contamination=outlier_fraction,
    n_neighbors=20,
)

In [None]:
# Fit each detector on X, convert its predictions to the 0/1 encoding used by Y,
# and print the error count, accuracy and classification report for it.
n_outliers = len(Fraud)

for clf_name, clf in classifiers.items():
    # Local Outlier Factor is fitted and queried in a single fit_predict call and
    # exposes its scores as an attribute; the other detector is fitted first and
    # then asked for scores and predictions.
    if clf_name == "Local Outlier Factor":
        raw_predict = clf.fit_predict(X)
        scores_predict = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_predict = clf.decision_function(X)
        raw_predict = clf.predict(X)

    # The detectors label inliers as 1 and outliers as -1, whereas Y uses 0 for valid
    # and 1 for fraud. Map -1 -> 1 and everything else -> 0 in one vectorized step so
    # the result does not depend on the order of two in-place assignments.
    # (Previously outliers were left as -1, so no prediction could ever match a
    # fraud row in Y.)
    y_predict = (raw_predict == -1).astype(int)

    # Number of rows whose predicted label differs from the true label
    n_errors = (y_predict != Y).sum()

    # Run classifier metrics
    print('{}:{}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_predict))
    print(classification_report(Y, y_predict))