# Credit Card Kaggle Anomaly Detection


## Libraries

In [None]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
from statistics import mean

In [None]:
# Notebook-wide settings: label names, a seed constant and the default figure size.
LABELS = ["Normal", "Fraud"]
RANDOM_SEED = 42
rcParams["figure.figsize"] = (14, 8)

## Import the credit card fraud data set

In [None]:
# Load the credit card transactions and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../input/creditcard/creditcard.csv')
data.head(n=5)

## Plot histograms for the frequency/number of fraudulent and non-fraudulent transactions against Amount

In [None]:
# Split the transactions by label: Class == 1 rows are fraud, Class == 0 rows are normal.
fraud = data.loc[data['Class'].eq(1)]
normal = data.loc[data['Class'].eq(0)]

In [None]:
# Two stacked histograms of transaction amount (fraud on top, normal below)
# sharing one x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')
bins = 50

ax1.hist(fraud['Amount'], bins=bins)
ax1.set_title('Fraud')

ax2.hist(normal['Amount'], bins=bins)
ax2.set_title('Normal')

# Axis labels, limits and the log y-scale are set on the bottom (Normal) axes;
# the x limits also apply to the top panel because of sharex=True.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')


This shows that fraudulent transactions are few and are concentrated on small amounts of money.

# Draw boxplots showing summary statistics for the Amount column


In [None]:
# Box plot of the Amount column over all transactions.
plt.boxplot(data['Amount'])

This shows that most transactions are for small amounts (around 88), although the box plot also reveals a considerable number of large transactions.

# Generate a correlation matrix and use a heatmap to illustrate the relationships between the different variables

In [None]:
# Work on a reproducible 10% sample of the transactions.
data1 = data.sample(frac=0.1, random_state=1)

# Split the sample by label.
Fraud = data1[data1['Class'] == 1]
Valid = data1[data1['Class'] == 0]

# Ratio of fraud rows to valid rows in the sample.
# NOTE(review): this is fraud/valid, not fraud/(fraud + valid); confirm this is
# the intended value for the detectors' `contamination` parameter.
outlier_fraction = len(Fraud) / len(Valid)

# Seeded random generator (previously created twice in this cell).
state = np.random.RandomState(42)

In [None]:
## Correlation
# Pairwise correlations of every column in the full dataset. The matrix is
# computed once and reused for the plot; previously a matrix was computed on
# the sample only to read its index, and then recomputed on the full data.
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot the annotated heat map (red = negative, green = positive correlation).
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

# Generate a scatterplot for Amount and V2 showing a line of best fit, using the equation of a straight line y = mx + c, where m is the slope of the line and c is the y-intercept

In [None]:
# Regression inputs: V2 on the x axis, transaction amount on the y axis.
xs = data['V2']
ys = data['Amount']

def line_of_best(xs, ys):
    """Return the least-squares slope and intercept of ``ys`` against ``xs``.

    The slope is computed as
    ``(mean(x) * mean(y) - mean(x * y)) / (mean(x) ** 2 - mean(x ** 2))``
    and the intercept as ``mean(y) - m * mean(x)``.

    The means are computed with NumPy, so any array-like input is accepted
    (pandas Series, NumPy arrays, plain lists) and large inputs are handled
    in vectorised form.

    Args:
        xs: Array-like of x values.
        ys: Array-like of y values, same length as ``xs``.

    Returns:
        A ``(m, b)`` tuple of floats: slope and y-intercept.

    Raises:
        ValueError: If the inputs are empty or differ in shape.
        ZeroDivisionError: If all ``xs`` are equal, so the slope is undefined.
    """
    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)
    if x.size == 0 or x.shape != y.shape:
        raise ValueError("xs and ys must be non-empty and of equal length")

    x_mean = x.mean()
    y_mean = y.mean()
    denominator = x_mean * x_mean - (x * x).mean()
    if denominator == 0:
        # NumPy would only warn and return inf/nan here; fail loudly instead.
        raise ZeroDivisionError("xs has zero variance; slope is undefined")

    m = (x_mean * y_mean - (x * y).mean()) / denominator
    b = y_mean - m * x_mean
    return float(m), float(b)
    

In [None]:
# Fit the straight line and overlay it on the scatter of Amount against V2.
m, b = line_of_best(xs, ys)
# Evaluate y = m*x + b for every x in one vectorised step instead of a
# Python-level loop over every row.
regression_line = m * np.asarray(xs) + b
plt.scatter(xs, ys)
plt.plot(xs, regression_line)

Again, this shows that the vast majority of transactions are for small amounts.

## Build an outlier detection model for your data using the Isolation Forest and the Local Outlier Factor

In [None]:
# Build the feature matrix and the target vector from the sampled frame.
# The column being predicted:
target = "Class"
# Every other column is used as a feature.
columns = [name for name in data1.columns if name != target]
X = data1[columns]
Y = data1[target]

# Seeded generator; drawing X_outliers from it advances its internal state
# before it is handed to the Isolation Forest as `random_state`.
state = np.random.RandomState(42)
X_outliers = state.uniform(low=0, high=1, size=X.shape)

# Report the shapes of the features and the target.
print(X.shape)
print(Y.shape)

In [None]:
##Define the outlier detection methods

# Isolation Forest: 100 trees, each built from as many samples as X has rows.
isolation_forest = IsolationForest(
    n_estimators=100,
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
    verbose=0,
)

# Local Outlier Factor: 20 neighbours, Minkowski metric with p=2.
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,
    metric_params=None,
    contamination=outlier_fraction,
)

# Outlier detectors to compare, keyed by display name.
classifiers = {
    "Isolation Forest": isolation_forest,
    "Local Outlier Factor": local_outlier_factor,
}

In [None]:
# Show the container type of `classifiers` (defined above as a dict literal).
type(classifiers)

## Analyze the models using Errors, Confusion Matrix, Accuracy Score and Classification Report to identify the strengths and weaknesses of the models

In [None]:
# Local import: confusion_matrix is not part of the notebook's top-level imports.
from sklearn.metrics import confusion_matrix

# Number of fraud rows in the sample (not used by the loop below).
n_outliers = len(Fraud)
for clf_name, clf in classifiers.items():
    # Fit each detector on X and collect its raw labels (1 / -1).
    if isinstance(clf, LocalOutlierFactor):
        # LOF is fitted and labelled in a single call; its scores are read
        # from the fitted estimator afterwards.
        raw_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        raw_pred = clf.predict(X)
    # Map detector labels onto the dataset's encoding:
    # -1 -> 1 (Fraud), everything else -> 0 (Valid).
    y_pred = np.where(raw_pred == -1, 1, 0)
    n_errors = (y_pred != Y).sum()
    # Run classification metrics.
    print(f"{clf_name}: {n_errors}")
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Confusion Matrix :")
    print(confusion_matrix(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred))

## Discuss as a conclusion the best model and how to use it in the future in identifying fraudulent credit card transactions.

- Isolation Forest made 73 errors versus 97 errors for Local Outlier Factor (LOF)

- Isolation Forest has an accuracy of 99.74%, higher than LOF's 99.65%

- The Isolation Forest performed much better than the LOF: it detects around 27% of fraud cases, versus a detection rate of just 2% for LOF

Companies should run their transactions in real time through an Isolation Forest in order to better detect fraudulent transactions.