In [None]:
# Numeric and tabular data handling.
import numpy as np
import pandas as pd
# Plotting: pyplot for figures, rcParams for global matplotlib defaults.
from matplotlib import pyplot as plt, rcParams
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
# Default matplotlib figure size as (width, height) in inches.
rcParams['figure.figsize'] = 11,10
# joblib is used to persist fitted objects to disk.
import joblib
# Relative path of the transaction CSV that is read with pd.read_csv.
link = '../Datasets/transaction_data.csv'

In [None]:
# Read the CSV at `link` into the working DataFrame.
df = pd.read_csv(link)

In [None]:
df.info()
# Check the data type and non-null count of every column.

In [None]:
df.head()
# Brief overview of the data: the first few rows.

In [None]:
df.isnull().sum()
# Count the missing values in each column.

In [None]:
 df['bank'].value_counts()
# Check the distribution of categories in the 'bank' column.

In [None]:
 df['card_type'].value_counts()
# Check the distribution of categories in the 'card_type' column.

In [None]:
 df['location'].value_counts()
# Check the distribution of categories in the 'location' column.

In [None]:
# Data visualisation: scatter of 'feature3' against 'age', coloured by 'Label'
# (fit_reg=False turns the regression line off, leaving only the points).
# x / y / data are passed by keyword: seaborn >= 0.12 made them keyword-only,
# so the old positional call raises TypeError there. The keyword form works on
# older releases as well.
sns.lmplot(data=df, x='feature3', y='age', hue='Label', fit_reg=False)
# lmplot creates its own figure, so fetch the current one in order to resize it.
fig = plt.gcf()
fig.set_size_inches(15, 10)
plt.show()

In [None]:
df['Label'].value_counts()
# Check the distribution of categories in the data label.

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame also holds string columns ('card_type', 'location', 'bank'); since
# pandas 2.0 DataFrame.corr() no longer drops those silently and raises instead,
# so the numeric/bool columns are selected explicitly. On older pandas this
# yields the same matrix that the implicit numeric-only behaviour produced.
matrix = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every column with the label, sorted from highest to lowest.
matrix['Label'].sort_values(ascending=False)

In [None]:
# Plot the correlation matrix as a heat map with square cells; vmax=0.8 caps
# the upper end of the colour scale.
sns.heatmap(matrix, vmax=0.8, square=True)

In [None]:
# Defining the data-transformation pipeline.

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import StandardScaler

# Columns routed to each branch of the transformer.
num_features = ['feature0', 'feature1', 'feature2', 'feature3', 'feature4', 'age',
                'feature5', 'cv_data', 'asv', 'cvv', 'Amount', 'CardNo']
cat_features = ['card_type', 'location', 'bank']

# Numeric branch: fill missing values with the column median, then standardise.
num_pipe = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row instead of raising. This pipeline is pickled and reused on
# new data, where unseen banks/locations are to be expected.
pipe = ColumnTransformer([
    ("num", num_pipe, num_features),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_features),
])

# The pipeline is exported with joblib only after it has been fitted; an
# unfitted transformer would be of no use to a consumer of the pickle.

In [None]:
# Split the frame into the label column and the remaining feature columns.

target = "Label"
y = df[target]                   # labels
X = df.drop(columns=target)      # raw feature columns
print(X.shape)

# Fit the preprocessing pipeline on the features, persist the fitted pipeline,
# then replace X with its transformed version.
pipe = pipe.fit(X)
joblib.dump(pipe, 'pipeline.pkl')
X = pipe.transform(X)

In [None]:
# Dimensions of the feature matrix X.
X.shape


In [None]:
# Dimensions of the label vector y.
y.shape

In [None]:
# Defining the outlier fraction passed as `contamination` to the outlier
# detectors (Local Outlier Factor, Isolation Forest).
# scikit-learn defines contamination as the proportion of outliers in the whole
# data set, so the fraud count is divided by the number of labelled rows
# (fraud + normal). Dividing by the normal count alone overstates the fraction.
fraud = df[df['Label'] == 1]
normal = df[df['Label'] == 0]
outlier_fraction = len(fraud) / (len(fraud) + len(normal))
outlier_fraction

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Maps classifier name -> accuracy score; starts out empty.
classifier_accuracy = dict()


In [None]:
# Initialising the models to compare, keyed by display name.
# - X.shape[0] is used for the row count instead of len(X): the column
#   transformer may return a scipy sparse matrix, on which len() raises TypeError.
# - Every estimator that accepts a seed gets random_state=1 so that a re-run
#   reproduces the same scores.
n_samples = X.shape[0]

classifiers = {
    'Local Outlier Factor': LocalOutlierFactor(
        n_neighbors=20, contamination=outlier_fraction,
        algorithm='auto', leaf_size=25, metric='minkowski'),
    'Support Vector Machine': OneClassSVM(
        kernel='rbf', degree=3, gamma=0.1, nu=0.05, max_iter=-1),
    'Decision Tree Classifier': DecisionTreeClassifier(
        criterion='entropy', splitter="best", max_leaf_nodes=15, random_state=1),
    'Isolation Forest': IsolationForest(
        max_samples=n_samples, contamination=outlier_fraction,
        random_state=1, verbose=0),
    'Ada Boost Classifier': AdaBoostClassifier(random_state=1),
    'Random Forest Classifier': RandomForestClassifier(random_state=1),
    'KNeighbors Classifier': KNeighborsClassifier(5),
}

In [None]:
# Fitting and evaluating each algorithm, recording its accuracy in
# `classifier_accuracy`.
#
# Two evaluation modes are used:
#   * supervised classifiers are trained on a 70 % split and scored on the
#     held-out 30 %;
#   * every other entry is treated as an outlier detector: it is fitted on all
#     of X without labels and its predictions are compared with all of y.

# Names of the entries that are trained with labels.
SUPERVISED = (
    'Decision Tree Classifier',
    'Ada Boost Classifier',
    'Random Forest Classifier',
    'KNeighbors Classifier',
)
# The fitted model under this name is exported for later use.
EXPORTED_MODEL = 'Random Forest Classifier'


def _report(clf_name, y_true, y_pred):
    """Print error count, accuracy and the classification report for one
    model, and store its accuracy in `classifier_accuracy`."""
    accuracy = accuracy_score(y_true, y_pred)
    n_errors = (y_pred != y_true).sum()
    print('{}:{}'.format(clf_name, n_errors))
    print('Accuracy Score:', accuracy)
    print(classification_report(y_true, y_pred))
    classifier_accuracy[clf_name] = accuracy


# The split is seeded, so it is computed once and shared by all supervised models.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

for clf_name, clf in classifiers.items():
    if clf_name in SUPERVISED:
        # Train the estimator that is actually registered under this name.
        # (Previously the random-forest branch replaced it with a fresh
        # DecisionTreeClassifier, so a decision tree was scored and exported
        # under the random-forest label.)
        clf.fit(X_train, y_train)
        _report(clf_name, y_test, clf.predict(X_test))
        if clf_name == EXPORTED_MODEL:
            # Exporting the preferred model.
            joblib.dump(clf, 'fraud_detection_model.pkl')
    else:
        # Outlier detectors return +1 for inliers and -1 for outliers;
        # remap to the label convention used in y (0 = normal, 1 = fraud).
        raw_pred = clf.fit_predict(X)
        _report(clf_name, y, np.where(raw_pred == -1, 1, 0))
    print('-------------------------------------------------------------')

In [None]:
# Build a two-column table (classifier name, accuracy) from the results dict.
accuracy_rows = list(classifier_accuracy.items())
ca = pd.DataFrame(accuracy_rows, columns=['Classifier', 'Accuracy'])
ca


In [None]:
# Plotting the classifiers and their accuracy as a horizontal bar chart.
# (seaborn is already imported as `sns` at the top of the notebook, so it is
# not re-imported here.)
sns.set_color_codes('muted')
sns.barplot(x='Accuracy', y='Classifier', data=ca, color='g')
# accuracy_score returns a fraction between 0 and 1, not a percentage, so the
# axis label says so.
plt.xlabel('Accuracy (fraction of correct predictions)')
plt.ylabel('Classifier ')
plt.title('Classifier Accuracy')
plt.show()