# Data Protection and Complex System Security

## Lab session 2: Anomaly detection

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



### Loading data
    1. Open a new Jupyter notebook and name it ‘PDSSC - Lab session – anomaly detection’
    2. Include the IEEE-CIS Fraud Detection competition (2019) dataset from https://www.kaggle.com/c/ieee-fraud-detection/data
    3. Load the CSV file ‘train_transaction.csv’ using Pandas


In [None]:
# Load the raw training transactions into a DataFrame.
train_transaction_path = '/kaggle/input/ieee-fraud-detection/train_transaction.csv'
df = pd.read_csv(train_transaction_path)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Show the data without the columns whose name matches "V<number>".
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df.drop(columns=df.filter(regex=r"V\d+").columns)


In [None]:
# All column names as a plain Python list.
df.columns.tolist()

In [None]:
# Inspect the R_emaildomain column.
df['R_emaildomain']

    4. From train_transaction keep 10000 entries: the first 9700 non-fraud entries and the first 300 fraud entries in dataset reduced_transaction_df


In [None]:
# Working sample: the first 9700 non-fraud rows followed by the first 300 fraud rows.
first_genuine_rows = df.loc[df['isFraud'] == 0].head(9700)
first_fraud_rows = df.loc[df['isFraud'] == 1].head(300)
reduced_transaction_df = pd.concat([first_genuine_rows, first_fraud_rows])

### Data set observation

    5. Print the head of the dataset


In [None]:
# Preview the first five rows of the reduced sample.
reduced_transaction_df.head(n=5)

    6. List the following information for the training set
        6.1. Number of columns
        6.2. Column names
        6.3. Size of the data set
        6.4. Column types


In [None]:
# number of columns
df.shape[1]

In [None]:
# column names
df.columns.tolist()

In [None]:
# size of the dataset (number of rows)
df.shape[0]

In [None]:
# column types, as (column name, dtype) pairs
list(df.dtypes.items())

    7. Check that the transactions were extracted correctly: 
        number of non-fraudulent transactions, 
        number of fraudulent transactions, 
        rate of fraudulent transactions

In [None]:
# Sanity-check the extraction: class counts and the share of fraud in the sample.
# The rate is fraud / ALL transactions (not fraud / non-fraud) and is scaled
# by 100 so that it matches the "%" unit that is printed.
n_fraud = int((reduced_transaction_df.isFraud == 1).sum())
n_non_fraud = int((reduced_transaction_df.isFraud == 0).sum())
fraud_rate_percent = 100 * n_fraud / len(reduced_transaction_df)
print("{} fraudes, {} non-fraudes : {:.2f} % de fraude".format(n_fraud, n_non_fraud, fraud_rate_percent))

    8. Which columns are categories? List them; extract existing values.


In [None]:
# Names of the categorical (object-typed) columns.
df.select_dtypes(include='object').columns

In [None]:
# For every categorical (object-typed) column, print its name and its distinct values.
categorical_column_names = reduced_transaction_df.select_dtypes('object').columns
for column_name in categorical_column_names:
    print(column_name)
    print(reduced_transaction_df[column_name].unique())
    print ('#'*80)

    9. Which columns are numeric? List them; extract min, max, mean, median and standard deviation values.

In [None]:
# First 45 numeric column names. select_dtypes is the public pandas API for this;
# _get_numeric_data is a private method that may change without notice.
reduced_transaction_df.select_dtypes(include='number').columns[0:45]

In [None]:
# Min, max, mean, median and standard deviation of every numeric column.
# The statistics are taken on the reduced sample itself (the same frame whose
# columns are iterated), not on the full `df`.
numeric_sample = reduced_transaction_df.select_dtypes(include='number')
for i in numeric_sample.columns:
    column = numeric_sample[i]
    print (i)
    print ("min = ", column.min(), "max = ", column.max(), "mean = ", column.mean(), "median = ", column.median(), "std = ", column.std())
    print("#"*60)

    10. For each column, print the rate of undefined values (NaN for numeric)


In [None]:
# Percentage of undefined (NaN) values per column.
# The mean of the boolean mask is the NaN fraction; it is scaled by 100 so
# that the printed number really is a percentage.
for i in reduced_transaction_df.columns:
    print (i,' : ', end='')
    print ( 100 * reduced_transaction_df[i].isna().mean(), '%' )

    11. For each numerical column, print the rate of zero (0) value


In [None]:
# Percentage of zero values per numeric column.
# Only numeric columns are inspected (comparing text columns with 0 is
# meaningless), and the fraction is scaled by 100 to match the printed "%".
for i in reduced_transaction_df.select_dtypes(include='number').columns:
    print (i,' : ', end='')
    print ( 100 * (reduced_transaction_df[i] == 0).mean(), '%' )
    


### Data set visualisation
    12. Visualise the dataset using dimensions: 'TransactionAmt', 'card1','addr1'.


In [None]:
def show3D_transation_data(transac_dataset, x_axis_name, y_axis_name, z_axis_name):
    """Draw a 3D scatter plot of genuine (green) and fraudulent (red) transactions.

    Parameters
    ----------
    transac_dataset : pandas.DataFrame
        Must contain an ``isFraud`` column (0 = genuine, 1 = fraud) and the
        three columns named by the axis arguments.
    x_axis_name, y_axis_name : str
        Columns plotted unchanged on the x and y axes.
    z_axis_name : str
        Column plotted on the z axis as ``-log10(value + 0.02)``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Small offset added before the logarithm so that a zero value does not give -inf.
    z_offset = 0.02
    is_fraud = transac_dataset['isFraud']

    # Restore matplotlib's original rc parameters in case seaborn changed them.
    sb.reset_orig()

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111, projection='3d')

    for class_value, colour, class_label in ((0, 'g', 'genuine'), (1, 'r', 'fraudulent')):
        rows = transac_dataset.loc[is_fraud == class_value]
        ax.scatter(rows[x_axis_name], rows[y_axis_name],
                   -np.log10(rows[z_axis_name] + z_offset),
                   c=colour, marker='.', s=1, label=class_label)

    ax.set_xlabel(x_axis_name, size=16)
    # The y label is just the column name: the previous ' [hour]' suffix was
    # wrong for arbitrary columns such as 'addr1'.
    ax.set_ylabel(y_axis_name, size=16)
    ax.set_zlabel('- log$_{10}$ (' + z_axis_name + ')', size=16)
    # Neutral title: the plotted columns are chosen by the caller.
    ax.set_title('Genuine and fraudulent transactions', size=20)

    plt.axis('tight')
    ax.grid(True)

    # Dedicated legend handles: the scatter points (s=1) are too small to read in a legend.
    genuine_marker = Line2D([], [], linewidth=0, color='g', marker='.', markersize=10, label='genuine')
    fraud_marker = Line2D([], [], linewidth=0, color='r', marker='.', markersize=10, label='fraudulent')

    ax.legend(handles=[genuine_marker, fraud_marker], bbox_to_anchor=(1.20, 0.38), frameon=False, prop={'size': 16})

In [None]:
# 3D view of the reduced sample: card1 (x), addr1 (y), log-scaled TransactionAmt (z).
show3D_transation_data(
    reduced_transaction_df,
    x_axis_name='card1',
    y_axis_name='addr1',
    z_axis_name='TransactionAmt',
)

13. Create an alternate visualisation function for visualising fraud entries only, in red

In [None]:
def show3D_transation_data_f(transac_dataset, x_axis_name, y_axis_name, z_axis_name):
    """Draw a 3D scatter plot of the fraudulent transactions only, in red.

    Parameters
    ----------
    transac_dataset : pandas.DataFrame
        Must contain an ``isFraud`` column (1 = fraud) and the three columns
        named by the axis arguments.
    x_axis_name, y_axis_name : str
        Columns plotted unchanged on the x and y axes.
    z_axis_name : str
        Column plotted on the z axis as ``-log10(value + 0.02)``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Small offset added before the logarithm so that a zero value does not give -inf.
    z_offset = 0.02
    fraud_rows = transac_dataset.loc[transac_dataset['isFraud'] == 1]

    # Restore matplotlib's original rc parameters in case seaborn changed them.
    sb.reset_orig()

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(fraud_rows[x_axis_name], fraud_rows[y_axis_name],
               -np.log10(fraud_rows[z_axis_name] + z_offset),
               c='r', marker='.', s=1, label='fraudulent')

    ax.set_xlabel(x_axis_name, size=16)
    # The y label is just the column name: the previous ' [hour]' suffix was
    # wrong for arbitrary columns such as 'addr1'.
    ax.set_ylabel(y_axis_name, size=16)
    ax.set_zlabel('- log$_{10}$ (' + z_axis_name + ')', size=16)
    ax.set_title('Features separate for fraudulent transactions', size=20)

    plt.axis('tight')
    ax.grid(True)

    # The marker handle is now actually passed to the legend; the scatter
    # points themselves (s=1) are too small to be readable there.
    fraud_marker = Line2D([], [], linewidth=0, color='r', marker='.', markersize=10, label='fraudulent')

    ax.legend(handles=[fraud_marker], bbox_to_anchor=(1.20, 0.38), frameon=False, prop={'size': 16})

In [None]:
# Same 3D view as before, restricted to the fraudulent transactions.
show3D_transation_data_f(
    reduced_transaction_df,
    x_axis_name='card1',
    y_axis_name='addr1',
    z_axis_name='TransactionAmt',
)

# Data cleaning

14. Perform one-hot encoding of categorical data

In [None]:
# One-hot encode the categorical columns; the other columns are kept unchanged.
ohe_reduced_transaction_df = pd.get_dummies(data=reduced_transaction_df)

15. Remove NaN (Not a number) values by imputation of the mean of the column

In [None]:
# Replace every NaN by the mean of its column in one vectorised call:
# fillna with a Series fills each column with the value stored under its name.
ohe_reduced_transaction_df = ohe_reduced_transaction_df.fillna(ohe_reduced_transaction_df.mean())

16. Control that no NaN value remain in the dataframe

In [None]:
# Total number of NaN cells left in the encoded frame (0 means none remain).
ohe_reduced_transaction_df.isna().sum(axis=0).sum()

# Outlier detection

## Isolation Forests

17. Extract outliers using sklearn.ensemble.IsolationForest, with
outliers_fraction = 0.03 as the contamination rate. Check the number of outliers.

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Isolation Forest expecting 3 % of outliers. random_state is fixed so that the
# outlier counts discussed below are reproducible from one run to the next.
# NOTE(review): the model is fitted on every column of the frame, including the
# `isFraud` label and `TransactionID` - confirm that this is intended.
i_f = IsolationForest(contamination=0.03, random_state=42)
i_f.fit(ohe_reduced_transaction_df)

In [None]:
# Independent (deep) copy that will receive the outlier flags, leaving the model input untouched.
ohe_reduced_transaction_df_outlier = ohe_reduced_transaction_df.copy(deep=True)

In [None]:
# predict() returns -1 for outliers and 1 for inliers; store the outlier flag as a boolean column.
isolation_forest_labels = i_f.predict(ohe_reduced_transaction_df)
ohe_reduced_transaction_df_outlier['is_outlier'] = isolation_forest_labels == -1

In [None]:
# Number of rows for each (is_outlier, isFraud) combination. size() returns one
# count per group instead of repeating the same count for every column.
ohe_reduced_transaction_df_outlier.groupby(by=['is_outlier', 'isFraud']).size()

In [None]:
# Share of the fraudulent transactions flagged as outliers by the Isolation
# Forest, computed from the data instead of numbers copied from a previous run.
ohe_reduced_transaction_df_outlier.loc[ohe_reduced_transaction_df_outlier.isFraud == 1, 'is_outlier'].mean()

In [None]:
# Share of the non-fraudulent transactions flagged as outliers by the Isolation
# Forest, computed from the data instead of numbers copied from a previous run.
ohe_reduced_transaction_df_outlier.loc[ohe_reduced_transaction_df_outlier.isFraud == 0, 'is_outlier'].mean()

Il y a une plus forte probabilité d'être une fraude pour un outlier, mais ce n'est pas systématique.

In [None]:
# Rows that are both fraudulent and flagged as outliers by the Isolation Forest.
fraud_and_outlier = (
    (ohe_reduced_transaction_df_outlier['isFraud'] == 1)
    & (ohe_reduced_transaction_df_outlier['is_outlier'] == True)
)
ohe_reduced_transaction_df_outlier.loc[fraud_and_outlier]

18. Create yet another visualisation function for visualising IsolationForest outliers entries only, in red

In [None]:
def show3D_transation_data_outlier(transac_dataset, x_axis_name, y_axis_name, z_axis_name):
    """Draw a 3D scatter plot of the rows flagged in ``is_outlier`` only, in red.

    Parameters
    ----------
    transac_dataset : pandas.DataFrame
        Must contain an ``is_outlier`` column (truthy = outlier) and the three
        columns named by the axis arguments.
    x_axis_name, y_axis_name : str
        Columns plotted unchanged on the x and y axes.
    z_axis_name : str
        Column plotted on the z axis as ``-log10(value + 0.02)``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Small offset added before the logarithm so that a zero value does not give -inf.
    z_offset = 0.02
    outlier_rows = transac_dataset.loc[transac_dataset['is_outlier'] == 1]

    # Restore matplotlib's original rc parameters in case seaborn changed them.
    sb.reset_orig()

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111, projection='3d')

    # Label and title describe outliers: the plotted rows are selected on
    # `is_outlier`, not on the fraud label.
    ax.scatter(outlier_rows[x_axis_name], outlier_rows[y_axis_name],
               -np.log10(outlier_rows[z_axis_name] + z_offset),
               c='r', marker='.', s=1, label='is_outlier')

    ax.set_xlabel(x_axis_name, size=16)
    # The y label is just the column name: the previous ' [hour]' suffix was
    # wrong for arbitrary columns such as 'addr1'.
    ax.set_ylabel(y_axis_name, size=16)
    ax.set_zlabel('- log$_{10}$ (' + z_axis_name + ')', size=16)
    ax.set_title('Features separate for outlier transactions', size=20)

    plt.axis('tight')
    ax.grid(True)

    # The marker handle is now actually passed to the legend; the scatter
    # points themselves (s=1) are too small to be readable there.
    outlier_marker = Line2D([], [], linewidth=0, color='r', marker='.', markersize=10, label='is_outlier')

    ax.legend(handles=[outlier_marker], bbox_to_anchor=(1.20, 0.38), frameon=False, prop={'size': 16})

In [None]:
# 3D view of the Isolation Forest outliers only.
show3D_transation_data_outlier(
    ohe_reduced_transaction_df_outlier,
    x_axis_name='card1',
    y_axis_name='addr1',
    z_axis_name='TransactionAmt',
)

## Local Outlier Factor (LOF)

19. Extract outliers using sklearn.neighbors.LocalOutlierFactor, with outliers_fraction = 0.03 as the contamination rate. Check the number of outliers.

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Local Outlier Factor expecting 3 % of outliers.
# The estimator is only configured here: the labels are obtained with
# fit_predict(), which fits the model itself, so a separate fit() beforehand
# would only run the neighbour search twice.
lof = LocalOutlierFactor(contamination=0.03)

In [None]:
# Fit the LOF model on the encoded data and get one label per row (-1 = outlier, 1 = inlier).
is_lof = lof.fit_predict(X=ohe_reduced_transaction_df)

In [None]:
# Store the LOF verdict as a boolean column (True where the label is -1).
ohe_reduced_transaction_df_outlier['lof_outlier'] = is_lof == -1

20. Create yet another visualisation function for visualising LOF outliers entries only, in red show3D_transation_data_lof_outliers_only(transac_dataset, x_axis_name, y_axis_name, z_axis_name)

In [None]:
def show3D_transation_data_lof_outliers_only(transac_dataset, x_axis_name, y_axis_name, z_axis_name):
    """Draw a 3D scatter plot of the rows flagged in ``lof_outlier`` only, in red.

    Parameters
    ----------
    transac_dataset : pandas.DataFrame
        Must contain a ``lof_outlier`` column (truthy = outlier) and the three
        columns named by the axis arguments.
    x_axis_name, y_axis_name : str
        Columns plotted unchanged on the x and y axes.
    z_axis_name : str
        Column plotted on the z axis as ``-log10(value + 0.02)``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Small offset added before the logarithm so that a zero value does not give -inf.
    z_offset = 0.02
    lof_rows = transac_dataset.loc[transac_dataset['lof_outlier'] == 1]

    # Restore matplotlib's original rc parameters in case seaborn changed them.
    sb.reset_orig()

    fig = plt.figure(figsize=(10, 12))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(lof_rows[x_axis_name], lof_rows[y_axis_name],
               -np.log10(lof_rows[z_axis_name] + z_offset),
               c='r', marker='.', s=1, label='lof_outlier')

    ax.set_xlabel(x_axis_name, size=16)
    # The y label is just the column name: the previous ' [hour]' suffix was
    # wrong for arbitrary columns such as 'addr1'.
    ax.set_ylabel(y_axis_name, size=16)
    ax.set_zlabel('- log$_{10}$ (' + z_axis_name + ')', size=16)
    ax.set_title('Features separate for lof transactions', size=20)

    plt.axis('tight')
    ax.grid(True)

    # The marker handle is now actually passed to the legend; the scatter
    # points themselves (s=1) are too small to be readable there.
    lof_marker = Line2D([], [], linewidth=0, color='r', marker='.', markersize=10, label='lof_outlier')

    ax.legend(handles=[lof_marker], bbox_to_anchor=(1.20, 0.38), frameon=False, prop={'size': 16})

In [None]:
# 3D view of the LOF outliers only.
show3D_transation_data_lof_outliers_only(
    ohe_reduced_transaction_df_outlier,
    x_axis_name='card1',
    y_axis_name='addr1',
    z_axis_name='TransactionAmt',
)

## Comparison of IsolationForest and LOF

21. Control the complementarity between the 2 algorithms

In [None]:
# Row count for each (Isolation Forest flag, LOF flag) combination.
ohe_reduced_transaction_df_outlier.groupby(by=['is_outlier', 'lof_outlier'])['isFraud'].count()

In [None]:
# Same counts with the grouping order swapped: LOF flag first, Isolation Forest flag second.
ohe_reduced_transaction_df_outlier.groupby(by=['lof_outlier', 'is_outlier'])['isFraud'].count()

21.1. How many outliers are common to IsolationForest and LOF?


In [None]:
# Number of rows flagged by BOTH detectors. Summing the combined boolean mask
# gives a single number and yields 0 (instead of a KeyError) when the two
# detectors have no outlier in common.
int((ohe_reduced_transaction_df_outlier['lof_outlier'] & ohe_reduced_transaction_df_outlier['is_outlier']).sum())

=> 21

21.2. How many fraudulent outliers are common to IsolationForest and LOF?

In [None]:
# Number of FRAUDULENT rows flagged by both detectors. Summing the combined
# boolean mask gives a single number and yields 0 (instead of a KeyError) when
# no such row exists.
int((
    ohe_reduced_transaction_df_outlier['lof_outlier']
    & ohe_reduced_transaction_df_outlier['is_outlier']
    & (ohe_reduced_transaction_df_outlier['isFraud'] == 1)
).sum())

=> Seulement 2

21.3. What do you deduce for building unsupervised outlier detectors?

Je suppose que les modèles sont assez faibles individuellement, puisqu'ils ne se recoupent pas, mais qu'ils apportent tout de même une certaine information, puisqu'il y a une corrélation entre outliers et fraudes. Peut-être alors que la bonne solution est d'utiliser plusieurs algorithmes de détection d'outliers pour faire du feature engineering, en ajoutant ces informations aux données d'un classificateur supervisé ? 