### Data Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
import seaborn as sns

In [None]:
data=pd.read_csv('/content/drive/MyDrive/part-088.csv')

In [None]:
data.head()

In [None]:
#dimensions
data.shape

In [None]:
#removing columns (features) with constant values

def remove_constant_value_features(df):
    """Return the names of columns that hold exactly one distinct value.

    Columns are reported in the order they appear in ``df``. The frame itself
    is not modified; the caller decides what to do with the returned names.
    """
    constant_columns = []
    for column_name in df.columns:
        distinct_values = df[column_name].nunique()
        if distinct_values == 1:
            constant_columns.append(column_name)
    return constant_columns
data=data.drop(columns=remove_constant_value_features(data))

In [None]:
#null values
np.sum(np.sum(data.isna()))

In [None]:
#anomalies by hour
# Parse the raw timestamps and keep the hour of day as an integer feature.
timestamps = pd.to_datetime(data['timestamp'])
data['Time_hour'] = timestamps.dt.hour.astype(int)

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws
# the same histogram + density overlay. Each class is normalised on its own,
# so the two shapes are comparable despite the class imbalance.
fig, ax = plt.subplots(figsize=(12,5))
for flag, colour, legend_label in ((0, 'g', 'Normal'), (1, 'r', 'Anomaly')):
    sns.histplot(
        data.loc[data['isAnomaly'] == flag, 'Time_hour'],
        color=colour,
        label=legend_label,
        stat='density',
        kde=True,
        ax=ax,
    )
ax.set_title('Fraud and Normal Transactions by Hours', fontsize=17)
ax.set_xlim([-1,25])
ax.legend()
plt.show()

Peak in fraud transactions at 3 pm

In [None]:
# Keep the labels as a separate Series for the detectors and classifiers below.
target=data['isAnomaly']
# Only 'timestamp' is removed here; the 'isAnomaly' label column is still part
# of `data` after this cell.
data=data.drop(columns=['timestamp']) #dropping right now but will require in a later section

In [None]:
data['isAnomaly'].value_counts() #number of anomalies

In [None]:
#Feature Selection
from sklearn.ensemble import RandomForestClassifier

# `data` still carries the 'isAnomaly' label at this point. Fit on the
# predictors only, otherwise the forest simply learns the label from itself
# and every real feature looks unimportant (target leakage).
features = data.drop(columns=['isAnomaly'], errors='ignore')

rnd_clf = RandomForestClassifier(n_estimators = 100 , criterion = 'entropy',random_state = 0)
rnd_clf.fit(features, target);

# A feature is "not important" when its importance does NOT exceed the cut-off.
IMPORTANCE_THRESHOLD = 0.020
not_imp = [
    name
    for name, importance in zip(features.columns, rnd_clf.feature_importances_)
    if importance <= IMPORTANCE_THRESHOLD
]

# Importances sum to 1, so with many features none may clear the cut-off;
# fail loudly instead of silently handing an empty frame to later cells.
if len(not_imp) == features.shape[1]:
    raise ValueError(
        "No feature has importance above {}; lower IMPORTANCE_THRESHOLD.".format(IMPORTANCE_THRESHOLD)
    )

# `data` leaves this cell as the selected predictors, without the label column.
data = features.drop(columns=not_imp)

In [None]:
# Pair every importance with the column the forest was actually fitted on.
# `data.columns` can no longer be used for this: columns were dropped after
# fitting, so zipping it with feature_importances_ would mislabel the values.
list_of_tuples = list(zip(rnd_clf.feature_names_in_, rnd_clf.feature_importances_))
pd.DataFrame(list_of_tuples, columns = ['Columns', 'Importance']).sort_values(by='Importance', ascending=False)

In [None]:
#Dropping high correlated columns
CORRELATION_THRESHOLD = 0.95

# Absolute value: a strong negative correlation is just as redundant as a
# strong positive one.
cor_matrix = data.corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) is ignored. `np.bool` was removed in NumPy 1.24; the
# builtin `bool` is the supported dtype.
upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
upper_tri = cor_matrix.where(upper_mask)

# Of each highly correlated pair, drop the column that appears later.
to_drop = [
    column
    for column in upper_tri.columns
    if (upper_tri[column] > CORRELATION_THRESHOLD).any()
]
data = data.drop(columns=to_drop)

### Auto Outlier Detection Algorithms

In [None]:
# Contamination rate = fraction of rows labelled as anomalies. Derived from
# the labels so it stays correct if the input file changes (the previous
# hard-coded value was 1723/38797).
factor = float((target == 1).mean())

In [None]:
# Registry of the unsupervised detectors to compare, keyed by display name.
# Iteration order is the insertion order below.
isolation_forest = IsolationForest(
    n_estimators=100,
    max_samples=len(data),
    contamination=factor,
    random_state=0,
    verbose=0,
)
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,
    metric_params=None,
    contamination=factor,
)
one_class_svm = OneClassSVM(
    kernel='linear',
    degree=3,
    gamma=0.1,
    nu=0.05,
    max_iter=-1,
)
elliptic_envelope = EllipticEnvelope(contamination=factor)

classifiers = dict([
    ("Isolation Forest", isolation_forest),
    ("Local Outlier Factor", local_outlier_factor),
    ("Support Vector Machine", one_class_svm),
    ("Elliptic Envelope", elliptic_envelope),
])

In [None]:
# confusion_matrix is imported here because this is its first use; without it
# the cell raises NameError on a fresh kernel.
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

for clf_name, clf in classifiers.items():
    # Fit the data and tag outliers. fit_predict is available on all four
    # detectors and is the only way to label the training data with
    # LocalOutlierFactor in its default (non-novelty) mode.
    raw_pred = clf.fit_predict(data)

    # scikit-learn marks inliers as +1 and outliers as -1; recode to the
    # label convention of `target`: 0 = normal, 1 = anomaly.
    y_pred = np.where(raw_pred == -1, 1, 0)
    n_errors = int((y_pred != target).sum())

    # Run Classification Metrics
    print("{}: {}".format(clf_name,n_errors))
    print("Accuracy Score :")
    print(accuracy_score(target,y_pred))
    print(confusion_matrix(target, y_pred))
    # Accuracy alone is dominated by the majority class; the per-class
    # precision/recall shows how well anomalies themselves are found.
    print(classification_report(target, y_pred))

### Auto-Encoders

In [None]:
#Looking for clusters
from sklearn.decomposition import PCA
# Project onto the first two principal components and plot one against the other.
pca_projector = PCA(n_components=2, random_state=42)
X_reduced_pca = pca_projector.fit_transform(data)
first_component = X_reduced_pca[:,0]
second_component = X_reduced_pca[:,1]
plt.scatter(first_component, second_component, alpha=.1, color='black')

In [None]:
import tensorflow as tf
import random as rn
# manual parameters
RANDOM_SEED = 42
VALIDATE_SIZE = 0.2

# Seed NumPy, Python's `random` and TensorFlow (in that order) with the same
# value so repeated runs draw the same random numbers.
for _seed_function in (np.random.seed, rn.seed, tf.random.set_seed):
    _seed_function(RANDOM_SEED)

In [None]:
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

def tsne_scatter(features, labels, dimensions=2, save_as='graph.png'):
    """Embed `features` with t-SNE and scatter-plot the result coloured by label.

    Parameters
    ----------
    features : array-like
        Data passed to ``TSNE.fit_transform``.
    labels : array-like
        One label per row of `features`. Rows equal to 1 are drawn in red
        ('Fraud'), rows equal to 0 in green ('Clean').
    dimensions : int
        Dimensionality of the embedding and of the plot; 2 or 3.
    save_as : str
        File the figure is written to before it is shown.

    Raises
    ------
    ValueError
        If `dimensions` is neither 2 nor 3.
    """
    if dimensions not in (2, 3):
        raise ValueError('tsne_scatter can only plot in 2d or 3d (What are you? An alien that can visualise >3d?). Make sure the "dimensions" argument is in (2, 3)')

    # t-SNE dimensionality reduction
    features_embedded = TSNE(n_components=dimensions, random_state=RANDOM_SEED).fit_transform(features)

    # Create the axes with the right projection from the start; adding a 3d
    # subplot on top of a 2d one leaves the empty 2d axes drawn underneath.
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111, projection='3d' if dimensions == 3 else None)

    # Boolean masks on a plain array keep working when a class has no rows,
    # where unpacking zip(*rows) would hand scatter() no coordinates at all.
    label_values = np.asarray(labels)
    for value, colour, opacity, legend_label in ((1, 'r', 0.7, 'Fraud'),
                                                 (0, 'g', 0.3, 'Clean')):
        class_points = features_embedded[label_values == value]
        ax.scatter(
            *class_points.T,  # one coordinate array per embedding axis
            marker='o',
            color=colour,
            s=2,
            alpha=opacity,
            label=legend_label
        )

    ax.legend(loc='best')
    fig.savefig(save_as)
    # `plt.show` without parentheses is a no-op; actually display the figure.
    plt.show()

In [None]:
tsne_scatter(data, target, dimensions=2, save_as='tsne_initial_2d.png')

Anomalies aren't apparent

In [None]:
data=pd.concat([data,target],axis=1)

In [None]:
fraud = data[data.isAnomaly == 1]
clean = data[data.isAnomaly == 0]

In [None]:
print(f"""Shape of the datasets:
    clean (rows, cols) = {clean.shape}
    fraud (rows, cols) = {fraud.shape}""")

In [None]:
TRAINING_SAMPLE = 25952
# shuffle our training set (seeded explicitly so the split is reproducible
# regardless of how much global random state earlier cells consumed)
clean = clean.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# training set: exclusively non-fraud transactions
X_train = clean.iloc[:TRAINING_SAMPLE].drop('isAnomaly', axis=1)

# testing  set: the remaining non-fraud + all the fraud
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames.
X_test = pd.concat([clean.iloc[TRAINING_SAMPLE:], fraud]).sample(frac=1, random_state=RANDOM_SEED)

In [None]:
print(f"""Our testing set is composed as follows:

{X_test.isAnomaly.value_counts()}""")

In [None]:
from sklearn.model_selection import train_test_split

# train // validate - no labels since they're all clean anyway
# With a single input array train_test_split returns exactly two pieces, so
# only two names may be unpacked (four raised ValueError).
X_train, X_validate = train_test_split(X_train,
                                       test_size=VALIDATE_SIZE,
                                       random_state=RANDOM_SEED)

# manually splitting the labels from the test df
X_test, y_test = X_test.drop('isAnomaly', axis=1).values, X_test.isAnomaly.values

In [None]:
print(f"""Shape of the datasets:
    training (rows, cols) = {X_train.shape}
    validate (rows, cols) = {X_validate.shape}
    holdout  (rows, cols) = {X_test.shape}""")

In [None]:
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline

# configure our pipeline
pipeline = Pipeline([('normalizer', Normalizer()),
                     ('scaler', MinMaxScaler())])

In [None]:
pipeline.fit(X_train);

In [None]:
X_train_transformed = pipeline.transform(X_train)
X_validate_transformed = pipeline.transform(X_validate)

In [None]:
g = sns.PairGrid(X_train.iloc[:,:3].sample(600, random_state=RANDOM_SEED))
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Before:')
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot);

In [None]:
g = sns.PairGrid(pd.DataFrame(X_train_transformed).iloc[:,:3].sample(600, random_state=RANDOM_SEED))
plt.subplots_adjust(top=0.9)
g.fig.suptitle('After:')
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot);

We can tell the data is slightly more uniform and proportionally distributed. 
The ranges were also shrunk to fit between 0 and 1.

In [None]:
input_dim = X_train_transformed.shape[1]
BATCH_SIZE = 256
EPOCHS = 100

# Symmetric dense autoencoder: the encoder narrows the input down to a
# 2-unit bottleneck and the decoder mirrors it back to the input width.
autoencoder = tf.keras.models.Sequential([

    # deconstruct / encode
    tf.keras.layers.Dense(input_dim, activation='elu', input_shape=(input_dim, )),
    tf.keras.layers.Dense(16, activation='elu'),
    tf.keras.layers.Dense(8, activation='elu'),
    tf.keras.layers.Dense(4, activation='elu'),
    tf.keras.layers.Dense(2, activation='elu'),

    # reconstruction / decode
    tf.keras.layers.Dense(4, activation='elu'),
    tf.keras.layers.Dense(8, activation='elu'),
    tf.keras.layers.Dense(16, activation='elu'),
    tf.keras.layers.Dense(input_dim, activation='elu')

])

# The network reconstructs continuous values, so classification accuracy
# ("acc") carries no meaning here; report mean absolute error next to the
# MSE loss instead.
autoencoder.compile(optimizer="adam",
                    loss="mse",
                    metrics=["mae"])

# print an overview of our model
autoencoder.summary()

In [None]:
from datetime import datetime

# current date and time
yyyymmddHHMM = datetime.now().strftime('%Y%m%d%H%M')

# new folder for a new run
log_subdir = f'{yyyymmddHHMM}_batch{BATCH_SIZE}_layers{len(autoencoder.layers)}'

# stop once val_loss has not improved by at least min_delta for `patience`
# epochs, and roll the model back to its best weights
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True
)

# keep the best model (lowest val_loss) on disk
# NOTE(review): newer Keras releases restrict the accepted file extensions
# for checkpoints - confirm '.hdf5' is accepted by the installed version.
save_model = tf.keras.callbacks.ModelCheckpoint(
    filepath='autoencoder_best_weights.hdf5',
    save_best_only=True,
    monitor='val_loss',
    verbose=0,
    mode='min'
)

# `batch_size` is not a TensorBoard callback argument in TF2 (it is ignored
# with a warning, and rejected outright by newer Keras), so it is not passed.
tensorboard = tf.keras.callbacks.TensorBoard(
    f'logs/{log_subdir}',
    update_freq='batch'
)

# callbacks argument only takes a list
cb = [early_stop, save_model, tensorboard]

In [None]:
history = autoencoder.fit(
    X_train_transformed, X_train_transformed,
    shuffle=True,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=cb,
    validation_data=(X_validate_transformed, X_validate_transformed)
);

In [None]:
# transform the test set with the pipeline fitted to the training set
X_test_transformed = pipeline.transform(X_test)

# pass the transformed test set through the autoencoder to get the reconstructed result
reconstructions = autoencoder.predict(X_test_transformed)

In [None]:
# calculating the mean squared error reconstruction loss per row in the numpy array
mse = np.mean(np.power(X_test_transformed- reconstructions, 2), axis=1)

In [None]:
THRESHOLD = 3

def mad_score(points):
    """Return the modified z-score of every point, based on the median.

    The score is ``0.6745 * |x - median| / MAD`` where MAD is the median of
    the absolute deviations from the median.

    When MAD is zero (more than half of the points equal the median) that
    formula divides by zero. In that case the mean absolute deviation is used
    as the scale instead (``|x - median| / (1.253314 * MeanAD)``), and if
    every point is identical all scores are 0.

    Parameters
    ----------
    points : array-like of numbers

    Returns
    -------
    numpy.ndarray of non-negative floats, same shape as `points`.
    """
    values = np.asarray(points, dtype=float)

    m = np.median(values)
    ad = np.abs(values - m)
    mad = np.median(ad)

    if mad > 0:
        return 0.6745 * ad / mad

    # Degenerate spread: fall back to the mean absolute deviation.
    mean_ad = np.mean(ad)
    if mean_ad > 0:
        return ad / (1.253314 * mean_ad)

    # All points identical: nothing deviates from the median.
    return np.zeros_like(ad)

z_scores = mad_score(mse)
outliers = z_scores > THRESHOLD

In [None]:
print(f"Detected {np.sum(outliers):,} outliers in a total of {np.size(z_scores):,} operations [{np.sum(outliers)/np.size(z_scores):.2%}].")

In [None]:
from sklearn.metrics import (confusion_matrix,
                             precision_recall_curve)

# Cross-tabulate the true labels against the MAD-based outlier flags.
cm = confusion_matrix(y_test, outliers)

# Unroll the matrix row by row into its four counts:
# true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = cm.ravel()

In [None]:
print(f"""The classifications using the MAD method with threshold={THRESHOLD} are as follows:
{cm}

% of transactions labeled as fraud that were correct (precision): {tp}/({fp}+{tp}) = {tp/(fp+tp):.2%}
% of fraudulent transactions were caught succesfully (recall):    {tp}/({fn}+{tp}) = {tp/(fn+tp):.2%}""")

In [None]:
# Accuracy (%) computed from the confusion-matrix counts rather than from
# hard-coded figures, so it always reflects the current run.
(tp + tn) / (tp + tn + fp + fn) * 100

### FBProphet

In [None]:
!pip install fbprophet
from fbprophet import Prophet
import os

In [None]:
# View the data as a table
# 'timestamp' was dropped from `data` earlier (and the metric column may have
# been removed by feature selection), so building this frame from `data`
# yields NaN columns. Read the two columns Prophet needs from the source file.
prophet_metric = r'Available db connection activity : (d/dx (MXBean(com.bea:Name=source09,Type=JDBCDataSourceRuntime).NumAvailable))'
prophet_source = pd.read_csv('/content/drive/MyDrive/part-088.csv',
                             usecols=['timestamp', prophet_metric])

# Prophet expects the time column to be named 'ds' and the value column 'y'.
df_ = pd.DataFrame({
    'ds': pd.to_datetime(prophet_source['timestamp']),
    'y': prophet_source[prophet_metric].astype(float),
})
df_.head()

In [None]:
def fit_predict_model(dataframe, interval_width = 0.99, changepoint_range = 0.8):
    """Fit Prophet on `dataframe` and return its in-sample forecast.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'ds' (timestamps) and 'y' (observed values).
    interval_width : float
        Width of the uncertainty interval, forwarded to ``Prophet``.
    changepoint_range : float
        Forwarded to ``Prophet``.

    Returns
    -------
    pandas.DataFrame
        Prophet's forecast for the same timestamps, with an extra column
        'fact' holding the observed 'y' value of each row.
    """
    # Prophet returns its forecast ordered by 'ds'. Sort the input the same
    # way (stable sort) so that 'fact' lines up row-for-row with the forecast
    # even when the caller's frame is not in chronological order.
    ordered = dataframe.sort_values('ds', kind='mergesort').reset_index(drop=True)

    m = Prophet(daily_seasonality = False, yearly_seasonality = False, weekly_seasonality = False,
                interval_width = interval_width,
                changepoint_range = changepoint_range)
    m = m.fit(ordered)

    forecast = m.predict(ordered)
    forecast['fact'] = ordered['y'].to_numpy()
    print('Displaying Prophet plot')
    m.plot(forecast)
    return forecast
    
pred = fit_predict_model(df_)

In [None]:
def detect_anomalies(forecast):
    """Flag observations that fall outside the forecast's uncertainty band.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Must contain 'ds', 'trend', 'yhat', 'yhat_lower', 'yhat_upper' and
        'fact' (the observed value).

    Returns
    -------
    pandas.DataFrame
        A copy of those six columns plus:
        - 'anomaly': 1 when 'fact' is above 'yhat_upper' or below
          'yhat_lower', otherwise 0;
        - 'importance': how far outside the band the observation lies,
          relative to the observation itself; 0.0 for non-anomalies.
          Rows whose 'fact' is 0 divide by zero and yield inf.
    """
    forecasted = forecast[['ds','trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact']].copy()

    above = forecasted['fact'] > forecasted['yhat_upper']
    below = forecasted['fact'] < forecasted['yhat_lower']

    # Both directions are reported with the same flag value (1).
    forecasted['anomaly'] = (above | below).astype(int)

    # anomaly importances
    # The direction masks pick the formula: with a single flag value the
    # flag itself cannot tell an overshoot from an undershoot. Start from a
    # float column so the ratios are stored without a dtype change.
    forecasted['importance'] = 0.0
    forecasted.loc[above, 'importance'] = \
        (forecasted['fact'] - forecasted['yhat_upper'])/forecasted['fact']
    forecasted.loc[below, 'importance'] = \
        (forecasted['yhat_lower'] - forecasted['fact'])/forecasted['fact']

    return forecasted

pred = detect_anomalies(pred)

In [None]:
pred.head()

In [None]:
pred[ r'anomaly'].value_counts()

In [None]:
#Accuracy
# NOTE(review): hard-coded figures, not derived from `pred`. 1723 matches the
# anomaly count used elsewhere in this notebook; 1361 is presumably the number
# of those rows that Prophet flagged - TODO confirm. If so, this ratio is the
# share of known anomalies detected (recall) rather than accuracy.
1361/1723

### Supervised

In [None]:
#Handling imbalance
from imblearn.under_sampling import NearMiss

nm = NearMiss()

# `data` had the 'isAnomaly' label re-attached for the autoencoder split.
# Resample on the predictors only; otherwise the label is handed to every
# supervised model as an input feature and the scores become meaningless.
predictors = data.drop(columns=['isAnomaly'], errors='ignore')

x_nm, y_nm = nm.fit_resample(predictors, target)

In [None]:
print(x_nm.shape,y_nm.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
x_scaled = scalar.fit_transform(x_nm)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
# Seeded so the reported scores are reproducible, and stratified so both
# parts keep the class balance produced by the under-sampling step.
# NOTE(review): x_scaled was standardised on the full resampled set before
# this split, so test-set statistics influence training; consider fitting the
# scaler on x_train only.
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y_nm, test_size = 0.25,
                                                 random_state = RANDOM_SEED, stratify = y_nm)

In [None]:
# Module-level accumulators: one entry is appended per evaluated estimator.
scores = {}
acc, cv_scores = [], []

def model(model):
    """Fit `model`, print its hold-out and cross-validated scores, and record them.

    The estimator is fitted on x_train/y_train and scored on x_test/y_test;
    a 5-fold cross-validation is then run on the training part. The hold-out
    score is appended to `acc` and the mean fold score to `cv_scores`.
    """
    model.fit(x_train,y_train)
    holdout_score = model.score(x_test,y_test)
    print("Accuracy: {}".format(holdout_score))

    mean_fold_score = np.mean(cross_val_score(model,x_train,y_train,cv=5))
    print("Cross Val Score: {}".format(mean_fold_score))

    acc.append(holdout_score)
    cv_scores.append(mean_fold_score)

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Start from empty result lists so re-running this cell does not stack a
# second set of scores on top of the first (which would break the summary
# table built from them).
acc.clear()
cv_scores.clear()

# Evaluate every estimator with its default settings. The order matters: the
# summary table pairs the scores with model names by position.
for estimator_class in (XGBClassifier,
                        LogisticRegression,
                        RandomForestClassifier,
                        DecisionTreeClassifier,
                        KNeighborsClassifier,
                        SVC,
                        GaussianNB,
                        AdaBoostClassifier,
                        GradientBoostingClassifier):
    print(estimator_class.__name__)
    clf = estimator_class()
    model(clf)

In [None]:
# Display names, listed in the same order in which the estimators were evaluated;
# they are matched to the entries of `acc` and `cv_scores` by position.
models = ["XGBClassifier","LogisticRegression","RandomForestClassifier","DecisionTreeClassifier","KNeighborsClassifier","SVC","GaussianNB","AdaBoostClassifier","GradientBoostingClassifier"]
# One row per model: name, hold-out accuracy, mean cross-validation score.
scores = { "Model Name" : models , "Accuracy Score" : acc, "Cross val Score": cv_scores}
df1 = pd.DataFrame(scores)

In [None]:
df1