In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.ensemble
import sklearn.svm

import junoutils

In [None]:
# Isolation forest with default parameters. `ifst` is rebound inside the
# contamination sweep further down before it is ever fitted.
ifst = sklearn.ensemble.IsolationForest()

In [None]:
# Load the summary object; later cells treat it as a pandas DataFrame
# (they use s.columns, s.drop and column indexing).
# NOTE(review): openPickle presumably unpickles the file - only load pickles
# from a trusted source; confirm against junoutils.
s = junoutils.openPickle('summary_withemb.pickle')

In [None]:
# Presumably column-name prefixes that were meant to be dropped.
# NOTE(review): nothing in the visible notebook reads this list - confirm
# whether it is still needed.
drop_prefixes = ['__v']

In [None]:
# List every column label of the summary frame, one per line.
for column_name in s:
    print(column_name)

In [None]:
# Columns removed from the summary frame before modelling (they are passed to
# s.drop together with `values`). 'value_std' also matches the 'value'
# substring filter used to build `values`.
drops = ['email','tSNE_x', 'tSNE_y', 'suspected_fraud', 'fraud_count', 'value_std']

In [None]:
# Every column whose name contains the substring 'value'.
values = [name for name in s.columns if 'value' in name]

In [None]:
# Feature frame: the summary minus the hand-picked columns and every
# '*value*' column.
sd = s.drop(columns=drops + values)

In [None]:
# Scale the feature frame with the project helper.
# NOTE(review): the scaling method lives in junoutils and is not visible here.
# This call runs before the null/inf report and the float32 cast performed on
# `sd` in the next cell - TODO confirm that ordering is intended.
sd_scaled = junoutils.scaleDf(sd)

In [None]:
# Data-quality report for the feature frame `sd`, followed by an in-place cast
# of every column to float32.

print("Null columns")
for col in sd:
    # pandas (not numpy) provides isnull; count the missing entries once.
    null_count = pd.isnull(sd[col]).sum()
    if null_count > 0:
        print(col, null_count)

print("Inf Columns:")
for col in sd:
    try:
        # Counts every non-finite entry, so NaN values are included as well.
        non_finite_count = np.sum(~np.isfinite(sd[col]))
    except TypeError:
        # np.isfinite is undefined for non-numeric dtypes; such columns are
        # reported by the conversion loop below instead.
        continue
    if non_finite_count > 0:
        print(col, non_finite_count)

print("Columns could not convert to float32")
for col in sd:
    try:
        sd[col] = sd[col].astype('float32')
    except (ValueError, TypeError):
        # Only conversion failures are expected here; anything else should
        # surface instead of being silently swallowed.
        print(col)

In [None]:
# Sweep the assumed contamination rate of an isolation forest and record, for
# each rate, how many of the known suspected-fraud accounts it flags and how
# many accounts would have to be checked in total.
fraud_percentage = []
caught = []
percent_caught = []
number_to_check = []
bang_for_buck = []

# The known-fraud list does not depend on the contamination rate: build it once.
flagged_emails = list(s['email'][s['suspected_fraud'] == True])

for con in [0.001, 0.01, 0.1, 0.2, 0.3]:

    # Fixed random_state so repeated runs flag the same rows.
    ifst = sklearn.ensemble.IsolationForest(contamination=con, random_state=42)
    ifst.fit(sd_scaled)
    anomoly = ifst.predict(X=sd_scaled)

    s['anomoly'] = anomoly

    # predict() marks outliers with -1.
    anomoly_emails = list(s['email'][s['anomoly'] == -1])
    anomoly_email_set = set(anomoly_emails)  # O(1) membership tests

    count = sum(1 for email in flagged_emails if email in anomoly_email_set)

    # Guard both ratios so an empty flagged list or an empty anomaly list
    # yields 0 instead of a ZeroDivisionError.
    caught_fraction = count / len(flagged_emails) if flagged_emails else 0.0
    bb = caught_fraction / len(anomoly_emails) if anomoly_emails else 0.0

    caught.append(count)
    fraud_percentage.append(con)
    percent_caught.append(caught_fraction)
    number_to_check.append(len(anomoly_emails))
    bang_for_buck.append(bb)

    print('Contamination: {}, Percent Caught: {}, Percent Caught/Number To Check: {}'.format(con, caught_fraction, bb))

# Min-max normalise the efficiency score; fall back to zeros when every rate
# scored the same (the range would be 0).
bb = np.array(bang_for_buck)
bb_range = np.max(bb) - np.min(bb)
bb_normalised = (bb - np.min(bb)) / bb_range if bb_range > 0 else np.zeros_like(bb)

results = pd.DataFrame({'Assumed Fraud Percentage': fraud_percentage,
                        'Number Caught': caught,
                        'Percent Caught of Known Accounts': percent_caught,
                        'Number to Check': number_to_check,
                        'Bang for Your Buck': bb_normalised})

results

In [None]:
# One-class SVM fitted and scored on the same scaled features.
# NOTE: `ifst` and `anomoly` are rebound here; from this cell on `ifst` refers
# to the SVM, not to an isolation forest.
ifst = sklearn.svm.OneClassSVM()
ifst.fit(sd_scaled)
anomoly = ifst.predict(X=sd_scaled)

In [None]:
# Display the one-class SVM predictions computed in the previous cell.
anomoly

In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
%matplotlib inline
# Global plot styling and notebook-wide constants.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8  # default figure size
RANDOM_SEED = 42  # passed to train_test_split as random_state
LABELS = ["Normal", "Fraud"]  # tick labels for the confusion-matrix heatmap

In [None]:
# Columns that must never reach the autoencoder as features.
# 'email' and 'fraud' are only attached to sd_scaled by the following cell, so
# they are listed explicitly: scanning the column names at this point would
# miss 'fraud' on a fresh top-to-bottom run and leave the label in the inputs.
drop_cols = ['email', 'fraud']

# Also drop any column that is already present and looks label- or
# value-derived.
for col in sd_scaled:
    if 'fraud' in col or 'value' in col:
        print(col)
        drop_cols.append(col)

# De-duplicate (re-running after the next cell finds 'fraud' a second time).
drop_cols = list(set(drop_cols))

In [None]:
# `data` is the same object as sd_scaled (no copy), so the two columns added
# here also appear on sd_scaled, which later cells read (e.g. sd_scaled.email).
data = sd_scaled
data['fraud'] = s['suspected_fraud']
data['email'] = s['email']

X_train, X_test = train_test_split(data, random_state=RANDOM_SEED, test_size=0.2)

# Capture the test labels before drop_cols is removed from the test split.
y_test = X_test['fraud']

# Train only on rows whose fraud flag equals 0; both splits end up as plain
# arrays without the drop_cols columns.
X_train = X_train.loc[X_train.fraud == 0].drop(columns=drop_cols).values
X_test = X_test.drop(columns=drop_cols).values
X_train.shape

# Symmetric dense autoencoder: input -> encoding_dim -> bottleneck ->
# bottleneck -> input.
input_dim = X_train.shape[1]

# Keep every hidden layer at least one unit wide: plain integer division
# yields a zero-width layer whenever there are fewer than 20 input features.
encoding_dim = max(1, input_dim // 10)
bottleneck_dim = max(1, encoding_dim // 2)

input_layer = Input(shape=(input_dim, ))
# Note: 10e-5 is 1e-4.
encoder = Dense(encoding_dim, activation="tanh",
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(bottleneck_dim, activation="relu")(encoder)
decoder = Dense(bottleneck_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
# Training configuration.
nb_epoch = 100
batch_size = 32

autoencoder.compile(loss='mean_squared_error',
                    optimizer='adam',
                    metrics=['accuracy'])

# Checkpoint to model.h5, keeping only the best model seen so far.
checkpointer = ModelCheckpoint(filepath="model.h5", save_best_only=True, verbose=0)

# The network is fit to reproduce its own input; X_test is the validation
# set. Only the per-epoch history dict is kept.
history = autoencoder.fit(
    X_train, X_train,
    validation_data=(X_test, X_test),
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    callbacks=[checkpointer],
    verbose=1,
).history

In [None]:
# Replace the in-memory model with the one stored in model.h5, the file the
# ModelCheckpoint callback writes during training.
autoencoder = load_model('model.h5')

In [None]:
# Training and validation loss per epoch, drawn in legend order.
for series_name in ('loss', 'val_loss'):
    plt.plot(history[series_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right');

In [None]:
# Per-row mean squared reconstruction error on the test split, paired with
# the true label.
predictions = autoencoder.predict(X_test)
residuals = X_test - predictions
mse = np.power(residuals, 2).mean(axis=1)
error_df = pd.DataFrame(dict(reconstruction_error=mse, true_class=y_test))
error_df.describe()

In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [None]:
# ROC curve using the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(error_df.true_class, error_df.reconstruction_error)
roc_auc = auc(fpr, tpr)
auc_label = 'AUC = {:0.4f}'.format(roc_auc)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label=auc_label)
plt.legend(loc='lower right')
# Dashed diagonal through (0, 0) and (1, 1) for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim((-0.001, 1))
plt.ylim((0, 1.001))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show();

In [None]:
# Precision-recall curve of the reconstruction error against the true class.
pr_inputs = (error_df.true_class, error_df.reconstruction_error)
precision, recall, th = precision_recall_curve(*pr_inputs)

plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

In [None]:
# precision_recall_curve returns one more precision value than thresholds:
# precision[i] belongs to th[i] and the final entry has no threshold at all.
# Drop the last value (not the first) so each point lines up with its own
# threshold.
plt.plot(th, precision[:-1], 'b', label='Threshold-Precision curve')
plt.title('Precision for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision')
plt.show()

In [None]:
# precision_recall_curve returns one more recall value than thresholds:
# recall[i] belongs to th[i] and the final entry has no threshold at all.
# Drop the last value (not the first) so each point lines up with its own
# threshold.
plt.plot(th, recall[:-1], 'b', label='Threshold-Recall curve')
plt.title('Recall for different threshold values')
plt.xlabel('Reconstruction error')
plt.ylabel('Recall')
plt.show()

In [None]:
# Cut-off on the reconstruction error used for the plot and for the
# confusion matrix that follows.
threshold = 0.1

groups = error_df.groupby('true_class')
fig, ax = plt.subplots()

# One marker-only series per true class.
for name, group in groups:
    class_label = "Fraud" if name == 1 else "Normal"
    ax.plot(group.index, group.reconstruction_error,
            linestyle='', marker='o', ms=3.5, label=class_label)

# Horizontal threshold line spanning the current x-range.
x_low, x_high = ax.get_xlim()
ax.hlines(threshold, x_low, x_high, colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.xlabel("Data point index")
plt.ylabel("Reconstruction error")
plt.show();

# Binarise the reconstruction error at `threshold` (1 = above the cut-off)
# and compare with the true class.
y_pred = (error_df.reconstruction_error.values > threshold).astype(int).tolist()
conf_matrix = confusion_matrix(error_df.true_class, y_pred)

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, annot=True, fmt="d", xticklabels=LABELS, yticklabels=LABELS);
plt.title("Confusion matrix")
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

In [None]:
# Inspect the columns of sd_scaled; by this point it also carries the 'fraud'
# and 'email' columns added through the `data` alias.
sd_scaled.columns

In [None]:
# Preview the feature frame that the autoencoder scores. It is built inline
# because `d` is only assigned in the following cell, so a bare `d` here
# raises NameError on a fresh top-to-bottom run.
sd_scaled.drop(drop_cols, axis=1).head()

In [None]:
# Score every row: d holds the model inputs, c the reconstructions.
d = sd_scaled.drop(columns=drop_cols)
c = autoencoder.predict(d)

# Element-wise squared error per feature, then the mean per row.
squared_error = (d - c) ** 2
mse = squared_error.mean(axis=1)

In [None]:
# For each row: the largest per-feature squared error and the name of the
# feature that produced it.
squared_error_matrix = np.asarray(squared_error)
max_squared_error = squared_error_matrix.max(axis=1)
col_of_biggest_squared_error = np.array(d.columns[squared_error_matrix.argmax(axis=1)])

In [None]:
# Flag rows whose mean squared reconstruction error exceeds the cut-off and
# compare them with the accounts already marked as suspected fraud.
# NOTE(review): this cut-off (0.08) differs from the 0.1 `threshold` used for
# the test-set plots - confirm which value is intended.
anomoly = mse > 0.08
anomoly_mask = np.asarray(anomoly)  # plain boolean array for the numpy arrays

predicted_emails = list(sd_scaled.email[anomoly])
max_sqe = list(max_squared_error[anomoly_mask])
col_biggest_error = col_of_biggest_squared_error[anomoly_mask]

# Rebuilt here so the cell does not rely on a variable left over from the
# isolation-forest loop.
flagged_emails = list(s['email'][s['suspected_fraud'] == True])
predicted_email_set = set(predicted_emails)  # O(1) membership tests

count = 0

# Count the known accounts that were flagged; print the ones that were missed.
for email in flagged_emails:
    if email in predicted_email_set:
        count += 1
    else:
        print(email)

# Avoid a ZeroDivisionError when there are no known accounts. The value is a
# fraction in [0, 1].
caught_fraction = count / len(flagged_emails) if flagged_emails else 0.0

print('Found {} out of {} or {} percent, {} other users flagged'.format(count, len(flagged_emails), caught_fraction, len(predicted_emails)))

In [None]:
# One row per flagged account, ordered from the largest reconstruction error
# downwards.
results = pd.DataFrame({
    'email': predicted_emails,
    'prediction_error': mse[anomoly],
    'column_biggest_error': col_biggest_error,
    'biggest_error_value': max_sqe,
})
results = results.sort_values(by='prediction_error', ascending=False).reset_index(drop=True)

# Mark the accounts that are already on the known suspected-fraud list.
results['known fraudster'] = results.email.apply(lambda address: address in flagged_emails)

# Rows with any missing value are discarded before export.
results = results.dropna()

results.to_csv('anomaly_detection_results.csv')

results.to_html('anomaly_detection_results.html')

results

In [None]:
# How often each feature is the one with the largest squared error among the
# flagged rows.
results.column_biggest_error.value_counts()

In [None]:
# Density estimate plus rug of the flagged rows' reconstruction error.
# distplot is deprecated in seaborn; kdeplot and rugplot draw the same two
# layers (no histogram) on a shared axes.
ax = sns.kdeplot(results.prediction_error)
sns.rugplot(results.prediction_error, ax=ax);