In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from scipy.stats import multivariate_normal


In [None]:
# Load the credit-card transactions CSV into a DataFrame and show (rows, columns).
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
df.shape

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# Count each class once instead of re-filtering the frame for every print.
n_fraud = int((df['Class'] == 1).sum())
n_normal = int((df['Class'] == 0).sum())
print("Fraud cases: ", n_fraud)
print("Normal Cases: ", n_normal)
# Contamination is the share of fraud rows among ALL labelled rows, in percent.
# (Dividing by the normal count alone gives the fraud-to-normal ratio, which
# slightly overstates it.)
print("Contamination: {}".format(n_fraud / (n_fraud + n_normal) * 100))

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Keep every fraud row, but only a reproducible 5% sample of the normal rows,
# for the t-SNE visualisation.
fraud_mask = df['Class'] == 1
normal_mask = df['Class'] == 0
tsne_data_fraud = df.loc[fraud_mask]
tsne_data_normal = df.loc[normal_mask].sample(frac=0.05, random_state=1)
print(tsne_data_fraud.shape)
tsne_data_normal.shape

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two subsets into one frame with a fresh 0..n-1 index.
tsne_frame = pd.concat([tsne_data_fraud, tsne_data_normal], ignore_index=True)
# Seed the shuffle so the row order is the same on every run.
tsne_frame = shuffle(tsne_frame, random_state=0)

# Last column is taken as the label; the first 30 columns are the inputs.
label = tsne_frame.iloc[:, -1]
tsne_features = tsne_frame.iloc[:, :30].astype(np.float64)

# Standardise each input column (zero mean, unit variance) before t-SNE.
standard_scaler = StandardScaler()
tsne_data = standard_scaler.fit_transform(tsne_features)
print(label.shape)
tsne_data.shape


In [None]:
# Project tsne_data down to two dimensions with t-SNE (seeded via random_state).
tsne = TSNE(n_components=2, random_state=0)
# NOTE(review): this overwrites tsne_data with the embedding, so re-running this
# cell on its own would feed the 2-D output back into t-SNE.
tsne_data = tsne.fit_transform(tsne_data)
tsne_data

In [None]:
# Shape of the embedding returned by t-SNE.
tsne_data.shape

In [None]:
# Append the label as an extra column next to the embedding coordinates,
# giving one row per sample: (coordinates..., label).
tsne_plot = np.column_stack((tsne_data, label))
tsne_plot.shape

In [None]:
# Wrap the stacked array in a DataFrame so seaborn can address columns by name.
tsne_plot = pd.DataFrame(tsne_plot, columns=("V1", "V2", "Class"))
# `size` was renamed to `height` in seaborn 0.9 and later removed; `height`
# sets the facet height in inches. Points are coloured by Class.
sns.FacetGrid(tsne_plot, height=6, hue='Class').map(plt.scatter, "V1", "V2").add_legend()

In [None]:
# Draw a 50-bin histogram for each column of df on one large figure.
df.hist(figsize=(20,20), bins=50, color='green', alpha=0.5)
plt.show()

In [None]:
# Seed both random draws so the train / cross-validation / test splits built
# from these frames are identical on every Restart & Run All.
df_fraud = shuffle(df[df['Class'] == 1], random_state=0)
# .sample() already returns the rows in random order, so no extra shuffle is
# needed. Raises ValueError if there are fewer than 280000 normal rows.
df_normal = df[df['Class'] == 0].sample(n=280000, random_state=0)
print(df_fraud.shape)
df_normal.shape

In [None]:
# Training set: the first 240000 normal rows, without the label and Time columns.
train_rows = df_normal.iloc[:240000]
df_train = train_rows.drop(columns=['Class', 'Time'])

In [None]:
# Cross-validation set: normal rows 240000..259999 plus the first 246 fraud rows.
# DataFrame.append was removed in pandas 2.0, so the pieces are stacked with
# pd.concat; the shuffle is seeded for reproducibility.
df_cross = shuffle(
    pd.concat([df_normal.iloc[240000:260000, :], df_fraud.iloc[:246, :]]),
    random_state=0,
)
Y_cross = df_cross.loc[:, "Class"]


In [None]:
# Drop the label and Time columns from the cross-validation inputs.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df_cross = df_cross.drop(labels=['Class', 'Time'], axis=1, errors='ignore')

# Test set: the remaining normal rows (260000 onward) plus the remaining fraud rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the shuffle is seeded.
df_test = shuffle(
    pd.concat([df_normal.iloc[260000:, :], df_fraud.iloc[246:, :]]),
    random_state=0,
)
Y_test = df_test.loc[:, 'Class']
df_test = df_test.drop(labels=["Class", "Time"], axis=1)
df_test.shape

In [None]:
def mean_variance(data):
    """Estimate the mean vector and covariance matrix of a sample.

    Rows of ``data`` are treated as observations and columns as variables.

    Returns
    -------
    tuple
        ``(mean, cov)``: the per-column mean (``np.mean`` over axis 0) and the
        covariance between columns as computed by ``np.cov`` with its defaults.
    """
    # np.cov expects one variable per row, hence the transpose.
    sample_cov = np.cov(data.T)
    sample_mean = np.mean(data, axis=0)
    return sample_mean, sample_cov


def gaussian_dist(data, mean, cov):
    """Evaluate the multivariate normal density N(mean, cov) at ``data``.

    ``data`` may be a single point or a collection of points (one per row);
    the return value is the density at each point, as given by
    ``scipy.stats.multivariate_normal``.
    """
    # Build the distribution once ("frozen"), then evaluate its pdf.
    fitted = multivariate_normal(mean=mean, cov=cov)
    return fitted.pdf(data)



In [None]:
# Estimate the Gaussian parameters from the training frame and print their shapes.
mean , cov = mean_variance(df_train)
print(mean.shape)
print(cov.shape)

In [None]:
# Density of every training row under the fitted Gaussian; display the smallest one.
prob_train = gaussian_dist(df_train, mean, cov)
prob_train.min()

In [None]:
# Densities of the cross-validation and test rows under the fitted Gaussian.
prob_cross = gaussian_dist(df_cross, mean, cov)
prob_test = gaussian_dist(df_test, mean, cov)
# Only a cell's last expression is displayed, so show both shapes as one tuple
# (a bare `prob_cross.shape` on an earlier line is silently discarded).
prob_cross.shape, prob_test.shape

In [None]:
def optimize_for_epsilon(prob_train, prob_cross, Y_cross,
                         max_e=2.062044871798754e-79, n_steps=1000,
                         verbose=False):
    """Grid-search the density threshold that maximises F1 on the cross-validation set.

    A sample is predicted positive when its density is strictly below the
    candidate threshold. Candidates start at ``prob_cross.min()`` and advance
    towards ``max_e`` (exclusive) in steps of
    ``(max_e - prob_train.min()) / n_steps``.

    Parameters
    ----------
    prob_train : array-like
        Densities of the training samples; only the minimum is used, to size
        the step.
    prob_cross : array-like
        Densities of the cross-validation samples.
    Y_cross : array-like
        True binary labels of the cross-validation samples.
    max_e : float, optional
        Upper bound of the search. The default is the constant that used to be
        hard-coded here. NOTE(review): its origin is not documented in this
        notebook; revisit it when the data or the split changes.
    n_steps : int, optional
        Number of steps used to derive the step size (default 1000).
    verbose : bool, optional
        When True, print the metrics for every candidate threshold. Off by
        default to keep the notebook output readable.

    Returns
    -------
    tuple
        ``(best_f1, best_epsilon, recall)`` where ``recall`` is the recall
        obtained at ``best_epsilon``. If no candidate reaches an F1 above 0
        (or the grid is empty) the result is ``(0, None, 0.0)``.

    Raises
    ------
    ValueError
        If ``n_steps`` is not positive or ``max_e`` is not greater than
        ``prob_train.min()`` (the step would be non-positive).
    """
    prob_train = np.asarray(prob_train)
    prob_cross = np.asarray(prob_cross)

    if n_steps <= 0:
        raise ValueError("n_steps must be a positive integer")
    min_e = prob_train.min()
    step = (max_e - min_e) / n_steps
    # `not step > 0` also rejects NaN.
    if not step > 0:
        raise ValueError("max_e must be greater than prob_train.min()")

    best_f1 = 0
    best_epsilon = None
    best_recall = 0.0

    for e in np.arange(prob_cross.min(), max_e, step):
        Y_cross_pred = prob_cross < e
        precision, recall, f1_score, support = prfs(Y_cross, Y_cross_pred, average="binary")
        if verbose:
            print("for epsilon: {}".format(e))
            print("f1_score: {}".format(f1_score))
            print("recall: {}".format(recall))
            print("support: {}".format(support))
            print("precision: {}".format(precision))
            print()

        if f1_score > best_f1:
            best_f1 = f1_score
            best_epsilon = e
            # Remember the recall that belongs to the best threshold; the loop
            # variable itself is overwritten on every iteration.
            best_recall = recall

    return best_f1, best_epsilon, best_recall

In [None]:
# Search for the threshold using the cross-validation densities and labels,
# then print the three values the search returns.
best_f1, best_epsilon, recall = optimize_for_epsilon(prob_train, prob_cross, Y_cross)
print(best_f1, best_epsilon, recall)


In [None]:
# Predict positive for every test row whose density is below the chosen threshold.
Y_test_pred = prob_test < best_epsilon
# NOTE: this rebinds `recall` (set by the epsilon search) to the test-set value;
# the fourth element (support) is unpacked into `ignore` and not used in this cell.
precision, recall, f1_score, ignore = prfs(Y_test, Y_test_pred, average="binary")
precision

In [None]:
# Threshold returned by the epsilon search.
best_epsilon

In [None]:
# F1 score on the test set (from Y_test vs. Y_test_pred).
f1_score