# In [1]:
import warnings
# Ignore every FutureWarning raised after this point so that deprecation
# notices from the libraries imported below do not clutter the output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Common libraries
import numpy as np
import pandas as pd
# Keep printed DataFrames compact: at most 7 rows / 7 columns are shown and
# floats are rendered with 2 digits of precision.
display_settings = dict(max_rows=7, max_columns=7, precision=2)
for op in display_settings:
    value = display_settings[op]
    # Each key is the suffix of a pandas "display.*" option name.
    pd.set_option(f"display.{op}", value)

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
# Start from the ggplot style sheet, then override individual settings:
# a small font paired with a high DPI, and no top/right axis spines.
plt.style.use('ggplot')
plt.rcParams.update({
    'font.size': 4,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'axes.spines.right': False,
    'axes.spines.top': False,
})

# Estimators
from sklearn.cluster import KMeans
from sklearn.svm import SVC

# Preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from collections import Counter
from imblearn.under_sampling import *
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics

# Load the raw transactions. The 'Class' column is the label: the code below
# treats 0 as the negative class and 1 as the positive class.
df = pd.read_csv("../dataset/creditcard.csv")
df.head(5)  # notebook preview; has no visible effect when run as a script

# Untouched copies of the features / labels (used later for clustering).
samples_untouched = df.drop(columns='Class')
# Series.ravel() is deprecated in pandas 2.2; to_numpy() yields the same
# 1-D NumPy array of labels.
labels_untouched = df['Class'].to_numpy()

# 80/20 train/test split. The scaler is fitted on the training part only and
# then applied to the test part, so no test statistics leak into training.
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train, X_test, y_train, y_test = train_test_split(
    samples_untouched, labels_untouched, test_size=0.2, random_state=8
)
features = X_train.columns
# fit_transform/transform return plain arrays; wrap them back into DataFrames
# (with a fresh 0..n-1 index) so the column names are kept.
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=features)
X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns=features)

# Used in case of outlier removal: scaled training features joined with their
# labels, then split by class. Both frames share a 0..n-1 index, so the
# column-wise concat lines rows up positionally.
train_df = pd.concat([X_train, pd.DataFrame(y_train, columns=['Class'])], axis=1)
neg_samples = train_df.loc[train_df['Class'] == 0]
pos_samples = train_df.loc[train_df['Class'] == 1]

samples_untouched.isnull().sum()  # notebook check for missing values

print(f'Number of negative samples: {Counter(labels_untouched)[0]}\nNumber of positive samples: {Counter(labels_untouched)[1]}')

# Oversample the positive class with SMOTE until it reaches 3% of the size of
# the negative class (sampling_strategy=0.03).
# The `n_jobs` argument is intentionally not passed: it is deprecated in
# imbalanced-learn (since 0.10) and only affected parallelism, not the result.
# NOTE(review): X_train / y_train are rebuilt from neg_samples / pos_samples in
# the outlier-removal step further down, which discards this resampling when
# the file is executed top to bottom -- confirm which path is intended.
sm = SMOTE(random_state=42, sampling_strategy=0.03)
X_train, y_train = sm.fit_resample(X_train, y_train)
print(f'Number of negative samples: {Counter(y_train)[0]}\nNumber of positive samples: {Counter(y_train)[1]}')

# Then undersample with NearMiss using its default settings.
us = NearMiss()
X_train, y_train = us.fit_resample(X_train, y_train)
print(f'Number of negative samples: {Counter(y_train)[0]}\nNumber of positive samples: {Counter(y_train)[1]}')

# Flag roughly 4% of the negative training samples as outliers
# (contamination=0.04) and drop them; fit_predict marks outliers with -1.
# NOTE(review): neg_samples still contains the 'Class' column here, so the
# forest is fitted with that column included -- confirm this is intended.
isof = IsolationForest(contamination=0.04, n_jobs=-1, random_state=42)
y_pred = isof.fit_predict(neg_samples)
non_outlier_mask = y_pred != -1

neg_samples = neg_samples[non_outlier_mask]
print(neg_samples.shape[0])

# Rebuild the training set from the filtered negatives plus all positives,
# with a fresh 0..n-1 index, then split the label column back out.
X_train = pd.concat([neg_samples, pos_samples], ignore_index=True)
# Series.ravel() is deprecated in pandas 2.2; to_numpy() yields the same
# 1-D NumPy array of labels.
y_train = X_train['Class'].to_numpy()
X_train = X_train.drop(columns=['Class'])

# Elbow analysis: fit KMeans for k = 1..4 and record the inertia of each fit.
# The data is `samples_untouched` -- the same frame the final 2-cluster model
# below is fitted on (the previous name `samples` is not defined in this file
# and raised a NameError). n_init='auto' matches the final KMeans call.
cluster_counts = range(1, 5)
inertia_values = []
for i in cluster_counts:
    kmeans_clusterer = KMeans(n_clusters=i, init='k-means++', n_init='auto', random_state=1)
    kmeans_clusterer.fit(samples_untouched)
    inertia_values.append(kmeans_clusterer.inertia_)

# Plot inertia against the number of clusters.
plt.figure(figsize=(1.25, 0.75))
plt.plot(
    cluster_counts, inertia_values,
    color="#42A5F5",
    linewidth=1,
)
plt.xticks(cluster_counts)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Cluster the unscaled, unsplit samples into two groups.
kmeans_clusterer = KMeans(
    n_clusters=2, init='k-means++', n_init='auto', random_state=1
).fit(samples_untouched)

# Cluster id assigned to every sample.
sample_cluster_mask = kmeans_clusterer.labels_

# Cross-tabulate true classes (rows) against cluster ids (columns).
cf_matrix = metrics.cluster.contingency_matrix(labels_untouched, sample_cluster_mask)

# Each heatmap cell is annotated with a name, the raw count and its share of
# all samples.
# NOTE(review): the TN/FP/FN/TP names assume cluster 0 corresponds to class 0
# and cluster 1 to class 1 -- KMeans cluster ids are arbitrary, so confirm.
group_names = ['TN', 'FP', 'FN', 'TP']
flat_counts = cf_matrix.flatten()
group_counts = [f'{value:0.0f}' for value in flat_counts]
group_percentages = [f'{value:.2%}' for value in flat_counts / np.sum(cf_matrix)]
labels_cf = np.asarray(
    ['\n'.join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

plt.figure(figsize=(2, 2))
sns.heatmap(cf_matrix, annot=labels_cf, fmt='', cmap='Blues')

def purity_score(cf_matrix):
    """Return the clustering purity for a contingency matrix.

    Purity assigns every cluster to the true class that is most frequent
    inside it and reports the fraction of samples that are then correctly
    assigned.

    Args:
        cf_matrix: 2-D array-like of counts laid out as produced by
            ``sklearn.metrics.cluster.contingency_matrix``: one row per true
            class, one column per cluster.

    Returns:
        Purity as a float (1.0 means every cluster holds a single class).
    """
    cf_matrix = np.asarray(cf_matrix)
    # Columns are clusters, so the dominant class of each cluster is the
    # column-wise maximum (axis=0). Taking the row-wise maximum instead would
    # measure how concentrated each *class* is, which is not purity.
    return np.sum(np.max(cf_matrix, axis=0)) / np.sum(cf_matrix)

print(f'Clustering purity: {purity_score(cf_matrix)}')

# Grid search over the sigmoid and RBF kernels: every combination of C and
# gamma is scored by 5-fold cross-validated accuracy on the training data.
svm_classifier = SVC()
param_combinations = {
    'kernel': ['sigmoid', 'rbf'],
    'C': [0.1, 1.0, 10],
    'gamma': [0.01, 0.1, 1.0, 10.0],
}

clf = GridSearchCV(
    svm_classifier,
    param_combinations,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
# Best mean cross-validation score, then the parameters that produced it.
print(clf.best_score_, clf.best_params_, sep='\n')

# Separate grid search for the polynomial kernel, which additionally tunes
# the polynomial degree; scored by 5-fold cross-validated accuracy.
poly_svm_classifier = SVC()
poly_param_combinations = {
    'kernel': ['poly'],
    'C': [0.1, 1.0, 10],
    'gamma': [0.01, 0.1, 1.0, 10.0],
    'degree': [3, 4, 5],
}

clf_poly = GridSearchCV(
    poly_svm_classifier,
    poly_param_combinations,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
clf_poly.fit(X_train, y_train)
# Best mean cross-validation score, then the parameters that produced it.
print(clf_poly.best_score_, clf_poly.best_params_, sep='\n')

# Train the final classifier: a degree-4 polynomial SVM with otherwise default
# hyper-parameters.
# NOTE(review): kernel/degree are hard-coded rather than read from
# clf_poly.best_params_ -- presumably copied from an earlier search run; confirm.
opt_clf = SVC(kernel='poly', degree=4)
opt_clf.fit(X_train, y_train)

y_pred = opt_clf.predict(X_test)

print(f'Number of negative samples: {Counter(y_pred)[0]}\nNumber of positive samples: {Counter(y_pred)[1]}')

# confusion_matrix with explicit labels always returns a 2x2 matrix
# (rows = true class, columns = predicted class), even if the classifier
# predicts only one class. The clustering contingency matrix used before
# would collapse to a single column in that case and break both the
# reshape(2, 2) and the [row][col] indexing below.
cf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(2, 2))
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = ['{0:0.0f}'.format(value) for value in cf_matrix.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
labels_cf = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels_cf = np.asarray(labels_cf).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels_cf, fmt='', cmap='Greens')

# Row-major order of the 2x2 matrix is TN, FP, FN, TP.
tn, fp, fn, tp = cf_matrix.ravel()

# Guard every ratio against an empty denominator (e.g. no positive
# predictions at all); such a metric is reported as 0.0 instead of NaN.
recall_score = tp / (tp + fn) if (tp + fn) else 0.0
precision_score = tp / (tp + fp) if (tp + fp) else 0.0
accuracy = (tp + tn) / (tp + tn + fn + fp) if (tp + tn + fn + fp) else 0.0
fone = tp / (tp + 0.5 * (fp + fn)) if (tp + fp + fn) else 0.0

print(f'Accuracy: {(accuracy)}\nRecall score: {recall_score}\nPrecision score: {precision_score}\nF1-score: {fone}')

# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: '../dataset/creditcard.csv'