# Init

In [19]:
!pip install lime
!pip install shap



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import lime
import lime.lime_tabular
import shap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import GridSearchCV

In [21]:
# Load the credit-card transactions CSV into a DataFrame; the following cells
# expect it to contain a 'Class' column.
# NOTE(review): the recorded run of this cell failed with
# "ParserError: ... Calling read(nbytes) on source failed", i.e. the read of
# the file itself failed — presumably the file at this path is missing,
# truncated or on an unreliable mount; verify before re-running.
data = pd.read_csv('files/creditcard.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Separate the label from the predictors: `y` is the 'Class' column and `X`
# is every remaining column of `data`.
y = data['Class']
X = data.drop(columns='Class')

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then re-used for the test split, so no test-set statistics leak into
# training.
scaler = StandardScaler()

# The scaler returns plain NumPy arrays. Rebuild the DataFrames with the
# original column names AND the original row index: without `index=` the
# frames get a fresh 0..n-1 index and stop aligning with y_train / y_test,
# which still carry the (shuffled) index produced by train_test_split.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Decision Tree

## First DT model

In [None]:
# Baseline decision tree with no depth limit, fitted on the scaled training
# split. `fit` returns the estimator itself, so construction and fitting can
# be chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
clf  # last expression of the cell, so the notebook still displays the fitted estimator

In [None]:
# Predict the test split with the baseline tree and print the per-class
# precision / recall / F1 report.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Draw the full (depth-unlimited) tree, with nodes coloured by majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    # Pass a plain list: `X.columns` is a pandas Index, and some scikit-learn
    # releases validate `feature_names` strictly as a list and reject an Index.
    feature_names=list(X.columns),
    class_names=["No Fraud", "Fraud"],
)
plt.show()

In [None]:
# Print the depth reached by the first tree, which was built without a
# max_depth limit.
print(clf.get_depth())

## Second DT model (pruned)

In [None]:
# Depth limit used for the pruned tree and, later, for the tree retrained on
# the reduced feature set.
max_depth = 7

In [None]:
# Same tree as before but capped at `max_depth` levels, fitted on the scaled
# training split.
pruned_clf = DecisionTreeClassifier(
    max_depth=max_depth,
    random_state=42,
).fit(X_train, y_train)
pruned_clf  # last expression of the cell, so the fitted estimator is still displayed

In [None]:
# Evaluate the depth-limited tree on the test split.
y_pred_pruned = pruned_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_pruned))

In [None]:
# Draw the depth-limited tree, with nodes coloured by majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    pruned_clf,
    filled=True,
    # Pass a plain list: `X.columns` is a pandas Index, and some scikit-learn
    # releases validate `feature_names` strictly as a list and reject an Index.
    feature_names=list(X.columns),
    class_names=["No Fraud", "Fraud"],
)
plt.show()

## Tree Feature Importance

In [None]:
# Per-feature importances of the first, depth-unlimited tree (`clf`), not of
# `pruned_clf`; one value per column of the training data.
feature_importances = clf.feature_importances_

In [None]:
# Order the features from most to least important: argsort is ascending, so
# the result is flipped.
indices = np.flip(np.argsort(feature_importances))

# One bar per feature, labelled with the matching column name.
n_features = X.shape[1]
bar_positions = range(n_features)

plt.figure(figsize=(10, 6))
plt.title("Feature Importance in Decision Tree")
plt.bar(bar_positions, feature_importances[indices], align="center")
plt.xticks(bar_positions, X.columns[indices], rotation=90)
plt.show()

## LIME

In [None]:
# Explain a single test-set prediction of the depth-limited tree with LIME.
# LIME perturbs the instance with random samples, so the explainer is seeded
# (same seed as the models in this notebook) to make the explanation
# reproducible from one run to the next.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X.columns,
    class_names=['No Fraud', 'Fraud'],
    verbose=True,
    mode='classification',
    random_state=42,
)

i = 0  # position (not index label) of the explained row in X_test
exp = explainer.explain_instance(X_test.iloc[i].values, pruned_clf.predict_proba, num_features=10)

exp.show_in_notebook(show_table=True)

## SHAP

In [None]:
# Build a SHAP tree explainer for the depth-limited tree. This rebinds the
# name `explainer`, which held the LIME explainer up to this point.
explainer = shap.TreeExplainer(pruned_clf)

In [None]:
# SHAP values for every row of the (scaled) test split.
shap_values = explainer.shap_values(X_test)

In [None]:
# Display the shape of the SHAP array. The cells below index it with three
# axes; presumably (rows, features, classes) — confirm against the installed
# shap version, since the return layout has differed between releases.
shap_values.shape

In [None]:
# Slice index 1 of the last axis and display the shape of the result.
# NOTE(review): presumably the last axis is the class axis, making this the
# 'Fraud' class — confirm. `test` is not read again in the visible cells.
test = shap_values[:,:,1]
test.shape

In [None]:
# Summary plot of the SHAP values at index 0 of the last axis.
# NOTE(review): if the last axis is the class axis, this shows contributions
# towards the first class ('No Fraud' in the plots above), not towards
# 'Fraud' — confirm this is the intended view.
first_slice_shap = shap_values[:, :, 0]
shap.summary_plot(first_slice_shap, X_test)

In [None]:
# Call the explainer directly on the test split; the returned object exposes
# `.shape` (displayed here) and `.values` (used by the force plot below).
shap_values_test = explainer(X_test)
shap_values_test.shape

In [None]:
# Force plot for the first test row, using index 0 of both the expected
# values and the last axis of the SHAP values.
# NOTE(review): presumably index 0 is the 'No Fraud' class — confirm.
shap.initjs()  # loads the JavaScript the interactive plot needs

base_value = explainer.expected_value[0]
row_contributions = shap_values_test.values[0, :, 0]
row_features = X_test.to_numpy()[0, :]

shap.force_plot(base_value, row_contributions, row_features)

## Retrain with Important Features

In [None]:
# Columns kept for the reduced model.
# NOTE(review): presumably hand-picked from the importance / LIME / SHAP
# plots above — confirm, the list is not derived programmatically.
important_features = ['V17', 'V14', 'V12', 'V10']

In [None]:
# Restrict both splits to the selected columns only.
X_train_reduced, X_test_reduced = (
    split[important_features] for split in (X_train, X_test)
)

In [None]:
# Depth-limited tree trained on the reduced feature set only.
final_clf = DecisionTreeClassifier(
    max_depth=max_depth,
    random_state=42,
).fit(X_train_reduced, y_train)
final_clf  # last expression of the cell, so the fitted estimator is still displayed

In [None]:
# Evaluate the reduced-feature tree on the matching reduced test split.
y_pred_final = final_clf.predict(X_test_reduced)
print(classification_report(y_true=y_test, y_pred=y_pred_final))

# SVM

## SVM Linear

In [None]:
def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
    """Show the confusion matrix of *y_pred* against *y_true* as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same samples as ``y_true``.
        title: Text displayed above the plot.

    The axis tick labels are fixed to 'Not Fraud' / 'Fraud', so the helper is
    tailored to a two-class problem. The figure is shown immediately and
    nothing is returned.
    """
    tick_labels = ['Not Fraud', 'Fraud']
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 7))
    sns.heatmap(
        counts,
        annot=True,   # write the count inside every cell
        fmt='d',      # counts are integers
        cmap='Blues',
        cbar=False,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(title)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

In [None]:
# Linear-kernel SVM fitted on the full scaled training split.
svm_linear = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
svm_linear  # last expression of the cell, so the fitted estimator is still displayed

In [None]:
# Predictions of the linear SVM on the scaled test split.
y_pred_linear = svm_linear.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the linear SVM on the test split.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_linear),
)

In [None]:
# Confusion-matrix heatmap for the linear SVM predictions.
plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_linear,
    title='Confusion Matrix for SVM Linear',
)

## SVM Kernel

In [None]:
def _interaction_term(frame):
    """Return the element-wise product of the first four columns of *frame*."""
    return frame.iloc[:, 0] * frame.iloc[:, 1] * frame.iloc[:, 2] * frame.iloc[:, 3]


# Add a hand-made interaction feature named 'kernel' — the product of the
# four columns of the reduced frames — to copies of both splits.
X_train_kernel = X_train.copy()
X_test_kernel = X_test.copy()
X_train_kernel['kernel'] = _interaction_term(X_train_reduced)
X_test_kernel['kernel'] = _interaction_term(X_test_reduced)

# Standardise the augmented frames with a dedicated scaler, fitted on the
# training split only, so the new column is on the same scale as the others.
scaler_kernel = StandardScaler()
train_scaled = scaler_kernel.fit_transform(X_train_kernel)
test_scaled = scaler_kernel.transform(X_test_kernel)
X_train_kernel = pd.DataFrame(train_scaled, columns=X_train_kernel.columns)
X_test_kernel = pd.DataFrame(test_scaled, columns=X_test_kernel.columns)

In [None]:
# Linear-kernel SVM refitted on the data that includes the extra 'kernel'
# column. This rebinds `svm_linear`, replacing the model trained above.
svm_linear = SVC(kernel='linear', random_state=42).fit(X_train_kernel, y_train)
svm_linear  # last expression of the cell, so the fitted estimator is still displayed

In [None]:
# Predictions on the test split that includes the 'kernel' column; this
# overwrites the predictions of the previous linear SVM.
y_pred_linear = svm_linear.predict(X_test_kernel)

In [None]:
# Per-class precision / recall / F1 of the SVM trained with the extra
# 'kernel' column.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_linear),
)

In [None]:
# Confusion-matrix heatmap for the SVM trained with the extra 'kernel' column.
# NOTE(review): the title is identical to the one used for the plain linear
# SVM plot, so the two figures cannot be told apart by title alone.
plot_confusion_matrix(y_test, y_pred_linear, title='Confusion Matrix for SVM Linear')

## Grid Search

In [None]:
# Tune the linear SVM with a 5-fold cross-validated grid search, scored by
# F1 and run on all available cores.
svm_linear = SVC(kernel='linear', random_state=42)

# Only C is searched. `gamma` is a coefficient of the rbf / poly / sigmoid
# kernels and is ignored by the linear kernel, so listing six gamma values
# only multiplied the work (4 * 6 * 5 = 120 fits instead of 4 * 5 = 20)
# without changing any score.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
}

grid_search = GridSearchCV(svm_linear, param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Report the best hyper-parameter combination found by the grid search
# (the heading is followed by the parameter dict on its own line).
print("Melhores parâmetros encontrados:", grid_search.best_params_, sep="\n")

In [None]:
# Take the best estimator kept by the grid search and predict the scaled
# test split with it.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the tuned SVM on the test split.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_best),
)

In [None]:
# Confusion-matrix heatmap for the model selected by the grid search.
# NOTE(review): the title is the same as for the untuned linear SVM plots,
# so the figures cannot be told apart by title alone.
plot_confusion_matrix(y_test, y_pred_best, title='Confusion Matrix for SVM Linear')