In [25]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import matplotlib.pylab as plt

In [2]:
dados = pd.read_csv('./creditcard.csv')

In [3]:
dados.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [4]:
print(dados.isna().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [5]:
print(dados['Class'].count())

284807


In [6]:
n_transacoes = dados['Class'].count()
n_fraudes = dados['Class'].sum()
n_normais = n_transacoes - n_fraudes
fraudes_porc = n_fraudes / n_transacoes
normais_porc = n_normais / n_transacoes

print(f'Número de transações: {n_transacoes}')
print(f'Número de fraudes: {n_fraudes} {fraudes_porc * 100:.2f}%')
print(f'Número de transações normais: {n_normais} {normais_porc * 100:.2f}%')

Número de transações: 284807
Número de fraudes: 492 0.17%
Número de transações normais: 284315 99.83%


# USANDDO SKELEARN

In [7]:
def executar_validator(x, y):
    validador = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    for treino_id, teste_id in validador.split(x, y):
        x_train, x_test = x[treino_id], x[teste_id]
        y_train, y_test = y[treino_id], y[teste_id]
    return x_train, x_test, y_train, y_test


def executar_classificador(classificador, x_train, x_test, y_train):
    arvore = classificador.fit(x_train, y_train)
    y_pred = arvore.predict(x_test)
    return y_pred


def salvar_arvore(classificador, nome):
    plt.figure(figsize=(200, 100))
    tree.plot_tree(classificador, filled=True, fontsize=14)
    plt.savefig(nome)
    plt.close()


def validar_arvore(y_test, y_pred):
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [8]:
x = dados.drop('Class', axis=1).values
y = dados['Class'].values

In [9]:
classificador_arvore_decisao = tree.DecisionTreeClassifier()
x_train, x_test, y_train, y_test = executar_validator(x, y)
y_pred_arvore_decisao = executar_classificador(classificador_arvore_decisao, x_train, x_test, y_train)

In [10]:
salvar_arvore(classificador_arvore_decisao, "arvore_decisao_1.png")

In [11]:
validar_arvore(y_test, y_pred_arvore_decisao)

0.9990168884519505
0.723404255319149
0.6938775510204082
[[28419    13]
 [   15    34]]


In [12]:
print(classificador_arvore_decisao)
print(classificador_arvore_decisao.get_depth())

DecisionTreeClassifier()
21


In [13]:
classificador_arvore_decisao = tree.DecisionTreeClassifier(max_depth=10, random_state=0, min_samples_leaf=10)
x_train, x_test, y_train, y_test = executar_validator(x, y)
y_pred_arvore_decisao = executar_classificador(classificador_arvore_decisao, x_train, x_test, y_train)

In [14]:
salvar_arvore(classificador_arvore_decisao, "arvore_decisao_2.png")

In [15]:
validar_arvore(y_test, y_pred_arvore_decisao)

0.9993679997191109
0.8604651162790697
0.7551020408163265
[[28426     6]
 [   12    37]]


In [16]:
classificador_arvore_decisao = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
x_train, x_test, y_train, y_test = executar_validator(x, y)
y_pred_arvore_decisao = executar_classificador(classificador_arvore_decisao, x_train, x_test, y_train)

In [17]:
validar_arvore(y_test, y_pred_arvore_decisao)

0.999403110845827
0.9210526315789473
0.7142857142857143
[[28429     3]
 [   14    35]]


# Random Forest

In [19]:
classificador_random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
y_pred_random_forest = executar_classificador(classificador_random_forest, x_train, x_test, y_train)

In [20]:
salvar_arvore(classificador_random_forest.estimators_[0], "random_forest1.png")
salvar_arvore(classificador_random_forest.estimators_[1], "random_forest2.png")

In [21]:
validar_arvore(y_test, y_pred_random_forest)

0.9995084442259752
0.9487179487179487
0.7551020408163265
[[28430     2]
 [   12    37]]


In [23]:
classificador_random_forest = RandomForestClassifier(n_estimators=10, random_state=0, max_depth=10)
y_pred_random_forest = executar_classificador(classificador_random_forest, x_train, x_test, y_train)

In [24]:
validar_arvore(y_test, y_pred_random_forest)

0.9994382219725431
0.9230769230769231
0.7346938775510204
[[28429     3]
 [   13    36]]


# AdaBoost

In [26]:
ada_boost_classifier = AdaBoostClassifier(random_state=0)
y_pred_ada_boost = executar_classificador(ada_boost_classifier, x_train, x_test, y_train)

In [27]:
validar_arvore(y_test, y_pred_random_forest)

0.9994382219725431
0.9230769230769231
0.7346938775510204
[[28429     3]
 [   13    36]]


In [28]:
ada_boost_classifier = AdaBoostClassifier(random_state=0, n_estimators=100)
y_pred_ada_boost = executar_classificador(ada_boost_classifier, x_train, x_test, y_train)

KeyboardInterrupt: 

In [None]:
validar_arvore(y_test, y_pred_random_forest)