# Desafio de Classificação - v1

Tema: detecção de fraude em transações mobile.

##### Etapas

1. Tratamento de Dados
2. Modelagem
3. Análise de Resultados
4. Análise da Solução


##### Atenção
- Solução mais barata é preferível. Isso significa menor tempo de treinamento do modelo
- Empresas perdem em triplo: uma vez pela fraude e duas vezes com indenizações ao consumidor
- _"36% das empresas ainda não percebem com clareza os benefícios das iniciativas de análise de dados para as atividades antifraude"_: é provável supor que a solução precisará convencer pessoas chaves dentro da empresa para poder ser implementada

In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from time import time


In [34]:
#variáveis globais
SEED = 42
REPORTCARD = pd.DataFrame(columns=['modelo','tempo_treino','tempo_ciclo','tp','fp','fn','tp', 'fscore'])
FSCORE_BETA = 2

#### Ciclo 1: baseline da loucura

In [35]:
df_orig = pd.read_csv('dados/fraud_detection_dataset.csv')
df_orig.info()
print(df_orig.shape)
df_orig.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
(6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [36]:
df = df_orig.copy(deep=True)

#inicio do ciclo
start_ciclo = time()

#criando dummies da coluna 'type'
print(df['type'].value_counts())
df = pd.get_dummies(df, prefix='type', columns=['type'], drop_first=False)
df.head()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64


Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0


In [37]:
print(df.columns.tolist())
lista_x = [
    'step',
    'amount',
    #'nameOrig', <-- string
    'oldbalanceOrg',
    #'newbalanceOrig', <-- multicolinearidade
    #'nameDest', <-- string
    'oldbalanceDest',
    'newbalanceDest',
    #'isFraud', <-- fraude
    'isFlaggedFraud',
    #'type_CASH_IN', <-- caso mais frequente
    'type_CASH_OUT',
    'type_DEBIT',
    'type_PAYMENT',
    'type_TRANSFER']

lista_y = ['isFraud']

#separando X e y numéricas
X_train, X_test, y_train, y_test = train_test_split(df[lista_x], df[lista_y], test_size=0.2, random_state=SEED, stratify=df[lista_y])

['step', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']


In [38]:
model1 = RandomForestClassifier(n_estimators = 100,
                                max_depth=2,    #nunca deixa padrão
                                random_state=SEED,
                                bootstrap=True,
                                warm_start=True,
                                )
start_model = time()
model1.fit(X_train, y_train)
stop_model = time()

  model1.fit(X_train, y_train)


In [39]:
#printando as métricas
y_pred = model1.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
print('''
[Verdadeiro Positive ,   Falso Positivo   ]
[   Falso Negativo   , Verdadeiro Negativo] 
''')

print(classification_report(y_test, y_pred, target_names=['Fraud', 'Not Fraud']))

#fim do ciclo
stop_ciclo = time()

#salvando os resultados
model = model1
duracao_treino = round((stop_model - start_model)/60, 2)
duracao_ciclo = round((stop_ciclo - start_ciclo)/60, 2)
tp = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tn = conf_matrix[1][1]
fscore = (1 + FSCORE_BETA**2) * tp / ((1 + FSCORE_BETA**2) * tp + FSCORE_BETA**2 * fn + fp)

nova_linha = {'modelo': model,
              'tempo_treino': duracao_treino,
              'tempo_ciclo': duracao_ciclo,
               'tp': tp,
               'fp': fp,
               'fn': fn,
               'tp': tp,
               'fscore': fscore}

REPORTCARD = REPORTCARD.append(nova_linha, ignore_index=True)
REPORTCARD


[[1270881       0]
 [   1598      45]]

[Verdadeiro Positive ,   Falso Positivo   ]
[   Falso Negativo   , Verdadeiro Negativo] 

              precision    recall  f1-score   support

       Fraud       1.00      1.00      1.00   1270881
   Not Fraud       1.00      0.03      0.05      1643

    accuracy                           1.00   1272524
   macro avg       1.00      0.51      0.53   1272524
weighted avg       1.00      1.00      1.00   1272524



Unnamed: 0,modelo,tempo_treino,tempo_ciclo,tp,fp,fn,tp.1,fscore
0,"(DecisionTreeClassifier(max_depth=2, max_featu...",3.25,3.63,1270881,0,1598,1270881,0.998995
