In [None]:
# Manejo de datos
import pandas as pd
import numpy as np
# Pre-procesamiento
from sklearn.preprocessing import RobustScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Modelado
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM
from sklearn import svm
# Validacion
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
#Graficos
%matplotlib inline
from matplotlib import pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# Read the transaction CSV; the loaded frame is kept as-is and all later work
# happens on an independent copy.
df_eth_fraude = pd.read_csv(
    '../input/datasetfraudethcleaned/transaction_dataset_procesado.csv',
    delimiter=',',
    header='infer',
)
df_eth = df_eth_fraude.copy()
df_eth

# Regresión Logística

In [None]:
# Scale the feature columns only. The binary target 'FLAG' must not go through
# the scaler: RobustScaler centres on the median and divides by the IQR, so the
# labels would only stay 0/1 if the scaler happened to leave that column
# unchanged, which depends on the class balance of the data.
feature_cols = df_eth.columns.drop('FLAG')

trans = RobustScaler()
df_eth_arr = trans.fit_transform(df_eth[feature_cols])

# Rebuild a frame with a fresh 0..n-1 index (later cells select rows by
# position/label interchangeably), re-attach the untouched target and restore
# the original column order.
df_eth_escal = pd.DataFrame(df_eth_arr, columns=feature_cols)
df_eth_escal['FLAG'] = df_eth['FLAG'].to_numpy()
df_eth_escal = df_eth_escal[df_eth_fraude.columns]
df_eth_escal

In [None]:
# Seed for the random undersampling so that Restart & Run All reproduces the
# same balanced subset (and therefore the same metrics) every time.
UNDERSAMPLE_SEED = 0

# Full feature matrix and target (target kept as a one-column frame).
X = df_eth_escal.loc[:, df_eth_escal.columns != 'FLAG']
y = df_eth_escal.loc[:, df_eth_escal.columns == 'FLAG']

# Index labels of the fraud (FLAG == 1) and non-fraud (FLAG == 0) rows.
fraud_indices = df_eth_escal[df_eth_escal.FLAG == 1].index
nonfraud_indices = df_eth_escal[df_eth_escal.FLAG == 0].index
# Number of fraud cases.
frauds = len(fraud_indices)

# From all non-fraud observations, draw (without replacement) as many rows as
# there are fraud observations. A dedicated seeded generator is used instead of
# the global, unseeded np.random state.
rng = np.random.default_rng(UNDERSAMPLE_SEED)
random_nonfraud_indices = rng.choice(nonfraud_indices, size=frauds, replace=False)

# Balanced set of index labels: every fraud row plus the sampled non-fraud rows.
under_sample_indices = np.concatenate([fraud_indices, random_nonfraud_indices])

# These are index *labels*, so select with .loc; .iloc would only be correct
# while the frame keeps a default 0..n-1 index.
under_sample_data = df_eth_escal.loc[under_sample_indices, :]

# Split the undersampled data into features and target.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'FLAG']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'FLAG']

In [None]:
# Hold out 30% of the undersampled rows for testing; the fixed random_state
# makes the partition repeatable.
(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_undersample,
    y_undersample,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Instantiate the classifier with scikit-learn's default settings.
model = LogisticRegression()
# Fit on the training split. The target arrives as a one-column DataFrame;
# flatten it to 1-D so scikit-learn does not emit a DataConversionWarning
# (np.ravel also accepts a Series, so this works either way).
model.fit(X_train_undersample, np.ravel(y_train_undersample))
# Predict class labels for the held-out split.
y_pred = model.predict(X_test_undersample)

In [None]:
# Confusion matrix and classification report for the logistic-regression
# predictions on the held-out undersampled split.
# (confusion_matrix, plt and np come from the import cell at the top.)
cm = confusion_matrix(y_test_undersample, y_pred)

# Store the report text under its own name. Assigning it to
# `classification_report` would replace the imported function with a string and
# make this cell fail with "'str' object is not callable" when re-run.
report = classification_report(y_test_undersample, y_pred)
print("CLASSIFICATION REPORT")
print(report)

# Draw the matrix as a coloured grid and write each cell's count on top of it.
fig, ax = plt.subplots(figsize=(20,10))
ax.matshow(cm)
plt.title('Matriz de Confusión', fontsize=20)
plt.ylabel('Etiqueta Verdadera', fontsize=15)
plt.xlabel('Etiqueta Predicha', fontsize=15)
for (i, j), z in np.ndenumerate(cm):
    # matshow puts rows on the y axis and columns on the x axis, hence (j, i).
    ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')

# One-Class SVM

In [None]:
# Index label that separates the training part from the test part of the valid
# transactions. The original note says the training part is 70% of the valid
# transactions; that holds only if the valid rows occupy the leading index
# labels of the dataset -- TODO confirm against the data.
TRAIN_END_LABEL = 5341

# Split the scaled data by class: valid (FLAG == 0) and fraud (FLAG == 1).
valid_escal = df_eth_escal[df_eth_escal['FLAG'] == 0]
fraud_escal = df_eth_escal[df_eth_escal['FLAG'] == 1]

# Training features: valid transactions strictly below the boundary label.
# A `.loc[0:5341]` slice is inclusive on both ends and would put row 5341 in the
# training set as well as in the test slice below.
train_feat = valid_escal.loc[valid_escal.index < TRAIN_END_LABEL]
# `columns=` instead of a positional axis argument, which pandas 2.0 rejects.
train_feat = train_feat.drop(columns='FLAG')

# Test labels: remaining valid transactions (boundary label onwards) and all frauds.
Y_1 = valid_escal.loc[TRAIN_END_LABEL:, 'FLAG']
Y_2 = fraud_escal['FLAG']

In [None]:
# Model definition: One-Class SVM with an RBF kernel.
# The author also left an alternative using kernel='poly' with the same gamma
# and nu values.
# NOTE(review): per the scikit-learn docs, nu is an upper bound on the fraction
# of training errors, so nu=0.95 lets most training rows be treated as
# outliers -- confirm this is intended.
svm_params = {'kernel': 'rbf', 'gamma': 0.001, 'nu': 0.95}
modelo_one_svm = OneClassSVM(**svm_params)

In [None]:
# Test features: the valid transactions from index label 5341 onwards plus
# every fraud transaction, with the target column removed.
# `drop(columns=...)` replaces the positional axis argument, which pandas 2.0
# no longer accepts.
X_test_1 = valid_escal.loc[5341:, :].drop(columns='FLAG')
X_test_2 = fraud_escal.drop(columns='FLAG')

# Stack valid rows first, then frauds; the labels are stacked in the same order
# so rows and labels stay aligned. pd.concat replaces DataFrame/Series.append,
# which was removed in pandas 2.0.
X_test = pd.concat([X_test_1, X_test_2])
Y_test = pd.concat([Y_1, Y_2])

In [None]:
# Fit the One-Class SVM on the training features (valid transactions only).
modelo_one_svm.fit(train_feat)
# fit() returns the estimator itself, so despite its name `y_score` is just
# another reference to the fitted model, not an array of scores.
y_score = modelo_one_svm

In [None]:
# Classify every test row with the fitted One-Class SVM. Per the scikit-learn
# documentation, predict() returns +1 for inliers and -1 for outliers.
fraud_pred = modelo_one_svm.predict(X_test)

In [None]:
# Count how many test rows received each predicted label and print the result
# as a two-column table: one row per label, columns (label, count).
labels_and_counts = np.unique(fraud_pred, return_counts=True)
unique, counts = labels_and_counts
print(np.asarray(labels_and_counts).T)

In [None]:
# Put the true labels and the predictions into positionally aligned frames.
# The cell is written to be safe to re-run: it rebinds Y_test and fraud_pred to
# DataFrames, and an unconditional `.to_frame()` would fail the second time.
if isinstance(Y_test, pd.Series):
    # reset_index() keeps the original row labels in an 'index' column and
    # gives the frame a fresh 0..n-1 index.
    Y_test = Y_test.to_frame().reset_index()

# Accepts both the raw prediction array and the frame produced by a previous
# run of this cell; the result is always a single 'prediction' column.
fraud_pred = pd.DataFrame({'prediction': np.asarray(fraud_pred).ravel()})

In [None]:
# Tally the four outcome counts by comparing true labels with predictions
# position by position. Note the convention used here: the "positive" class is
# a valid transaction (FLAG == 0) predicted as +1.
flag_values = Y_test['FLAG'].values
pred_values = fraud_pred['prediction'].values

is_valid = flag_values == 0
is_fraud = flag_values == 1

# Valid transaction predicted +1.
TP = int(np.sum(is_valid & (pred_values == 1)))
# Valid transaction predicted -1.
FN = int(np.sum(is_valid & (pred_values == -1)))
# Fraud transaction predicted +1.
FP = int(np.sum(is_fraud & (pred_values == 1)))
# Every remaining row falls into the last bucket.
TN = len(Y_test) - TP - FN - FP

In [None]:
# Summary metrics derived from the four outcome counts, printed one per line.
total_cases = TP + FN + FP + TN

# Share of all test rows that landed in the TP or TN bucket.
accuracy = (TP + TN) / total_cases
print(accuracy)

# TP / (TP + FN)
positives = TP + FN
sensitivity = TP / positives
print(sensitivity)

# TN / (TN + FP)
negatives = TN + FP
specificity = TN / negatives
print(specificity)