In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams

%matplotlib inline

# Plot setup: default figure size as (width, height)
rcParams.update({'figure.figsize': (20, 10)})

In [None]:
# The data are available on Kaggle: https://www.kaggle.com/mlg-ulb/creditcardfraud
DATA_PATH = "creditcard.csv"  # relative path, resolved against the working directory
df = pd.read_csv(DATA_PATH)

# Exploration

In [None]:
# Dimensions of the loaded dataset as (rows, columns)
df.shape

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# True if at least one cell anywhere in the frame is missing
df.isna().values.any()

In [None]:
# Number of transactions per class, most frequent class first.
# Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
count_classes = df['Class'].value_counts(sort=True)
count_classes

In [None]:
# Bar chart of the class counts, drawn through the explicit Axes interface.
# The tick labels are assigned by position, so they follow the order of
# count_classes (first bar -> "Normal", second bar -> "Fraud").
LABELS = ["Normal", "Fraud"]
ax = count_classes.plot(kind='bar', rot=0)
ax.set_title("Transaction class distribution")
ax.set_xticks(range(2))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency");

In [None]:
# Split the transactions by label: Class == 1 -> frauds, Class == 0 -> normal
frauds = df.loc[df['Class'] == 1]
normal = df.loc[df['Class'] == 0]

In [None]:
# (rows, columns) of the subset with Class == 1
frauds.shape

In [None]:
# (rows, columns) of the subset with Class == 0
normal.shape

In [None]:
# Descriptive statistics (pandas describe) for the fraud subset
frauds.describe()

How different is the amount of money spent in the two transaction classes?

In [None]:
# Summary statistics of the transaction amount for frauds
frauds['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount for normal transactions
normal['Amount'].describe()

Let's have a more graphical representation:

In [None]:
# Two stacked histograms of the transaction amount, one per class,
# sharing the x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')

bins = 50

for axis, subset, label in ((ax1, frauds, 'Fraud'), (ax2, normal, 'Normal')):
    axis.hist(subset.Amount, bins=bins)
    axis.set_title(label)

# Axis decoration goes on the lower panel (the axes that were current after
# plt.subplots). The x limits propagate to the upper panel through sharex.
# NOTE(review): only the lower ("Normal") panel gets the log y-scale;
# confirm whether the "Fraud" panel was meant to be log-scaled as well.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')

Do fraudulent transactions occur more often at certain times?

In [None]:
# Two stacked scatter plots of Amount against Time, one per class,
# sharing the x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

for axis, subset, label in ((ax1, frauds, 'Fraud'), (ax2, normal, 'Normal')):
    axis.scatter(subset.Time, subset.Amount)
    axis.set_title(label)

# Axis labels go on the lower panel (the axes that were current after
# plt.subplots).
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

# Uso di scikit-learn

## Dataset split

In [None]:
# Dimensions (rows, columns) of the full dataset before splitting it
df.shape

In [None]:
# Name of the label column ("etichetta" is Italian for "label")
etichetta = 'Class'

# Feature columns: Time, V1 ... V28, Amount (same order as the explicit list)
predictors = ['Time'] + ['V' + str(i) for i in range(1, 29)] + ['Amount']

X = df[predictors]  # the feature columns listed above
Y = df[etichetta]   # the label column only

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=Y keeps the class proportions the same in both splits, so the
# number of positive examples in the test set does not depend on the luck
# of the shuffle.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.3, random_state=45, stratify=Y
)

# Sanity check: shapes of the four resulting pieces
for part in (train_X, test_X, train_Y, test_Y):
    print(part.shape)

## Model generation

In [None]:
# We use a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random Forest model configuration

RFC_METRIC = 'gini'   # value passed as `criterion` to RandomForestClassifier
NUM_ESTIMATORS = 100  # value passed as `n_estimators`
NO_JOBS = 4           # value passed as `n_jobs` (parallel jobs)

# Collect the hyper-parameters in one place, then build the (untrained) model
rf_params = {
    'n_jobs': NO_JOBS,
    'random_state': 42,
    'criterion': RFC_METRIC,
    'n_estimators': NUM_ESTIMATORS,
    'verbose': True,
}
clf = RandomForestClassifier(**rf_params)

In [None]:
# Run the training on the training split
clf.fit(X=train_X, y=train_Y)

In [None]:
# Try the model on the "new" (held-out) data
preds = clf.predict(test_X.loc[:, predictors])
preds

## Model evaluation (basics)

In [None]:
# This is NOT the canonical way to evaluate a model, but it is useful for understanding a few things :-)

In [None]:
# Accuracy: fraction of test samples whose predicted label equals the true one.
#
# np.asarray (instead of `.values`) keeps this cell re-runnable: it converts
# the Series on the first run and is a no-op once test_Y is already an array.
# test_Y is deliberately rebound to a plain array so that positional indexing
# (test_Y[i]) works in later cells.
test_Y = np.asarray(test_Y)

# Vectorised comparison over ALL samples. The previous loop stopped one
# element short (range(0, len - 1)) while still dividing by the full size.
count = int(np.sum(test_Y == preds))
print("% Valori predetti correttamente: " + str(count/test_Y.size))

In [None]:
# Recall on the positive class: compare the real positive labels with the
# labels predicted by the model.
#
# The labels are converted locally so the cell does not depend on an earlier
# cell having already turned test_Y into an array.
true_labels = np.asarray(test_Y)
fraud_mask = true_labels == 1

# How many frauds are in the test set, and how many of them were predicted
# correctly. Every sample is considered (the previous loop skipped the last).
count_ones = int(np.sum(fraud_mask))
count_ones_ok = int(np.sum(fraud_mask & (true_labels == preds)))

print("Transazioni malevole nel testing set:" + str(count_ones))
print("Transazioni malevole predette correttamente:" + str(count_ones_ok))

# Guard against a test set without any positive sample (would divide by zero)
fraud_recall = count_ones_ok / count_ones if count_ones else float("nan")
print("% Transazioni malevole scovate: " + str(fraud_recall))