In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams

%matplotlib inline

# Plot setup: default figure size applied to every figure in this notebook
rcParams['figure.figsize'] = 20, 10

In [23]:
# The data is available on Kaggle: https://www.kaggle.com/mlg-ulb/creditcardfraud
# The relative path means creditcard.csv is looked up in the current working directory.
df = pd.read_csv("creditcard.csv")

# Exploration

In [None]:
# Dimensions of the full dataset as (rows, columns)
df.shape

In [None]:
# Preview the first five rows of the dataset
df.head(5)

In [None]:
# Check for missing values: True if at least one cell in the frame is NaN/None
df.isna().values.any()

In [None]:
# Number of transactions per value of the Class column, most frequent first.
# Series.value_counts is used because the top-level pd.value_counts function
# is deprecated in recent pandas releases.
count_classes = df['Class'].value_counts(sort=True)
count_classes

In [None]:
# Human-readable names for the Class values: index 0 -> "Normal", index 1 -> "Fraud"
LABELS = ["Normal", "Fraud"]

# Bar chart of the class frequencies, configured through an explicit Axes handle
ax = count_classes.plot(kind = 'bar', rot=0)
ax.set_title("Transaction class distribution")
# Take each tick label from the class value stored in the index rather than
# from the bar position: count_classes is ordered by frequency, so position 0
# is not guaranteed to be class 0.
ax.set_xticks(range(len(count_classes)))
ax.set_xticklabels([LABELS[int(class_value)] for class_value in count_classes.index])
ax.set_xlabel("Class")
ax.set_ylabel("Frequency");

In [None]:
# Split the transactions by label: Class == 1 -> frauds, Class == 0 -> normal
frauds = df.loc[df['Class'] == 1]
normal = df.loc[df['Class'] == 0]

In [None]:
# Size of the fraud subset as (rows, columns)
frauds.shape

In [None]:
# Size of the normal subset as (rows, columns)
normal.shape

In [None]:
# Summary statistics of the fraud subset
frauds.describe()

How different is the amount of money used in the different transaction classes?

In [None]:
# Summary statistics of the transaction amount among frauds
frauds.Amount.describe()

In [None]:
# Summary statistics of the transaction amount among normal transactions
normal.Amount.describe()

Let's have a more graphical representation:

In [None]:
# Two stacked histograms of the transaction amount, one panel per class,
# sharing the same x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')

bins = 50

ax1.hist(frauds['Amount'], bins=bins)
ax1.set_title('Fraud')

ax2.hist(normal['Amount'], bins=bins)
ax2.set_title('Normal')

# These settings are applied to the bottom panel only. The x limits reach the
# top panel through the shared x axis; the top panel keeps a linear y axis.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')

Do fraudulent transactions occur more often at certain times?

In [None]:
# Scatter plots of amount against time, one panel per class, sharing the x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(frauds['Time'], frauds['Amount'])
ax1.set_title('Fraud')

ax2.scatter(normal['Time'], normal['Amount'])
ax2.set_title('Normal')

# Axis labels are placed on the bottom panel only.
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

# Uso di scikit-learn

## Dataset split

In [24]:
# Size of the full dataset before splitting, as (rows, columns)
df.shape

(284807, 31)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# NOTE(review): the split is not stratified on Class. With classes this
# imbalanced, confirm whether stratify=df['Class'] is wanted.
train_df, test_df = train_test_split(
    df,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [26]:
# Size of the training split as (rows, columns)
train_df.shape

(227845, 31)

In [27]:
# Size of the test split as (rows, columns)
test_df.shape

(56962, 31)

In [28]:
# Preview the first ten rows of the training split
train_df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
223361,143352.0,1.955041,-0.380783,-0.315013,0.330155,-0.509374,-0.086197,-0.627978,0.035994,1.05456,...,0.238197,0.968305,0.053208,-0.278602,-0.044999,-0.21678,0.045168,-0.047145,9.99,0
165061,117173.0,-0.400975,-0.626943,1.555339,-2.017772,-0.107769,0.16831,0.017959,-0.401619,0.040378,...,-0.153485,0.421703,0.113442,-1.004095,-1.176695,0.361924,-0.370469,-0.144792,45.9,0
238186,149565.0,0.072509,0.820566,-0.561351,-0.709897,1.080399,-0.359429,0.787858,0.117276,-0.131275,...,-0.314638,-0.872959,0.083391,0.148178,-0.431459,0.11969,0.206395,0.070288,11.99,0
150562,93670.0,-0.535045,1.014587,1.750679,2.76939,0.500089,1.00227,0.847902,-0.081323,0.371579,...,0.063525,0.443431,-0.072754,0.448192,-0.655203,-0.181038,-0.093013,-0.064931,117.44,0
138452,82655.0,-4.026938,1.897371,-0.429786,-0.029571,-0.855751,-0.480406,-0.435632,1.31376,0.536044,...,-0.480691,-0.230369,0.250717,0.066399,0.470787,0.245335,0.286904,-0.322672,25.76,0
35440,38121.0,-0.795091,1.061833,0.640795,0.594567,0.551601,0.0976,0.633501,0.083765,-0.027452,...,0.026523,0.321166,-0.246997,-0.779914,-0.160225,-0.236537,0.442701,0.268076,40.22,0
119552,75491.0,1.174701,0.275037,0.364384,1.078177,-0.137806,-0.423289,0.089835,-0.090439,-0.0111,...,0.109522,0.400376,-0.095922,0.102617,0.616368,-0.269566,0.036606,0.021232,14.9,0
189362,128362.0,0.119471,0.026912,0.02587,-0.638642,1.569986,1.551793,0.573425,0.192499,0.266783,...,0.493721,1.870004,-0.079431,-1.905932,-0.616224,0.629626,-0.049543,-0.192198,44.3,0
150524,93605.0,-0.685023,1.448337,0.167864,4.622496,1.383526,0.187905,1.847737,-1.222425,0.231182,...,-0.279236,0.609252,0.265805,-0.142988,-1.094047,0.119107,-0.527683,-0.036831,60.09,0
223666,143475.0,0.025493,-0.317142,0.883483,-2.527437,-0.532859,-0.858682,0.01032,-0.267504,-2.281117,...,-0.048195,0.379341,-0.09851,0.001591,-0.432567,-0.22772,0.188823,0.176748,10.0,0


In [29]:
# Preview the first ten rows of the test split
test_df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43428,41505.0,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,...,1.190739,-1.12767,-2.358579,0.673461,-1.4137,-0.462762,-2.018575,-1.042804,364.19,1
49906,44261.0,0.339812,-2.743745,-0.13407,-1.385729,-1.451413,1.015887,-0.524379,0.22406,0.899746,...,-0.213436,-0.942525,-0.526819,-1.156992,0.311211,-0.746647,0.040996,0.102038,520.12,0
29474,35484.0,1.39959,-0.590701,0.168619,-1.02995,-0.539806,0.040444,-0.712567,0.002299,-0.971747,...,0.102398,0.168269,-0.166639,-0.81025,0.505083,-0.23234,0.011409,0.004634,31.0,0
276481,167123.0,-0.432071,1.647895,-1.669361,-0.349504,0.785785,-0.630647,0.27699,0.586025,-0.484715,...,0.358932,0.873663,-0.178642,-0.017171,-0.207392,-0.157756,-0.237386,0.001934,1.5,0
278846,168473.0,2.01416,-0.137394,-1.015839,0.327269,-0.182179,-0.956571,0.043241,-0.160746,0.363241,...,-0.238644,-0.6164,0.347045,0.061561,-0.360196,0.17473,-0.078043,-0.070571,0.89,0
101565,67878.0,-0.64133,-0.057304,1.489998,-1.688131,-1.151043,0.259996,-1.391069,-2.334075,1.168644,...,-1.231634,0.257164,-0.371953,-0.038566,1.397514,-0.665947,0.031003,0.180357,100.0,0
260880,159763.0,2.023952,-0.12014,-1.086918,0.423019,-0.142901,-1.127752,0.178493,-0.303234,0.564509,...,-0.276175,-0.697708,0.335631,-0.017196,-0.324904,0.200023,-0.071566,-0.058224,16.99,0
214337,139631.0,-0.688944,1.292153,-0.564281,-1.457526,2.258333,-0.32327,1.678984,-0.104128,-1.285351,...,-0.00688,-0.171568,-0.720019,-0.419435,1.211991,0.670916,-0.103986,0.030084,8.95,0
201575,133944.0,2.119362,0.142639,-2.373337,0.541949,0.608419,-1.775564,0.955775,-0.599383,0.01042,...,0.264264,0.898266,-0.168063,0.059311,0.626949,0.729035,-0.12912,-0.094713,10.0,0
81055,58769.0,-5.584256,-4.732413,-0.448452,-0.121442,-0.707412,-0.114376,-1.554628,1.402126,-0.031693,...,0.041651,0.621789,0.223467,-0.770137,0.621182,-0.028738,0.505194,-1.898323,101.49,0


## Model generation

In [30]:
# We use a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Name of the target (label) column
etichetta = 'Class'
# Feature columns fed to the model: Time, then V1 through V28, then Amount
predictors = ['Time'] + ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

In [32]:
# Historical data used for training: feature columns only
train_X = train_df.loc[:, predictors]
# Training labels as a plain array
train_Y = train_df.loc[:, etichetta].values

In [33]:
# Random Forest model configuration

RFC_METRIC = 'gini'   # split criterion passed to RandomForestClassifier
NUM_ESTIMATORS = 100  # number of estimators passed to RandomForestClassifier
NO_JOBS = 4           # number of parallel jobs passed to RandomForestClassifier

# The fixed random_state keeps the fitted forest repeatable between runs;
# verbose=True makes fit/predict print their progress.
clf = RandomForestClassifier(
    n_estimators=NUM_ESTIMATORS,
    criterion=RFC_METRIC,
    n_jobs=NO_JOBS,
    random_state=42,
    verbose=True,
)

In [34]:
# Run the training on the training features and labels
clf.fit(train_X, train_Y)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   30.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
                       oob_score=False, random_state=42, verbose=True,
                       warm_start=False)

In [35]:
# Arrange the test data
# Feature columns of the held-out rows: the "new" data for the model
test_X = test_df.loc[:, predictors]
# Labels used for testing, as a plain array
test_Y = test_df.loc[:, etichetta].values

In [36]:
# Try the model on the "new" data.
# test_X already holds exactly the predictor columns, so it is passed as is.
preds = clf.predict(test_X)
preds

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

## Model evaluation (basics)

In [None]:
# This is NOT the canonical way to evaluate a model, but it is useful for understanding a few things :-)

In [37]:
# Compare the true labels with the labels predicted by the model.
# The element-wise comparison covers every test sample, including the last
# one, so the numerator and the denominator refer to the same set of rows.
count = int(np.count_nonzero(test_Y == preds))  # how many did the model get right?
# Note: the printed value is a fraction in [0, 1], despite the '%' in the label.
print("% Valori predetti correttamente: " + str(count/test_Y.size))

% Valori predetti correttamente: 0.9995435553526912


In [38]:
# Compare the true labels with the predicted ones, restricted to the frauds.
# The boolean mask covers every test sample, including the last one.
fraud_mask = (test_Y == 1)
count_ones = int(np.count_nonzero(fraud_mask))  # how many frauds are in the test set?
count_ones_ok = int(np.count_nonzero(preds[fraud_mask] == 1))  # how many of those were predicted as fraud?
print("Transazioni malevole nel testing set:" + str(count_ones))
print("Transazioni malevole predette correttamente:" + str(count_ones_ok))
# Fraction of the actual frauds that the model caught (printed as a fraction, not a percentage)
print("% Transazioni malevole scovate: " + str(count_ones_ok/count_ones))

Transazioni malevole nel testing set:98
Transazioni malevole predette correttamente:75
% Transazioni malevole scovate: 0.7653061224489796
