In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
sns.set()

%matplotlib inline

#Setup grafico
rcParams['figure.figsize'] = 20, 10

In [None]:
# The data is available on Kaggle: https://www.kaggle.com/mlg-ulb/creditcardfraud
# The path is relative, so the CSV must sit in the notebook's working directory.
DATA_FILE = "creditcard.csv"
df = pd.read_csv(DATA_FILE)

# Exploration

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Missing-value check: evaluates to True if at least one cell in the frame is null.
df.isna().values.any()

# Using scikit-learn

## Dataset split

In [None]:
# Shape of the full frame before it is divided into training and test sets.
df.shape

In [None]:
# Name of the target (label) column.
etichetta = 'Class'

# Feature columns: 'Time', then 'V1' through 'V28', then 'Amount'.
predictors = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']

X = df[predictors]  # feature matrix: every column listed in `predictors`
Y = df[etichetta]   # the label column on its own

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible across runs.
# stratify=Y keeps the proportion of each Class value identical in the training
# and test partitions. Without it, a purely random split can shift the share of
# a rare class between the two sets and distort precision/recall.
# NOTE(review): assumes Class is heavily imbalanced, as is typical for fraud
# data -- confirm with Y.value_counts().
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.3, random_state=45, stratify=Y
)

# Sanity check: features and labels must have matching row counts per partition.
for part in (train_X, test_X, train_Y, test_Y):
    print(part.shape)

## Model generation (Random Forest)

In [None]:
#Usiamo Random Forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random Forest model configuration

RFC_METRIC = 'gini'   # split-quality criterion used by RandomForestClassifier
NUM_ESTIMATORS = 100  # number of estimators (trees) used by RandomForestClassifier
NO_JOBS = 4           # number of parallel jobs used by RandomForestClassifier

# Gather the keyword arguments in one place, then build the (still unfitted) model.
rf_settings = dict(
    criterion=RFC_METRIC,
    n_estimators=NUM_ESTIMATORS,
    n_jobs=NO_JOBS,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    verbose=True,     # have scikit-learn report progress while fitting/predicting
)
clf = RandomForestClassifier(**rf_settings)

In [None]:
# Run the training on the training partition.
clf.fit(X=train_X, y=train_Y)

In [None]:
# Try the model on "new data": the held-out test rows, restricted to the
# predictor columns.
preds = clf.predict(test_X.loc[:, predictors])
preds

## Model evaluation (The right way)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Flatten the confusion matrix row by row and unpack its four cells:
# true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_true=test_Y, y_pred=preds).ravel()
tn, fp, fn, tp

In [None]:
#Note: the layout differs from the one in the slides, see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# Full confusion matrix for the Random Forest predictions on the test set.
matrix = confusion_matrix(y_true=test_Y, y_pred=preds)
matrix

In [None]:
from sklearn.metrics import precision_score, recall_score

# Keep the scores under their own names. Assigning them to `precision_score` /
# `recall_score` would rebind the imported functions to floats, and the next
# call to either function would fail with "'float' object is not callable".
rf_precision = round(precision_score(test_Y, preds), 2)
rf_recall = round(recall_score(test_Y, preds), 2)
print("Precision: " + str(rf_precision))
print("Recall: " + str(rf_recall))

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Iteration cap for the solver. scikit-learn's default (100) is small for
# features that have not been standardised, and the optimiser then tends to stop
# early with a ConvergenceWarning, leaving a less reliable fit.
# NOTE(review): scaling the predictors first would likely help further.
LOGREG_MAX_ITER = 1000

model = LogisticRegression(max_iter=LOGREG_MAX_ITER)
model.fit(train_X, train_Y)

In [None]:
# Try the logistic-regression model on "new data": the held-out test rows,
# restricted to the predictor columns.
preds_logr = model.predict(test_X.loc[:, predictors])
preds_logr

In [None]:
# Confusion-matrix cells for the logistic-regression predictions, flattened row
# by row. This rebinds tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_true=test_Y, y_pred=preds_logr).ravel()
tn, fp, fn, tp

In [None]:
# Imported in this cell so it always sees the scikit-learn functions, whatever
# `precision_score` / `recall_score` were bound to earlier in the session.
from sklearn.metrics import precision_score, recall_score

# Store the scores under dedicated names rather than overwriting the imported
# functions with floats, which would make any later call to them fail.
logr_precision = round(precision_score(test_Y, preds_logr), 2)
logr_recall = round(recall_score(test_Y, preds_logr), 2)
print("Precision: " + str(logr_precision))
print("Recall: " + str(logr_recall))