In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
sns.set()

%matplotlib inline

# Plot setup: default figure size as (width, height)
rcParams['figure.figsize'] = (20, 10)

In [None]:
# The data is available on Kaggle: https://www.kaggle.com/mlg-ulb/creditcardfraud
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\d" / "\c" escape sequences of a backslash-separated literal.
df = pd.read_csv("../data/creditcard.csv")

# Exploration

In [None]:
# (rows, columns) of the loaded dataset
df.shape

In [None]:
# Peek at the first five rows
df.head(5)

In [None]:
# True if at least one cell in the whole frame is missing
df.isna().values.any()

In [None]:
LABELS = ["Normal", "Fraud"]
# Count the transactions per class. Sorting by class value (0, 1) rather than by
# frequency guarantees that bar 0 lines up with LABELS[0] and bar 1 with LABELS[1].
count_classes = df['Class'].value_counts().sort_index()
count_classes.plot(kind = 'bar',rot=0)
plt.title("Transaction class distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency");

In [None]:
# Subset of fraudulent transactions, defined here so the cell works on a fresh kernel.
# NOTE(review): assumes Class == 1 marks a fraud, as in the Kaggle dataset description — confirm.
frauds = df[df['Class'] == 1]
frauds.shape

In [None]:
# Subset of non-fraudulent transactions, defined here so the cell works on a fresh kernel.
# NOTE(review): assumes Class == 0 marks a normal transaction, as in the Kaggle dataset description — confirm.
normal = df[df['Class'] == 0]
normal.shape

In [None]:
# Summary statistics for every column of the fraud subset
frauds.describe()

How different is the amount of money used in the different transaction classes?

In [None]:
# Summary statistics of the transaction amount for the fraud subset
frauds['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount for the normal subset
normal['Amount'].describe()

Let's have a more graphical representation:

In [None]:
# Histogram of the transaction amount, one panel per class on a shared x axis.
bins = 50

f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')

ax1.hist(frauds['Amount'], bins=bins)
ax1.set_title('Fraud')

ax2.hist(normal['Amount'], bins=bins)
ax2.set_title('Normal')

# Labels and the log y-scale go on the bottom panel (the axes pyplot treats as
# current after plt.subplots); the x limits also reach the top panel via sharex.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')

Do fraudulent transactions occur more often at certain times?

In [None]:
# Amount against time of transaction, one scatter panel per class on a shared x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(frauds['Time'], frauds['Amount'])
ax1.set_title('Fraud')

ax2.scatter(normal['Time'], normal['Amount'])
ax2.set_title('Normal')

# Axis labels are attached to the bottom panel only.
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

# Uso di scikit-learn

## Dataset split

In [None]:
# Size of the full dataset before splitting
df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
# NOTE(review): the split is not stratified on the label; if the positive class is
# rare, consider passing stratify=df['Class'] — confirm before changing results.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# (rows, columns) of the training portion
train_df.shape

In [None]:
# (rows, columns) of the held-out test portion
test_df.shape

In [None]:
# First ten rows of the training portion
train_df.head(10)

In [None]:
# First ten rows of the test portion
test_df.head(10)

## Model generation (Random Forest)

In [None]:
#Usiamo Random Forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Name of the label column
etichetta = 'Class'
# Feature columns, in order: 'Time', then 'V1' through 'V28', then 'Amount'
predictors = ['Time'] + ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

In [None]:
# Training labels as a plain array
train_Y = train_df[etichetta].values
# Training inputs: the predictor columns of the historical data
train_X = train_df.loc[:, predictors]

In [None]:
# Random Forest model configuration

RFC_METRIC = 'gini'   # split criterion passed to RandomForestClassifier
NUM_ESTIMATORS = 100  # number of estimators passed to RandomForestClassifier
NO_JOBS = 4           # number of parallel jobs passed to RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=NUM_ESTIMATORS,
    criterion=RFC_METRIC,
    n_jobs=NO_JOBS,
    random_state=42,
    verbose=True,
)

In [None]:
# Train the model on the training features and labels
clf.fit(train_X, train_Y)

In [None]:
# Arrange the test data
# Test labels as a plain array
test_Y = test_df[etichetta].values
# Test inputs: the predictor columns of the held-out ("new") data
test_X = test_df.loc[:, predictors]

In [None]:
# Try the model on the "new" data. test_X already holds exactly the predictor
# columns, so it is passed to the classifier as-is.
preds = clf.predict(test_X)
preds

## Model evaluation (The right way)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Flatten the confusion matrix into its four cells and display them
tn, fp, fn, tp = confusion_matrix(test_Y, preds).ravel()
tn, fp, fn, tp

In [None]:
# Note: the layout differs from the one on the slides, see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
matrix = confusion_matrix(test_Y, preds)
matrix

In [None]:
from sklearn.metrics import precision_score, recall_score
# Store the rounded metrics under their own names: rebinding precision_score /
# recall_score to floats would shadow the sklearn functions for every later call.
rf_precision = round(precision_score(test_Y, preds), 2)
rf_recall = round(recall_score(test_Y, preds), 2)
print("Precision: " + str(rf_precision))
print("Recall: " + str(rf_recall))

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the same training data.
# NOTE(review): the features are not scaled and max_iter is left at its default;
# check the output for a ConvergenceWarning — confirm.
model = LogisticRegression().fit(train_X, train_Y)
model

In [None]:
# Try the logistic regression on the "new" data. test_X already holds exactly
# the predictor columns, so it is passed to the model as-is.
preds_logr = model.predict(test_X)
preds_logr

In [None]:
# Flatten the logistic-regression confusion matrix into its four cells and display them
tn, fp, fn, tp = confusion_matrix(test_Y, preds_logr).ravel()
tn, fp, fn, tp

In [None]:
# Imported in this cell so it does not depend on earlier cells leaving these
# names bound to the sklearn functions.
from sklearn.metrics import precision_score, recall_score
# Store the rounded metrics under their own names instead of rebinding the
# imported functions to floats, which would break any later call to them.
logr_precision = round(precision_score(test_Y, preds_logr), 2)
logr_recall = round(recall_score(test_Y, preds_logr), 2)
print("Precision: " + str(logr_precision))
print("Recall: " + str(logr_recall))