In [None]:
!pip install -q scikit_plot

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

#setting style to seaborn
sns.set_style("dark")

In [None]:
# data https://www.kaggle.com/mlg-ulb/creditcardfraud?select=creditcard.csv
# alternate https://www.dropbox.com/s/b44o3t3ehmnx2b7/creditcard.csv?dl=1

# Load the transactions from the remote CSV, then hold out 15% of the rows as test data.
file_path = "https://www.dropbox.com/s/b44o3t3ehmnx2b7/creditcard.csv?dl=1"

# read the CSV straight from the URL into a dataframe
df = pd.read_csv(file_path)
print("Original Dataset dimensions:", df.shape)

# fixed-seed random hold-out; the sampled rows are removed from df, which becomes the training frame
test = df.sample(frac=0.15, random_state=0)
df = df.drop(index=test.index)

for caption, frame in (("Train data dimensions: ", df), ("Test data dimensions: ", test)):
    print(caption, frame.shape)

In [None]:
# Preview the first rows of the training dataframe (the test rows were already dropped from df)
df.head()

In [None]:
#checking the statistical summary for the dataset

"""
Note:
Time = Number of seconds elapsed between this transaction and the first transaction in the dataset
Amount = Transaction amount

Class:
- Grouped into two segments:
0 and 1 as transaction type indicators.

0 - Normal Transaction
1 - Fraudulent Transaction
"""
# describe() is the last expression of the cell, so its summary table is what the cell renders
df.describe()

In [None]:
# Largest per-column count of null entries (0 means no column contains missing values)
df.isnull().sum().max()

In [None]:
# Class balance of the training frame: raw counts, fraud share in percent, and a bar chart
print(df.Class.value_counts())

n_fraud = len(df[df.Class == 1])
fraud_pct = round(n_fraud * 100 / len(df), 2)
print("\nFrauds represents {}% of the dataset \n".format(fraud_pct))

fig, ax = plt.subplots(figsize=(7,5))
sns.countplot(data=df, x="Class", ax=ax)
fig.tight_layout();

In [None]:
# note: try to improve the handling of the unbalanced dataset (enhancement)
# Distribution of the "Time" variable, one histogram per class
# (normal transactions on top, fraudulent ones below).

n_bins = 40

fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15,7))

for axis, class_label, title in ((ax1, 0, "Normal"), (ax2, 1, "Fraudulent")):
    df.loc[df.Class == class_label, "Time"].hist(ax=axis, bins=n_bins, grid=False)
    axis.set_title(title)

# axis labels are only set on the lower (fraudulent) panel
ax2.set_xlabel("Time(Seconds)")
ax2.set_ylabel("# of Entries")

plt.tight_layout()

In [None]:
"""
The difference in the number of entries may be explained by the time of day (e.g. day versus night),
when the number of transactions differs vastly.

Plotting a boxplot for the Amount variable in normal and fraudulent transactions:
"""

# Upper limit (Q3 + 1.5 * IQR) of Amount, computed on the fraudulent transactions only
fraud_amount = df.loc[df.Class == 1, "Amount"]
q1, q3 = fraud_amount.quantile(.25), fraud_amount.quantile(.75)
IQR = q3 - q1

sup_limit = q3 + 1.5*IQR

In [None]:
# Box plot of Amount for normal vs. fraudulent transactions (means shown as markers);
# the y-axis is limited to [-20, sup_limit]
fig, ax = plt.subplots(figsize=(7,10))
sns.boxplot(data=df, x="Class", y="Amount", showmeans=True, ax=ax)
ax.set_ylim(bottom=-20, top=sup_limit)
ax.set_xticklabels(labels=["Normal", "Fraudulent"])

plt.tight_layout()

In [None]:
"""

Although the median is lower for the fraudulent transactions (represented by the black line inside each box),
the mean (represented by the green triangle) is higher for fraudulent transactions than for normal ones.

We can also plot a density plot for each variable, separating fraudulent and normal transactions. 
Here, we are searching for variables that are significantly different for normal and fraudulent transactions:

"""



# Density plots: one panel per feature, overlaying the normal and fraudulent distributions

columns_names = df.drop(labels=["Class"], axis=1).columns

df_normal = df[df.Class == 0]
df_fraud = df[df.Class == 1]

fig, ax = plt.subplots(nrows=5, ncols=6, figsize=(18,18))
panels = ax.ravel()

# Draw directly on the axes created above instead of re-selecting them with plt.subplot,
# and use fill= (the replacement for the deprecated shade= argument).
for panel, col in zip(panels, columns_names):
    sns.kdeplot(df_normal[col], label="Normal", fill=True, ax=panel)
    sns.kdeplot(df_fraud[col], label="Fraud", fill=True, ax=panel)
    panel.set_title(col)

# hide any grid cell that did not receive a feature
for panel in panels[len(columns_names):]:
    panel.set_visible(False)

# one legend is enough: every panel uses the same two labels
panels[0].legend()

plt.tight_layout()

In [None]:
"""

Some variables, such as 'V14' and 'V4', behave quite differently for the two classes.

After the initial exploratory analysis we can state that:

    (1)The variables Time and Amount are not normalized and will need to be transformed before training the model.
    (2)The dataset is extremely unbalanced, representing a challenge for the analysis.
    (3)The mean amount for fraudulent transactions is higher than the normal transaction mean amount.
    (4)Some variables, such as V14 and V4, clearly behave differently for normal and fraudulent transactions.

Based on that, we can now prepare the data before training the model.

==> Preparing the data <==

First, we will normalize the Time and Amount variables. Since their dimensions are different from all the other variables, our model will be biased by these columns if we don't normalize them.

"""

# Standardize "Amount" and "Time" on a copy of the training frame.
# The scaled values are appended as std_amount / std_time and the raw columns are removed.
df_copy = df.copy()

std_scaler = StandardScaler()
# the same scaler object is re-fitted for each column, exactly one column at a time
for raw_col, scaled_col in (("Amount", "std_amount"), ("Time", "std_time")):
    column_2d = df_copy[raw_col].values.reshape(-1,1)
    df_copy[scaled_col] = std_scaler.fit_transform(column_2d)

df_copy = df_copy.drop(columns=["Time","Amount"])

In [None]:
# Preview the transformed frame: std_amount / std_time replace the raw Amount / Time columns
df_copy.head()

In [None]:
# Separate the target from the features, then make a stratified train/validation split.
# train_test_split receives no random_state here, so the global NumPy seed set below governs the shuffle.
np.random.seed(2)
y = df_copy["Class"]
X = df_copy.drop(columns="Class")

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, shuffle=True)

In [None]:
"""

Last but not least, since fraudulent transactions account for only 0.17% of the dataset,
we should balance the dataset to get better results from our models.

Among others, there are two ways in which we can solve this problem:

    1. Over Sampling - Creates new entries for the minority class based on the existing samples.
    2. Under Sampling - Randomly deletes entries for the majority class.

Here we will choose the under sampling method and apply it to the data:

"""

# Balancing the dataset by random undersampling.
# Only the training split is resampled; the validation split keeps its original class ratio.
# A fixed random_state pins which rows are kept, so re-running this cell (or running it
# out of order) reproduces the same balanced sample and the same downstream metrics.
rus = RandomUnderSampler(random_state=2)

X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [None]:
# Class counts after undersampling, printed and shown as a bar chart
balanced_counts = pd.Series(y_rus).value_counts()
print(balanced_counts)

fig, ax = plt.subplots(figsize=(7,5))
sns.countplot(x=y_rus, ax=ax)
ax.set_xticklabels(labels=["Normal","Fraudulent"])

fig.tight_layout()

In [None]:
"""

Now we have the same number of fraudulent and normal transactions. To better understand
the influence of unbalanced data, let's plot a correlation matrix for the balanced and unbalanced datasets:

"""

# Side-by-side correlation heatmaps of the features:
# unbalanced training split on the left, undersampled (balanced) data on the right

imb_corr = X_train.corr()
corr = pd.DataFrame(X_rus).corr()

fig, ax = plt.subplots(nrows=1,ncols=2,figsize=(18,8))
fig.suptitle("Correlation Matrix")
left_ax, right_ax = ax

sns.heatmap(imb_corr, ax=left_ax, cmap="coolwarm", linewidth=.1)
left_ax.set_title("Unbalanced Data")

# label the balanced matrix with the feature names of the unbalanced one
feature_labels = imb_corr.columns
sns.heatmap(corr, ax=right_ax, cmap="coolwarm", linewidths=.1,
            xticklabels=feature_labels, yticklabels=feature_labels)
right_ax.set_title("Balanced Data")

plt.show()

In [None]:
"""

We train two models that use different algorithms to classify
the data: Logistic Regression and Decision Tree Classifier. Afterward, we analyze which
model better classifies new transactions.

"""

# First model: logistic regression with default settings, trained on the undersampled data
lr_model = LogisticRegression()
lr_model.fit(X=X_rus, y=y_rus)

# predicted class labels for the validation split
y_pred = lr_model.predict(X=X_val)

In [None]:
"""

Metrics for evaluating classification models are:

->Precision - the proportion of predicted Positives that are truly Positive
->Recall - the proportion of actual positives correctly classified
->f1-score - the harmonic mean of precision and recall
->accuracy - the proportion of true results among the total number of cases examined
->ROC score - indicates how well the probabilities from the positive classes are separated
  from the negative classes
  
"""

#Checking the metrics for the first model (validation split)
print("Classification Report for Logisitic Regression Model: \n\n", classification_report(y_val, y_pred, digits=4))

# ROC AUC measures how well the model ranks positives above negatives, so it must be
# computed from the predicted probability of the fraud class (column 1), not from the
# thresholded 0/1 labels in y_pred.
y_score = lr_model.predict_proba(X_val)[:, 1]
print("ROC AUC Score: \n\n", round(roc_auc_score(y_val, y_score),4), "\n")

#plotting the confusion matrix (row-normalized)
skplt.metrics.plot_confusion_matrix(y_val, y_pred, normalize=True);

In [None]:
# Second model: an entropy-based decision tree of limited depth, trained on the undersampled data
tree_depth = 4

dt_model = DecisionTreeClassifier(max_depth=tree_depth, criterion="entropy")
dt_model.fit(X=X_rus, y=y_rus)

# validation predictions; this overwrites the y_pred produced by the logistic regression
y_pred = dt_model.predict(X=X_val)

In [None]:
# Render the fitted decision tree as an inline PNG

# DOT description of the tree, with nodes colored and labelled by class
dot = export_graphviz(
    dt_model,
    feature_names=X.columns,
    class_names=["Normal", "Fraudulent"],
    filled=True,
    rounded=True,
)

# DOT text -> pydotplus graph -> PNG bytes -> IPython image
graph = pydotplus.graph_from_dot_data(dot)
Image(graph.create_png())

In [None]:
#Checking the metrics for the second model (validation split)
print("Classification Report for the Decision Tree Classifier: \n\n", classification_report(y_val, y_pred, digits=4))

# ROC AUC measures how well the model ranks positives above negatives, so it must be
# computed from the predicted probability of the fraud class (column 1), not from the
# thresholded 0/1 labels in y_pred.
y_score = dt_model.predict_proba(X_val)[:, 1]
print("ROC AUC Score: \n\n", round(roc_auc_score(y_val, y_score),4), "\n")

#plotting the confusion matrix (row-normalized)
skplt.metrics.plot_confusion_matrix(y_val, y_pred, normalize=True);

In [None]:
"""

Model validation:

We should check the metrics against data that the model has not seen before to validate it.
We will do that using the test data.

Let's first normalize the variables Time and Amount in the test data:

"""

# normalizing test data

test_copy = test.copy()

# The models were trained on features standardized with the *training* statistics, so the
# test rows must be transformed with those same statistics. Fitting a new scaler on the
# test set would shift/scale the features differently and leak test-set information.
# `df` is the untouched training frame, so scalers fitted on it have the training mean/std.
amount_scaler = StandardScaler().fit(df[["Amount"]])
time_scaler = StandardScaler().fit(df[["Time"]])

test_copy["std_amount"] = amount_scaler.transform(test_copy[["Amount"]]).ravel()
test_copy["std_time"] = time_scaler.transform(test_copy[["Time"]]).ravel()

# drop the raw columns so the feature layout matches the training features
test_copy = test_copy.drop(columns=["Amount", "Time"])

test_copy.head()

In [None]:
# Separate the target and the features of the held-out test set
y_test = test_copy["Class"]
X_test = test_copy.drop(columns=["Class"])

# Logistic-regression class predictions for the test features
y_pred = lr_model.predict(X=X_test)

In [None]:
"""

Finally, let's check the accuracy and confusion matrix for the logistic regression model:

"""

#Checking the metrics for the first model (held-out test set)
print("Classification Report for Logisitic Regression Model: \n\n", classification_report(y_test, y_pred, digits=4))

# ROC AUC measures how well the model ranks positives above negatives, so it must be
# computed from the predicted probability of the fraud class (column 1), not from the
# thresholded 0/1 labels in y_pred.
y_score = lr_model.predict_proba(X_test)[:, 1]
print("ROC AUC Score: \n\n", round(roc_auc_score(y_test, y_score),4), "\n")

#plotting the confusion matrix (row-normalized)
skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True);

In [None]:
# Predict the held-out test set with the decision tree (overwrites the logistic-regression y_pred)
y_pred = dt_model.predict(X_test)

In [None]:
#Checking the metrics for the second model (held-out test set)
print("Classification Report for the Decision Tree Classifier: \n\n", classification_report(y_test, y_pred, digits=4))

# ROC AUC measures how well the model ranks positives above negatives, so it must be
# computed from the predicted probability of the fraud class (column 1), not from the
# thresholded 0/1 labels in y_pred.
y_score = dt_model.predict_proba(X_test)[:, 1]
print("ROC AUC Score: \n\n", round(roc_auc_score(y_test, y_score),4), "\n")

#plotting the confusion matrix (row-normalized)
skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True);