In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
from matplotlib import gridspec 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the credit-card transactions CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
# info is a method and has to be called; a bare `df.info` only displays the bound-method repr.
df.info()

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report over the whole frame and write it as an HTML file
# (relative path, so it lands in the current working directory).
design_report = ProfileReport(df)
design_report.to_file(output_file='credit card detection.html') #for making a pandas profile report

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Share of each class label among all rows.
class_counts = df['Class'].value_counts()
print('Proportion of the classes in the data:')
print(class_counts.div(len(df)))

In [None]:
# Largest per-column count of missing values; a printed 0 means no column has nulls.
missing_per_column = df.isna().sum()
print(missing_per_column.max())

In [None]:
# List the column names of the frame.
df.columns

In [None]:
# Percentage of rows in each class, rounded to two decimals.
n_rows = len(df)
fraud_pct = round((df['Class'] == 1).sum() / n_rows * 100, 2)
valid_pct = round((df['Class'] == 0).sum() / n_rows * 100, 2)
print("Frauds in imbalanced dataset are", fraud_pct, "%")
print("Valid Transactions in imbalanced dataset are", valid_pct, "%")

In [None]:
# Horizontal bar chart of the number of rows per class; also saved as a PNG.
fig, ax = plt.subplots(figsize=(20, 2))
sns.countplot(y=df['Class'], ax=ax)
fig.savefig('countofdata.png')
plt.show()

In [None]:
# Overlay the distribution of transaction time for the two classes.
class_0 = df.loc[df['Class'] == 0, "Time"]
class_1 = df.loc[df['Class'] == 1, "Time"]
plt.figure(figsize = (14,4))
plt.title('Credit Card Transactions Time Density Plot')
# Label each series so the legend tells the two overlaid curves apart.
sns.distplot(class_0, kde=True, bins=480, color='red', label='Class 0 (valid)')
sns.distplot(class_1, kde=True, bins=480, color='blue', label='Class 1 (fraud)')
plt.legend()
plt.show()

In [None]:
# Interpret 'Time' as seconds and split it into components; keep the minute and
# hour components as integer columns.
timedelta = pd.to_timedelta(df['Time'], unit='s')
time_parts = timedelta.dt.components
df['Time_min'] = time_parts['minutes'].astype(int)
df['Time_hour'] = time_parts['hours'].astype(int)

In [None]:
# Distribution of the hour-of-day component for valid vs. fraudulent transactions.
plt.figure(figsize=(12,5))
sns.distplot(df.loc[df['Class'] == 0, "Time_hour"],
             color='g', label='Class 0 (valid)')
sns.distplot(df.loc[df['Class'] == 1, "Time_hour"],
             color='r', label='Class 1 (fraud)')
plt.title('Fraud x Normal Transactions by Hours', fontsize=17)
plt.xlim([-1,25])
# The legend identifies which colour belongs to which class.
plt.legend()
plt.show()

In [None]:
# Distribution of the minute component for valid vs. fraudulent transactions.
plt.figure(figsize=(12,5))
sns.distplot(df.loc[df['Class'] == 0, "Time_min"],
             color='g', label='Class 0 (valid)')
sns.distplot(df.loc[df['Class'] == 1, "Time_min"],
             color='r', label='Class 1 (fraud)')
plt.title('Fraud x Normal Transactions by Minutes', fontsize=17)
plt.xlim([-1,60])
# The legend identifies which colour belongs to which class.
plt.legend()
plt.show()

In [None]:
# Side-by-side distributions of the transaction amount and the transaction time.
fig,ax = plt.subplots(1, 2, figsize=(18,4))

amount_val = df['Amount'].values
time_val = df['Time'].values

sns.distplot(amount_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
# ndarray.min()/max() are vectorised; the builtin min()/max() loop over every element in Python.
ax[0].set_xlim([amount_val.min(), amount_val.max()])

sns.distplot(time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])

plt.show()

In [None]:
pip install mlxtend  

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Pairwise correlation between all columns of df, drawn as a heatmap.
plt.figure(figsize=(10,6))
corr_mat = df.corr()
sns.heatmap(corr_mat, cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()

In [None]:
# Drop the raw and derived time columns. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the columns are already gone.
df = df.drop(columns=['Time', 'Time_hour', 'Time_min'], errors='ignore')
# Feature matrix: every column except the label.
X = df.drop(columns='Class').to_numpy()
# Labels as a column vector of shape (n_samples, 1).
y = df[['Class']].to_numpy()
# standardize the data
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so
# test-set statistics leak into training; consider fitting it on the training split only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Shuffled, stratified split with 33% held out for testing and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=y,
)
# Report the shape of each of the four resulting arrays.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print("Number transactions {} dataset: ".format(split_name), split_array.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier using the L-BFGS solver; fitted on the original
# (imbalanced) training split in the next cell.
lr = LogisticRegression(solver = 'lbfgs')

In [None]:
# Fit on the training split; ravel() flattens the (n, 1) label array to 1-D.
lr.fit(X_train,y_train.ravel())

In [None]:
# Hard class predictions of the fitted model on the training and test splits.
train_pred = lr.predict(X_train)
y_pred=lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,roc_auc_score, precision_recall_curve, roc_curve, auc, average_precision_score

In [None]:
# Test-split evaluation of the model's predictions: per-class precision/recall/F1,
# overall accuracy, and the confusion matrix.
print(classification_report(y_test, y_pred)) 
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Cross-tabulate true vs. predicted labels on the training split;
# margins=True adds row and column totals.
print('Confusion Matrix - Training Dataset')
print(pd.crosstab(y_train.ravel(), train_pred, rownames = ['True'], colnames = ['Predicted'], margins = True))

In [None]:
# Average precision summarises the precision-recall curve, so it needs continuous
# scores: use the predicted probability of the positive (fraud) class rather than
# the hard 0/1 predictions.
lr_test_scores = lr.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_test, lr_test_scores)

In [None]:
# ROC-AUC is a ranking metric: score it on the predicted fraud probabilities,
# not on the thresholded 0/1 predictions.
print("Area under the curve : %f" % (roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])))

In [None]:
from mlxtend.plotting import plot_confusion_matrix
con_mat=confusion_matrix(y_test, y_pred)
def confus_matrix(CM, title="Confusion Matrix - Testing Dataset"):
    """Plot a 2x2 confusion matrix and print its accuracy and recall.

    Parameters
    ----------
    CM : array-like of shape (2, 2)
        Confusion matrix with the actual classes on the rows and the predicted
        classes on the columns (the layout the axis labels below assume).
    title : str, optional
        Title drawn above the plot.
    """
    CM = np.asarray(CM)
    fig, ax = plot_confusion_matrix(conf_mat= CM)
    plt.title(title)
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()
    total = CM[0,0] + CM[0,1] + CM[1,0] + CM[1,1]
    actual_positive = CM[1,0] + CM[1,1]
    # Guard the ratios so an empty matrix or a split without positives prints nan
    # instead of dividing by zero.
    accuracy = (CM[1,1] + CM[0,0]) / total * 100 if total else float("nan")
    recall = CM[1,1] / actual_positive * 100 if actual_positive else float("nan")
    print("The accuracy is "+str(accuracy) + " %")
    print("The recall from the confusion matrix is "+ str(recall) +" %")
confus_matrix(con_mat)

In [None]:
# Build the curve from the predicted fraud probabilities; hard 0/1 predictions
# would collapse it to a single operating point.
lr_test_scores = lr.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, lr_test_scores)
# Compute the AP from the same scores so the title matches the plotted curve.
pr_average_precision = average_precision_score(y_test, lr_test_scores)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(pr_average_precision))
plt.show()

In [None]:
# ROC curve of the logistic regression on the test split, built from predicted
# fraud probabilities so that every threshold contributes a point.
lr_test_scores = lr.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, lr_test_scores)
roc_auc_rf = auc(fpr_rf, tpr_rf)
plt.figure(figsize=(8,8))
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
# The plotted model is a logistic regression, hence the 'LR' legend label.
plt.plot(fpr_rf, tpr_rf, lw=1, label='{} curve (AUC = {:0.2f})'.format('LR',roc_auc_rf))

plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
# gca() returns the axes already holding the curve; a bare plt.axes() would add a
# new, empty axes on top of it.
plt.gca().set_aspect('equal')
plt.show()

In [None]:
# Accuracy is symmetric in its two arguments, so passing (y_true, y_pred) in the
# conventional order yields the same values.
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score for Training Dataset = ', train_accuracy)
print('Accuracy score for Testing Dataset = ', test_accuracy)

In [None]:
# Share of the actual fraud cases in the training split that the model labelled as
# not fraud, computed from the predictions instead of hand-copied counts.
train_fraud_mask = y_train.ravel() == 1
(train_pred[train_fraud_mask] == 0).sum() / train_fraud_mask.sum()

That is a whopping 41%! We are classifying 41% of the fraud cases as not fraud. This is going to cause serious losses for the credit card company. You can observe the same pattern in the confusion matrix of the Testing Dataset.

The higher accuracy is not due to correct classification. The model has predicted the majority class for almost all the examples. And since about 99.8% of the examples actually belong to this class, it leads to such high accuracy scores.

The accuracy we obtained looks excellent, but it is misleading because the dataset is heavily imbalanced.
Imbalanced classes are a common problem in machine learning classification, where there is a disproportionate ratio 
of observations in each class. Class imbalance can be found in many different areas including medical diagnosis, spam filtering, and fraud detection.

In [None]:
#Now, we will apply different imbalanced data handling techniques and see their accuracy and recall results.

In [None]:
#Using SMOTE Algorithm

In [None]:
# Class counts in the training labels before resampling; int(...sum()) prints a plain
# number instead of a one-element array for the (n, 1) label array.
print("Before OverSampling, counts of label '1': {}".format(int((y_train == 1).sum()))) 
print("Before OverSampling, counts of label '0': {} \n".format(int((y_train == 0).sum()))) 

# import SMOTE module from imblearn library 
# pip install imblearn (if you don't have imblearn in your system) 
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 2) 
# fit_resample is the supported API; the older fit_sample alias was deprecated and
# later removed from imbalanced-learn. Only the training split is resampled.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) 

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 

print("After OverSampling, counts of label '1': {}".format(int((y_train_res == 1).sum()))) 
print("After OverSampling, counts of label '0': {}".format(int((y_train_res == 0).sum()))) 


The SMOTE algorithm has oversampled the minority class until it matches the majority class.
Both classes now have the same number of records; more specifically, the minority class 
has been increased to the size of the majority class.
Now let us look at the accuracy and recall after applying the SMOTE algorithm (oversampling).

In [None]:
# Logistic regression with default settings, fitted on the SMOTE-resampled training data.
smote_logistic = LogisticRegression()
smote_logistic.fit(X_train_res,y_train_res)

In [None]:
# Hard class predictions of the SMOTE-trained model on the resampled training data
# and on the test split.
train_pred_sm = smote_logistic.predict(X_train_res)
y_smote= smote_logistic.predict(X_test)

In [None]:
# Accuracy of the SMOTE-trained model's predictions on the test split.
accuracy_score(y_test,y_smote)

In [None]:
# F1 score of the SMOTE-trained model's predictions on the test split.
f1_score(y_test, y_smote)

In [None]:
# Bar chart of the label counts in the resampled training labels.
pd.Series(y_train_res).value_counts().plot.bar()

In [None]:
# Accuracy is symmetric in its two arguments, so passing (y_true, y_pred) in the
# conventional order yields the same values.
train_accuracy_sm = accuracy_score(y_train_res, train_pred_sm)
test_accuracy_sm = accuracy_score(y_test, y_smote)
print('Accuracy score for Training Dataset = ', train_accuracy_sm)
print('Accuracy score for Testing Dataset = ', test_accuracy_sm)

In [None]:
# Average precision of the SMOTE-trained model, computed from its predicted fraud
# probabilities (y_pred holds the predictions of the model trained without SMOTE).
smote_test_scores = smote_logistic.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_test, smote_test_scores)

In [None]:
# ROC-AUC of the SMOTE-trained model, scored on its predicted fraud probabilities
# (y_pred holds the predictions of the model trained without SMOTE).
print("Area under the curve : %f" % (roc_auc_score(y_test, smote_logistic.predict_proba(X_test)[:, 1])))

In [None]:
from mlxtend.plotting import plot_confusion_matrix
# Confusion matrix of the SMOTE-trained model on the test split. confus_matrix is
# already defined in an earlier cell, so it is reused here rather than redefined.
con_mat=confusion_matrix(y_test, y_smote)
confus_matrix(con_mat)

In [None]:
# Build the curve from the SMOTE-trained model's predicted fraud probabilities;
# hard 0/1 predictions would collapse it to a single operating point.
smote_test_scores = smote_logistic.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, smote_test_scores)
# Compute the AP from the same scores so the title matches the plotted curve.
pr_average_precision = average_precision_score(y_test, smote_test_scores)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(pr_average_precision))
plt.show()

In [None]:
# ROC curve of the SMOTE-trained logistic regression on the test split, built from
# predicted fraud probabilities so that every threshold contributes a point.
smote_test_scores = smote_logistic.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, smote_test_scores)
roc_auc_rf = auc(fpr_rf, tpr_rf)
plt.figure(figsize=(8,8))
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
# The plotted model is a logistic regression trained on SMOTE-resampled data.
plt.plot(fpr_rf, tpr_rf, lw=1, label='{} curve (AUC = {:0.2f})'.format('LR + SMOTE',roc_auc_rf))

plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
# gca() returns the axes already holding the curve; a bare plt.axes() would add a
# new, empty axes on top of it.
plt.gca().set_aspect('equal')
plt.show()

In [None]:
# Cross-tabulate true vs. predicted labels on the resampled training data;
# margins=True adds row and column totals.
print('Confusion Matrix - Training Dataset')
print(pd.crosstab(y_train_res, train_pred_sm, rownames = ['True'], colnames = ['Predicted'], margins = True))

In [None]:
# Share of the fraud cases in the resampled training data that the SMOTE-trained model
# labelled as not fraud, computed from the predictions instead of hand-copied counts.
res_fraud_mask = y_train_res == 1
(train_pred_sm[res_fraud_mask] == 0).sum() / res_fraud_mask.sum()

In the oversampled training set, 16685 out of 190490 fraud cases have been classified as not fraud. That is about 8.8%, compared with the previous 41%.

A vast improvement!

Same is the case with the Testing Dataset.

In [None]:
# Cross-tabulate true vs. predicted labels of the SMOTE-trained model on the test split;
# margins=True adds row and column totals.
print('Confusion Matrix - Testing Dataset')
print(pd.crosstab(y_test.ravel(), y_smote, rownames = ['True'], colnames = ['Predicted'], margins = True))

In [None]:
# Share of the actual fraud cases in the test split that the SMOTE-trained model
# labelled as not fraud, computed from the predictions instead of hand-copied counts.
test_fraud_mask = y_test.ravel() == 1
(y_smote[test_fraud_mask] == 0).sum() / test_fraud_mask.sum()

Roughly 7.4% of the fraud classes have been classified as not fraud.

CONCLUSION

Implementing SMOTE on our imbalanced dataset helped us with the imbalance of our labels (far more non-fraud than fraud transactions).

However, the model trained on the oversampled data misclassifies a large number of non-fraud transactions as fraud. 
Imagine that people making regular purchases got their cards blocked because our model flagged their transactions as fraud; this would be a huge disadvantage for the financial institution. The number of customer complaints and customer dissatisfaction would increase. 
The next step of this analysis will be to do an outlier removal on our oversampled dataset and see if our accuracy in the test set improves.