In [None]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

import itertools, time, datetime
# Default size (width, height) applied to every matplotlib figure created below.
rcParams['figure.figsize'] = 14, 8
# Seed value intended for reproducibility; no cell visible here reads it.
RANDOM_SEED = 42
# Tick labels for the two target classes, used at positions 0 and 1 of the
# class-distribution bar chart.
LABELS = ["Non-Default", "Default"]
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio`
# package in plotly 4, where this import raises ImportError — confirm the
# plotly version this notebook is pinned to. `py` is not used in the cells
# visible here.
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot

In [None]:
# Read the credit-card CSV from the working directory. The file is
# comma-separated, which is also the delimiter pandas uses by default.
data = pd.read_csv('UCI_Credit_Card.csv')

# List the column labels so the schema is visible before any analysis.
print(data.columns)

In [None]:
# Draw a reproducible 10% random sample of the rows (fixed random_state).
# Later cells use this smaller frame for the histograms, the correlation
# heatmap and the outlier detectors.
data1= data.sample(frac = 0.1,random_state=1)

data1.shape

In [None]:
# Summary statistics for the full (unsampled) dataframe.
data.describe()

## Exploratory Data Analysis

In [None]:
data.shape

# Check the shape (rows, columns) of the imported dataframe

Let us now check for missing values in the dataset.

In [None]:
data.isnull().values.any()

# Evaluates to True if at least one cell anywhere in the dataframe is missing

In [None]:
# Preview the first rows of the full dataframe.
data.head()

In [None]:
# Count accounts per target class. Series.value_counts() replaces the
# deprecated top-level pd.value_counts(); sort_index() orders the bars by class
# value (0, then 1) so the LABELS tick names always sit under the matching bar,
# regardless of which class happens to be more frequent.
count_classes = data['default.payment.next.month'].value_counts().sort_index()

ax = count_classes.plot(kind='bar', rot=0)
# The earlier title ("Transaction Class Distribution") described a different
# dataset; the bars here are default / non-default accounts.
ax.set_title("Default Payment Class Distribution")
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency");

In [None]:
# Partition the full dataframe by the value of the target column:
# rows labelled 1 go to `Default`, rows labelled 0 go to `Normal`.
_target_values = data['default.payment.next.month']
Default = data[_target_values == 1]
Normal = data[_target_values == 0]


In [None]:
Default.shape

# (rows, columns) of the subset labelled 1; the first entry is the number of default accounts

In [None]:
Normal.shape

# (rows, columns) of the subset labelled 0; the first entry is the number of current (non-default) accounts

In [None]:
# Compare the two classes on a pair of account attributes, one panel each.
# The previous version referenced an undefined `Fraud` frame and `Time` /
# `Amount` columns, none of which exist in this notebook.
# NOTE(review): AGE and LIMIT_BAL are the column names of the public
# UCI_Credit_Card.csv schema — confirm against the printed `data.columns`.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Credit limit vs age by class')
ax1.scatter(Default['AGE'], Default['LIMIT_BAL'])
ax1.set_title('Default')
ax1.set_ylabel('Credit limit (LIMIT_BAL)')
ax2.scatter(Normal['AGE'], Normal['LIMIT_BAL'])
ax2.set_title('Non-Default')
ax2.set_xlabel('Age (AGE)')
ax2.set_ylabel('Credit limit (LIMIT_BAL)')
plt.show()


In [None]:
# Enable plotly's offline rendering inside the notebook. One call is enough:
# the removed second line invoked the very same function through its fully
# qualified name (plotly.offline.init_notebook_mode).
init_notebook_mode(connected=True)

In [None]:
# Re-display the (rows, columns) of the 10% sample used by the following plots.
data1.shape

In [None]:
# One histogram per column of the sampled frame, drawn on a single large
# 20x20 figure so the individual panels stay legible.
data1.hist(figsize=(20,20))
plt.show()

In [None]:
# Pairwise correlations between the columns of the sampled frame.
correlation_matrix = data1.corr()

# Draw the matrix as a heatmap with square cells; the colour scale is capped
# at 0.8 so that values above it share the top colour.
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=correlation_matrix, vmax=0.8, square=True)
plt.show()

Get all the columns from the sampled dataframe, then separate the features from the target.

In [None]:
# Store the variable we are predicting
target = "default.payment.next.month"

# Feature columns: every column of the sample except the target. The previous
# filter removed a column named "Class", which left the label inside X and
# leaked it to the detectors that are later scored against Y.
columns = [c for c in data1.columns.tolist() if c != target]

# Define a random state (seeded from the notebook-wide constant)
state = np.random.RandomState(RANDOM_SEED)
X = data1[columns]
Y = data1[target]

# Uniform noise with the same shape as X. No cell visible here reads it, but
# drawing it advances `state`, which is later handed to the estimators.
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))

# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

In [None]:
# Expected proportion of outliers handed to the detectors as `contamination`.
# It was never defined before use; here it is the share of rows in the sample
# whose label is 1 (Y holds 0/1 labels, so the mean is that proportion).
outlier_fraction = float(Y.mean())

# Unsupervised detectors to compare, keyed by display name.
classifiers = {
    "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(X),
                                        contamination=outlier_fraction, random_state=state, verbose=0),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, algorithm='auto',
                                               leaf_size=30, metric='minkowski',
                                               p=2, metric_params=None, contamination=outlier_fraction),
    # OneClassSVM's `random_state` argument was deprecated and then removed
    # from scikit-learn, so passing it raises TypeError on current releases;
    # it is dropped here.
    "Support Vector Machine": OneClassSVM(kernel='rbf', degree=3, gamma=0.1, nu=0.05,
                                          max_iter=-1),
}

In [None]:
n_outliers = len(Default)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # Fit the detector on X and collect its raw labels (1 or -1).
    if clf_name == "Local Outlier Factor":
        # This detector is fitted and labelled in a single call.
        raw_labels = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        if clf_name != "Support Vector Machine":
            scores_prediction = clf.decision_function(X)
        raw_labels = clf.predict(X)

    # Recode the labels to match Y: -1 becomes 1 (Default accounts),
    # 1 becomes 0 (Current accounts).
    y_pred = np.where(raw_labels == -1, 1, 0)
    n_errors = (y_pred != Y).sum()

    # Run Classification Metrics
    print(f"{clf_name}: {n_errors}")
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred))

In [None]:
# Feature names: every column of the full dataframe except the label.
columns = data.columns.tolist()
columns.remove('default.payment.next.month')

# Plain NumPy arrays for the features and the label.
x_attributes = data[columns].values
y_target = data['default.payment.next.month'].values

# Hold out 30% of the rows for testing. Passing stratify=y_target makes the
# train and test sets keep the same class proportions as y_target.
x_train, x_test, y_train, y_test = train_test_split(
    x_attributes,
    y_target,
    test_size=0.30,
    stratify=y_target,
    random_state=1,
)

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; cell [i, j] is annotated at row i, column j, with
        rows on the 'True label' axis and columns on the 'Predicted label' axis.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        When True each row is divided by its row sum before printing and
        plotting, and cells are formatted with two decimals.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The matrix is printed and the figure is shown with plt.show().
    """
    if normalize:
        # Divide every row by its own total so each row sums to 1.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # `arange` alone is not in scope in this notebook; use numpy's explicitly.
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum value get white text, the rest black,
    # so the annotation stays readable on both ends of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
def runGridSearchAndPredict(pipeline, x_train, y_train, x_test, y_test, param_grid,
                            cv=10, n_jobs=1, scoring='accuracy'):
    """Tune `pipeline` with a grid search, evaluate it on the test set and report.

    Parameters
    ----------
    pipeline : estimator
        Estimator (here a scikit-learn Pipeline) handed to GridSearchCV.
    x_train, y_train : array-like
        Data the grid search is fitted on.
    x_test, y_test : array-like
        Held-out data used for the printed and returned metrics.
    param_grid : dict
        Parameter grid forwarded to GridSearchCV.
    cv : int, default 10
        Number of cross-validation folds.
    n_jobs : int, default 1
        Parallel jobs used by the grid search.
    scoring : str, default 'accuracy'
        Metric the grid search optimises.

    Returns
    -------
    dict with the keys
        'y_prediction'          predictions for x_test,
        'accuracy_score'        accuracy on the test set,
        'f1_score'              F1 score on the test set,
        'classification_report' text report for the test set,
        'best_params'           parameters selected by the search,
        'best_score'            cross-validated score of those parameters,
        'best_estimator'        the refitted best estimator.
    """
    response = {}

    gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv,
                              n_jobs=n_jobs, scoring=scoring)

    search = gridsearch.fit(x_train, y_train)

    print("Grid Search Best parameters ", search.best_params_)
    print("Grid Search Best score ", search.best_score_)

    y_prediction = search.predict(x_test)

    # Compute each metric once so the printed and the returned values agree.
    accuracy = accuracy_score(y_test, y_prediction)
    f1 = f1_score(y_test, y_prediction)
    report = classification_report(y_test, y_prediction)

    print("Accuracy score %s" % accuracy)
    print("F1 score %s" % f1)
    print("Classification report  \n %s" % report)

    # NOTE(review): plotLearningCurve is not defined in the cells visible
    # here — confirm it is defined before this function is first called.
    plotLearningCurve(x_train, y_train, search.best_estimator_)

    # The function previously returned an empty dict, although its caller
    # reads 'y_prediction' and 'accuracy_score' from the result.
    response['y_prediction'] = y_prediction
    response['accuracy_score'] = accuracy
    response['f1_score'] = f1
    response['classification_report'] = report
    response['best_params'] = search.best_params_
    response['best_score'] = search.best_score_
    response['best_estimator'] = search.best_estimator_

    return response

In [None]:
# One entry per supervised model: (pipeline step name, estimator, grid).
# The grid keys are prefixed with the step name followed by a double
# underscore, so each name below must match its grid's prefix.
_model_specs = [
    (
        'logisticregression',
        LogisticRegression(random_state=1),
        {'logisticregression__C': [100, 200, 300, 50, 20, 600]},
    ),
    (
        'decisiontreeclassifier',
        DecisionTreeClassifier(random_state=1, criterion='gini'),
        {'decisiontreeclassifier__max_depth': [6, 7, 8, 9, 10, 11]},
    ),
    (
        'kneighborsclassifier',
        KNeighborsClassifier(metric='minkowski'),
        {'kneighborsclassifier__n_neighbors': [4, 6, 7, 8]},
    ),
]

# Unpack into the three parallel lists the evaluation loop iterates over.
classifiers = [estimator for _, estimator, _ in _model_specs]
classifier_names = [step_name for step_name, _, _ in _model_specs]
classifier_param_grid = [grid for _, _, grid in _model_specs]

In [None]:
# Evaluation results per model, keyed by the pipeline step name.
model_metrics = {}

for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):

    # Scale the features, then apply the estimator; the estimator's step name
    # is `model_name`, which is also the prefix used in its parameter grid.
    pipeline = Pipeline([('scaler', StandardScaler()),(model_name, model)])

    result = runGridSearchAndPredict(pipeline, x_train, y_train, x_test, y_test, model_param_grid)

    # NOTE(review): this reads 'y_prediction' (and 'accuracy_score' below)
    # from the returned dict — confirm runGridSearchAndPredict fills both keys.
    y_prediction = result['y_prediction']

    _matrix = confusion_matrix(y_true = y_test ,y_pred = y_prediction)

    # Store the confusion matrix and the accuracy for this model.
    model_metrics[model_name] = {}
    model_metrics[model_name]['confusion_matrix'] = _matrix
    model_metrics[model_name]['accuracy_score'] = result['accuracy_score']