In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the stage-1 insurance CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('../input/31012021-insurance/data-stage1-31012021.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the frame as loaded.
df.info()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that holds their value range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, based on each column's observed min and max.
    Columns whose dtype is not one of the listed numeric dtypes are left untouched.

    Note: float16 stores roughly three significant decimal digits, so float
    columns that fit its range are rounded when narrowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            # (The original strict comparison pushed such columns to the next wider type.)
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max is NaN): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast numeric columns (the function mutates and returns the same frame),
# then re-inspect dtypes and memory to confirm the effect.
df = reduce_mem_usage(df)
df.info()

In [None]:
# Target vector is the 'Response' column; the feature matrix is every other column.
y = df['Response'].to_numpy()
X = df.drop(columns='Response').to_numpy()

In [None]:
# Re-display the frame summary; df itself is not modified by the X / y extraction.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y — confirm this is intended if 'Response' is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [None]:
# Shapes of the four split arrays, one per line (train X, test X, train y, test y).
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

In [None]:
# Dump the four split arrays, one after the other.
print(X_train, X_test, y_train, y_test, sep='\n')

Library Modelling

In [None]:
# Empty scoreboard; later cells add one row per model with its test-set metrics.
df_results = pd.DataFrame(columns = ['Method', 'Accuracy', 'F1_score', 'AUC'])

In [None]:
# Display the (still empty) results table.
df_results

Logistic Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Search space: L2 penalty over a narrow band of inverse regularisation strengths.
penalty = ['l2']
C = [0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.36]
hyperparameters = {'penalty': penalty, 'C': C}

classifier = LogisticRegression(random_state=42)

# 5-fold randomized search, ranked by ROC AUC, using all available cores.
clf = RandomizedSearchCV(
    classifier,
    hyperparameters,
    cv=5,
    random_state=42,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
# fit() returns the search object itself, so best_model and clf are the same object.
clf.fit(X_train, y_train)
best_model = clf

print(best_model.best_estimator_)

# Hard labels and class probabilities on the held-out split.
y_pred_proba = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Predict once and reuse the labels; the original re-ran best_model.predict(X_test) for every metric.
y_pred = best_model.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'Logistic Regression',
                   'Accuracy': accuracy_score(y_test, best_model.predict(X_test)),
                   'F1_score': f1_score(y_test, best_model.predict(X_test)),
                   'AUC': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Score the test split once; column 1 of predict_proba is used as the positive-class score.
y_pred_proba = best_model.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])

plt.title('Logistic Regression')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
import pickle
filename = 'logistic.sav'
# Persist the fitted search object; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
filename = 'logistic.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
best_model.best_estimator_

KNN

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform

# Search space: odd neighbour counts 3..13, three distance metrics, four neighbour-search algorithms.
n_neighbors = list(range(3, 14, 2))
metric = ['euclidean', 'manhattan', 'minkowski']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
hyperparameters = {'n_neighbors': n_neighbors, 'metric': metric, 'algorithm': algorithm}

classifier = KNeighborsClassifier()

# 5-fold randomized search, ranked by ROC AUC, using all available cores.
clf = RandomizedSearchCV(
    classifier,
    hyperparameters,
    cv=5,
    random_state=42,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
# fit() returns the search object itself, so best_model and clf are the same object.
clf.fit(X_train, y_train)
best_model = clf

print(best_model.best_estimator_)

# Hard labels and class probabilities on the held-out split.
y_pred_proba = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Predict once and reuse the labels; KNN prediction is costly and the original repeated it per metric.
y_pred = best_model.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'KNN',
                   'Accuracy': accuracy_score(y_test, best_model.predict(X_test)),
                   'F1_score': f1_score(y_test, best_model.predict(X_test)),
                   'AUC': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Score the test split once; column 1 of predict_proba is used as the positive-class score.
y_pred_proba = best_model.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])

plt.title('KNN')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
import pickle
filename = 'knn.sav'
# Persist the fitted search object; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
filename = 'knn.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
best_model.best_estimator_

Kernel SVM

In [None]:
from sklearn.svm import SVC

# Search space: RBF kernel with four regularisation strengths.
kernel = ['rbf']
C = [0.1, 0.3, 0.5, 0.7]
hyperparameters = {'kernel': kernel, 'C': C}

# probability=True is required so predict_proba is available for ROC AUC.
classifier = SVC(probability=True, random_state=42)

# 2-fold randomized search, ranked by ROC AUC, using all available cores.
clf = RandomizedSearchCV(
    classifier,
    hyperparameters,
    cv=2,
    random_state=42,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)
# fit() returns the search object itself, so best_model and clf are the same object.
clf.fit(X_train, y_train)
best_model = clf

print(best_model.best_estimator_)

# Hard labels and class probabilities on the held-out split.
y_pred_proba = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Predict once and reuse the labels; SVC prediction is costly and the original repeated it per metric.
y_pred = best_model.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'SVM',
                   'Accuracy': accuracy_score(y_test, best_model.predict(X_test)),
                   'F1_score': f1_score(y_test, best_model.predict(X_test)),
                   'AUC': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Score the test split once; column 1 of predict_proba is used as the positive-class score.
y_pred_proba = best_model.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])

plt.title('SVM')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
import pickle
filename = 'svm.sav'
# Persist the fitted search object; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
filename = 'svm.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
best_model.best_estimator_

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes is fitted directly (no hyperparameter search); fit() returns the estimator.
classifier = GaussianNB().fit(X_train, y_train)

# Hard labels and class probabilities on the held-out split.
y_pred_proba = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Evaluate the Naive Bayes model held in `classifier`. The original scored `best_model`,
# which at this point is still the SVM search, so the printed numbers were SVM's.
y_pred = classifier.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# Record the Naive Bayes scores from `classifier` (the original used `best_model`, i.e. the SVM search).
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'Naive Bayes',
                   'Accuracy': accuracy_score(y_test, classifier.predict(X_test)),
                   'F1_score': f1_score(y_test, classifier.predict(X_test)),
                   'AUC': roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Use the Naive Bayes model in `classifier`; the original plotted `best_model` (the SVM search).
y_pred_proba = classifier.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])

plt.title('Naive Bayes')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
import pickle
filename = 'naiveb.sav'
# Persist the fitted GaussianNB; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
filename = 'naiveb.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
print("Done")
# The pickled object is the bare GaussianNB, not a search wrapper, so it has no
# best_estimator_ attribute (the original raised AttributeError here). Show the model itself.
best_model

Decision Tree

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter candidates
max_depth = [int(x) for x in np.linspace(1, 110, num = 30)] # Maximum number of levels in tree
min_samples_split = [2, 5, 10, 100] # Minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4, 10, 20, 50] # Minimum number of samples required at each leaf node
# 'auto' was an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3,
# where it makes the affected candidates fail to fit; only 'sqrt' is kept.
max_features = ['sqrt'] # Number of features to consider at every split
criterion= ['gini', 'entropy']

# Search space passed to RandomizedSearchCV
hyperparameters = {
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features,
               'criterion' : criterion
                }

classifier = DecisionTreeClassifier(random_state = 42)

# 15 sampled candidates, 5-fold CV, ranked by ROC AUC.
clf = RandomizedSearchCV(classifier, hyperparameters, cv = 5, random_state=42, n_iter = 15, scoring='roc_auc', verbose = 1, n_jobs = -1)
best_model = clf.fit(X_train, y_train)

print(best_model.best_estimator_)

# Hard labels and class probabilities on the held-out split.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Predict once and reuse the labels; the original re-ran best_model.predict(X_test) for every metric.
y_pred = best_model.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'Decision Tree',
                   'Accuracy': accuracy_score(y_test, best_model.predict(X_test)),
                   'F1_score': f1_score(y_test, best_model.predict(X_test)),
                   'AUC': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Score the test split once; column 1 of predict_proba is used as the positive-class score.
y_pred_proba = best_model.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])

# The title was a copy-paste leftover ('Logistic Regression'); this curve is the decision tree's.
plt.title('Decision Tree')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
# Impurity-based importances of the tuned tree, labelled with the feature column names.
importance = best_model.best_estimator_.feature_importances_
feat_importances = pd.Series(
    importance,
    index=pd.Series(df.drop(columns='Response').columns),
)

# Horizontal bar chart of the ten largest importances.
top_ten = feat_importances.nlargest(10)
top_ten.plot(kind='barh')
plt.xlabel('score')
plt.ylabel('feature')
plt.title('feature importance score')

In [None]:
import pickle
filename = 'dectree.sav'
# Persist the fitted search object; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
filename = 'dectree.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
best_model.best_estimator_

# Bagging

Random forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter candidates
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] # Number of trees in random forest
# 'auto' was an alias of 'sqrt' for classifiers and was removed in scikit-learn 1.3, so it is dropped.
max_features = ['sqrt', 'log2'] # Number of features to consider at every split
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)] # Maximum number of levels in tree
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 5)] # Minimum number of samples required to split a node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)] # Minimum number of samples required at each leaf node
bootstrap = [True, False] # Method of selecting samples for training each tree (defined, but not part of the search below)
n_jobs = [-1] # defined, but not part of the search below

# Assemble the search space as a dictionary. Each key now maps to its own candidate list:
# the original had min_samples_leaf / min_samples_split swapped, which offered
# min_samples_split=1 — an invalid value (an integer min_samples_split must be >= 2).
random_search = {'criterion': ['entropy','gini'],
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'n_estimators': n_estimators,
                'max_features' : max_features}

classifier = RandomForestClassifier(random_state = 42)

# 5-fold randomized search, ranked by ROC AUC, using all available cores.
clf = RandomizedSearchCV(classifier, random_search, cv = 5, random_state=42, scoring='roc_auc', verbose = 4, n_jobs = -1)
best_model = clf.fit(X_train, y_train)

print(best_model.best_estimator_)

# Hard labels and class probabilities on the held-out split.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Predict once so every metric is computed from the same labels; the original
# mixed the stored y_pred with a fresh best_model.predict call for F1.
y_pred = best_model.predict(X_test)

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'Random Forest',
                   'Accuracy': accuracy_score(y_test, best_model.predict(X_test)),
                   'F1_score': f1_score(y_test, best_model.predict(X_test)),
                   'AUC': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])}]),
], ignore_index=True)

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve from the probabilities stored by the Random Forest training cell.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:,1])

# The title mentioned "CC Fraud", a leftover from another project; this notebook models insurance 'Response'.
plt.title('Random Forest')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))

In [None]:
# Cross-check the plotted AUC with roc_auc_score on the second predict_proba column.
roc_auc_score(y_test, y_pred_proba[:,1])

In [None]:
import pickle
filename = 'rforest1.sav'
# Persist the fitted search object; the context manager closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
filename = 'rforest1.sav'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:
    best_model = pickle.load(model_file)
best_model.best_estimator_

ANN (Artificial Neural Network)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from keras.metrics import AUC
from sklearn.model_selection import cross_val_score

In [None]:
def create_baseline(input_dim=15):
    """Build and compile a one-hidden-layer binary classifier.

    Architecture: Dense(15, relu) -> Dense(1, sigmoid), compiled with binary
    cross-entropy loss, the Adam optimizer and a Keras AUC metric.

    Parameters
    ----------
    input_dim : int, default 15
        Number of input features. The default keeps the original hard-coded
        width; pass ``X_train.shape[1]`` when the feature count differs.

    Returns
    -------
    keras.models.Sequential
        A freshly initialised, compiled (untrained) model.
    """
    model = Sequential()
    model.add(Dense(15, input_dim = input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=[AUC()])
    return model

In [None]:
cvscores = []
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
for train, test in kfold.split(X_train,y_train):
    model = create_baseline()
    # Train on this fold's training rows and score on its held-out rows. The original
    # ignored the fold indices and fitted every model on all of X_train while scoring on
    # the test split, so no cross-validation actually took place.
    history = model.fit(X_train[train], y_train[train], epochs = 3, batch_size = 32, verbose = 1,
                        validation_data = (X_train[test], y_train[test]))
    scores = model.evaluate(X_train[test], y_train[test], verbose = 1)
    print("\n %s: %.2f%%\n---------\n" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("AUC Result for Cross-Validation")
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

# Refit on the full training split so the cells below evaluate a model that has seen
# all training rows (as the original's final loop iteration did); the test split is
# passed only to monitor validation metrics.
model = create_baseline()
history = model.fit(X_train, y_train, epochs = 3, batch_size = 32, verbose = 1, validation_data =(X_test,y_test))

In [None]:
# Layer-by-layer summary of the most recently built ANN.
model.summary()

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column into a 1-D label vector.
y_pred = model.predict(X_test, batch_size = 32)
y_pred = np.where(y_pred >= 0.5, 1, 0).ravel()

print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))

print('\nAccuracy')
print(accuracy_score(y_test, y_pred))

print('\nF1_Score')
# Score the ANN's own predictions; the original used best_model (the Random Forest) here.
print(f1_score(y_test, y_pred))

print('\nClassification report')
print(classification_report(y_test, y_pred))  # per-class precision, recall, F1 and support

In [None]:
from sklearn.metrics import roc_curve, auc

# The single sigmoid output column is the positive-class score.
fpr, tpr, _ = roc_curve(y_test, model.predict(X_test, batch_size = 32)[:,0])

# The title was a copy-paste leftover ('Random Forest ROC curve: CC Fraud'); this curve is the ANN's.
plt.title('ANN')
# The x-axis is the false positive rate; the original label called it "Precision", which it is not.
plt.xlabel('FPR (1 - Specificity)')
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))

In [None]:
import h5py

# Persist the ANN in two parts: architecture as JSON, weights as HDF5.
# Serialize model architecture to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize weights to HDF5
# NOTE(review): newer Keras releases may require a '.weights.h5' suffix for save_weights — confirm against the installed version.
model.save_weights("ANN.h5")
print("Saved model to disk")

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
# AUC is computed with roc_auc_score on the test split from the final model's sigmoid output,
# the same way as every other row (the original stored the mean Keras AUC of three separate models).
df_results = pd.concat([
    df_results,
    pd.DataFrame([{'Method': 'ANN',
                   'Accuracy': accuracy_score(y_test, y_pred),
                   'F1_score': f1_score(y_test, y_pred),
                   'AUC': roc_auc_score(y_test, model.predict(X_test, batch_size = 32)[:, 0])}]),
], ignore_index=True)