# Machine Learning Classification

In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score

In [2]:
dataset = pd.read_csv('Data after PP/Data')

In [3]:
dataset = dataset.drop(['Unnamed: 0'],axis=1)

In [4]:
dataset

Unnamed: 0,mu1,lam1,mu2,sigma,mu3,sigma3,w1,w2,w3,n_spikes,Target
0,0.099999,0.010000,1.376800e-01,0.069408,0.368091,0.172628,0.286221,4.179009e-01,2.958779e-01,1015.0,0.0
1,0.019293,0.019831,6.633874e-02,0.000100,0.100001,0.000520,1.000000,1.192093e-07,1.421085e-14,10364.0,0.0
2,0.043575,0.029764,1.779872e-01,0.000100,0.100064,0.000260,1.000000,1.192093e-07,1.421085e-14,4589.0,0.0
3,0.024164,0.022119,6.418069e-02,0.000100,0.100010,0.000718,1.000000,1.192093e-07,1.421085e-14,8274.0,0.0
4,0.048184,0.030814,4.696582e-03,0.000100,0.100139,0.000218,1.000000,1.192093e-07,1.421085e-14,4151.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
543,0.100000,0.011880,1.038845e-01,0.058294,0.483309,0.271518,0.332834,4.549999e-01,2.121657e-01,1029.0,1.0
544,0.018542,0.018771,1.807094e-01,0.000100,0.100000,0.000106,1.000000,1.192093e-07,1.421085e-14,10784.0,1.0
545,0.099969,0.039687,9.343772e-02,0.040354,0.267501,0.112234,0.630343,2.080073e-01,1.616492e-01,1523.0,1.0
546,0.056240,0.031443,1.643654e-07,0.000100,0.100001,0.000100,1.000000,1.192093e-07,1.421085e-14,3555.0,1.0


In [5]:
dataset['Target'].value_counts()

0.0    317
1.0    231
Name: Target, dtype: int64

## Train test split

In [6]:
X = dataset.drop(['Target'],axis=1)
y= dataset['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

## Standardization

In [7]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Comparison

In [8]:
cv = StratifiedKFold(4)

## 1) Decision Tree Classifier

In [9]:
params = {'max_depth':np.linspace(2,10,dtype='int'),'criterion':['gini','entropy']}

clf = DecisionTreeClassifier(random_state=12)

gs = GridSearchCV(clf, params, scoring='roc_auc',cv=cv,n_jobs=-1)

gs=gs.fit(X_train,y_train)

In [10]:
print("Best parameters set found :",gs.best_params_,'\n')
print("Classification report on Test set\n")

y_true, y_pred = y_test, gs.predict(X_test)

accuracy = round(accuracy_score(y_true, y_pred),3)
recall = round(recall_score(y_true, y_pred),3)
precision = round(precision_score(y_true, y_pred),3)
roc_auc = round(roc_auc_score(y_true, y_pred),3)

print('Accuracy: ',accuracy)
print('Recall: ',recall)
print('Precision: ',precision)
print('ROC AUC: ',roc_auc)



Best parameters set found : {'criterion': 'entropy', 'max_depth': 4} 

Classification report on Test set

Accuracy:  0.912
Recall:  0.931
Precision:  0.871
ROC AUC:  0.915


## 2) Logistic Regression

In [11]:
params = {'C':np.logspace(-3,3,50),'penalty':['l1','l2','elasticnet']}

clf = LogisticRegression(solver='liblinear',random_state=12)

gs = GridSearchCV(clf, params, scoring='roc_auc',cv=cv,n_jobs=-1)

gs=gs.fit(X_train,y_train)

In [12]:
print("Best parameters set found :",gs.best_params_,'\n')
print("Classification report on Test set\n")

y_true, y_pred = y_test, gs.predict(X_test)

accuracy = round(accuracy_score(y_true, y_pred),3)
recall = round(recall_score(y_true, y_pred),3)
precision = round(precision_score(y_true, y_pred),3)
roc_auc = round(roc_auc_score(y_true, y_pred),3)

print('Accuracy: ',accuracy)
print('Recall: ',recall)
print('Precision: ',precision)
print('ROC AUC: ',roc_auc)



Best parameters set found : {'C': 0.21209508879201905, 'penalty': 'l1'} 

Classification report on Test set

Accuracy:  0.825
Recall:  0.655
Precision:  0.905
ROC AUC:  0.802


## 3) Support Vector Machines

In [13]:
params = {'C':np.logspace(-3,3,50),'kernel':['linear','rbf']}

clf = SVC(random_state=12)

gs = GridSearchCV(clf, params, scoring='roc_auc',cv=cv,n_jobs=-1)

gs=gs.fit(X_train,y_train)

In [14]:
print("Best parameters set found :",gs.best_params_,'\n')
print("Classification report on Test set\n")

y_true, y_pred = y_test, gs.predict(X_test)

accuracy = round(accuracy_score(y_true, y_pred),3)
recall = round(recall_score(y_true, y_pred),3)
precision = round(precision_score(y_true, y_pred),3)
roc_auc = round(roc_auc_score(y_true, y_pred),3)

print('Accuracy: ',accuracy)
print('Recall: ',recall)
print('Precision: ',precision)
print('ROC AUC: ',roc_auc)



Best parameters set found : {'C': 8.286427728546842, 'kernel': 'rbf'} 

Classification report on Test set

Accuracy:  0.927
Recall:  0.931
Precision:  0.9
ROC AUC:  0.928


## 4) Random Forest

In [15]:
params = {'n_estimators':np.linspace(10,150,dtype='int',num=10),'max_depth':np.linspace(2,5,dtype='int'),'criterion':['gini','entropy']}

clf = RandomForestClassifier(n_jobs=-1,random_state=12)

gs = GridSearchCV(clf, params, scoring='roc_auc',cv=cv,verbose=10,n_jobs=-1)

gs=gs.fit(X_train,y_train)

Fitting 4 folds for each of 1000 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1401s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0004s.) Setting batch_size=4.


ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.

In [None]:
print("Best parameters set found :",gs.best_params_,'\n')
print("Classification report on Test set\n")

y_true, y_pred = y_test, gs.predict(X_test)

accuracy = round(accuracy_score(y_true, y_pred),3)
recall = round(recall_score(y_true, y_pred),3)
precision = round(precision_score(y_true, y_pred),3)
roc_auc = round(roc_auc_score(y_true, y_pred),3)

print('Accuracy: ',accuracy)
print('Recall: ',recall)
print('Precision: ',precision)
print('ROC AUC: ',roc_auc)

## 5) Neural Net

In [16]:
X_train.shape

(411, 10)

In [17]:
import torch
print(torch.cuda.is_available())


ModuleNotFoundError: No module named 'torch'

In [18]:
from tensorflow import keras

model = keras.Sequential(
    [
        keras.layers.Dense(
            20, activation="relu", input_shape=(X_train.shape[1],)
        ),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(50, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                220       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               2100      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)      

In [19]:
metrics = [
    'accuracy',
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")]


history=model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=200,
    callbacks=callbacks,
    validation_data=(X_test, y_test),

)

Train on 411 samples, validate on 137 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch

In [20]:
# list all data in history
print(history.history.keys())
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


dict_keys(['loss', 'acc', 'fn', 'fp', 'tn', 'tp', 'precision', 'recall', 'val_loss', 'val_acc', 'val_fn', 'val_fp', 'val_tn', 'val_tp', 'val_precision', 'val_recall'])


KeyError: 'accuracy'

In [21]:
y_true, y_pred = y_test, model.predict_classes(X_test)

accuracy = round(accuracy_score(y_true, y_pred),3)
recall = round(recall_score(y_true, y_pred),3)
precision = round(precision_score(y_true, y_pred),3)
roc_auc = round(roc_auc_score(y_true, y_pred),3)

print('Accuracy: ',accuracy)
print('Recall: ',recall)
print('Precision: ',precision)
print('ROC AUC: ',roc_auc)

Accuracy:  0.956
Recall:  0.931
Precision:  0.964
ROC AUC:  0.953
