In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
train_data = pd.read_csv('data//train.csv')
train_data.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
training_data = train_data.drop(['id', 'target'], axis=1)
training_data_results = train_data['target']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(training_data, training_data_results, test_size=0.2, random_state=42)

In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
print(X_train_scaled.head())
print(X_test_scaled.head())

    gravity        ph      osmo      cond      urea      calc
0 -0.932744 -0.271945 -0.866679 -0.563507 -0.646711 -0.844550
1 -0.173427 -0.364694  0.196437  0.387993 -0.087629 -0.190738
2  1.041479  1.474843  1.259554  0.910648  0.844175 -0.450993
3 -1.540197  0.052680  0.524560 -0.509901  1.015627  0.929630
4  1.952659 -1.091234  0.192062  0.267380  0.613087  2.735547
    gravity        ph      osmo      cond      urea      calc
0  1.041479  0.129971  1.298928  0.803437  1.425620  1.691352
1  0.434026 -1.323108  0.940181  0.991057  0.747267  0.929630
2  0.434026 -1.137609  0.288312 -0.027451  1.201987 -0.168521
3 -0.173427  0.949261 -0.437933 -0.777930  0.262729  0.418641
4 -1.084607  1.304801 -1.291051 -0.818134 -0.915071 -0.450993


# Logistic Regression

In [6]:
LRModel = LogisticRegression(random_state = 42)
LRModel.fit(X_train_scaled, y_train)

In [7]:
y_predLR = LRModel.predict(X_test_scaled)
y_predLR

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], dtype=int64)

In [8]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)

    if train:
        res = clf.predict(X_train)
        print('Train Results\n')
        print('Accuracy Score: {0:.4f} \n'.format(accuracy_score(y_train, res)))
        print('Classification Report: \n{}\n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n{}\n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f} \n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring = 'accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Accuracy SD: \t\t {0:.4f}'.format(np.std(res)))

     
    elif train == False:
        res_test = clf.predict(X_test)
        print('Test Results\n')
        print('Accuracy Score: {0:.4f} \n'.format(accuracy_score(y_test, res_test)))
        print('Classification Report: \n{}\n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n{}\n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f} \n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))   

In [9]:
print_score(LRModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(LRModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 0.7281 

Classification Report: 
              precision    recall  f1-score   support

           0       0.72      0.85      0.78       185
           1       0.75      0.58      0.65       146

    accuracy                           0.73       331
   macro avg       0.73      0.71      0.71       331
weighted avg       0.73      0.73      0.72       331


Confusion Matrix: 
[[157  28]
 [ 62  84]]

ROC AUC: 0.7120 

Average Accuracy: 	 0.7129
Accuracy SD: 		 0.0696

 *************************************************************************

Test Results

Accuracy Score: 0.7831 

Classification Report: 
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        45
           1       0.79      0.71      0.75        38

    accuracy                           0.78        83
   macro avg       0.78      0.78      0.78        83
weighted avg       0.78      0.78      0.78        83


Confusion Matrix: 
[[38  7]
 

# Decision Tree

In [10]:
DTModel = DecisionTreeClassifier(random_state = 42)
DTModel.fit(X_train_scaled, y_train)

In [11]:
y_predDTC = DTModel.predict(X_test_scaled)
y_predDTC

array([1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1], dtype=int64)

In [12]:
print_score(DTModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(DTModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 1.0000 

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       185
           1       1.00      1.00      1.00       146

    accuracy                           1.00       331
   macro avg       1.00      1.00      1.00       331
weighted avg       1.00      1.00      1.00       331


Confusion Matrix: 
[[185   0]
 [  0 146]]

ROC AUC: 1.0000 

Average Accuracy: 	 0.6737
Accuracy SD: 		 0.0445

 *************************************************************************

Test Results

Accuracy Score: 0.6386 

Classification Report: 
              precision    recall  f1-score   support

           0       0.66      0.69      0.67        45
           1       0.61      0.58      0.59        38

    accuracy                           0.64        83
   macro avg       0.64      0.63      0.63        83
weighted avg       0.64      0.64      0.64        83


Confusion Matrix: 
[[31 14]
 

# Random Forest Classifier

In [13]:
RFModel = RandomForestClassifier()
RFModel.fit(X_train_scaled, y_train)

In [14]:
y_predRFC = RFModel.predict(X_test_scaled)
y_predRFC

array([1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1], dtype=int64)

In [15]:
print_score(RFModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(RFModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 1.0000 

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       185
           1       1.00      1.00      1.00       146

    accuracy                           1.00       331
   macro avg       1.00      1.00      1.00       331
weighted avg       1.00      1.00      1.00       331


Confusion Matrix: 
[[185   0]
 [  0 146]]

ROC AUC: 1.0000 

Average Accuracy: 	 0.7310
Accuracy SD: 		 0.0687

 *************************************************************************

Test Results

Accuracy Score: 0.7229 

Classification Report: 
              precision    recall  f1-score   support

           0       0.72      0.80      0.76        45
           1       0.73      0.63      0.68        38

    accuracy                           0.72        83
   macro avg       0.72      0.72      0.72        83
weighted avg       0.72      0.72      0.72        83


Confusion Matrix: 
[[36  9]
 

# SVM

In [16]:
SVMModel = SVC(kernel = 'linear', random_state = 42)
SVMModel.fit(X_train_scaled, y_train)

In [17]:
y_predSVM = SVMModel.predict(X_test_scaled)
y_predSVM

array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], dtype=int64)

In [18]:
print_score(SVMModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(SVMModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 0.7311 

Classification Report: 
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       185
           1       0.79      0.53      0.63       146

    accuracy                           0.73       331
   macro avg       0.75      0.71      0.71       331
weighted avg       0.74      0.73      0.72       331


Confusion Matrix: 
[[165  20]
 [ 69  77]]

ROC AUC: 0.7096 

Average Accuracy: 	 0.7189
Accuracy SD: 		 0.0683

 *************************************************************************

Test Results

Accuracy Score: 0.7711 

Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.82      0.80        45
           1       0.77      0.71      0.74        38

    accuracy                           0.77        83
   macro avg       0.77      0.77      0.77        83
weighted avg       0.77      0.77      0.77        83


Confusion Matrix: 
[[37  8]
 

# K Nearest Neighbor

In [19]:
KNNModel = KNeighborsClassifier(n_neighbors = 4)
KNNModel.fit(X_train_scaled, y_train)

In [20]:
y_predKNN = KNNModel.predict(X_train_scaled)
y_predKNN

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [21]:
print_score(KNNModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(KNNModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 0.7734 

Classification Report: 
              precision    recall  f1-score   support

           0       0.74      0.91      0.82       185
           1       0.84      0.60      0.70       146

    accuracy                           0.77       331
   macro avg       0.79      0.75      0.76       331
weighted avg       0.79      0.77      0.77       331


Confusion Matrix: 
[[169  16]
 [ 59  87]]

ROC AUC: 0.7547 

Average Accuracy: 	 0.7040
Accuracy SD: 		 0.0697

 *************************************************************************

Test Results

Accuracy Score: 0.7229 

Classification Report: 
              precision    recall  f1-score   support

           0       0.70      0.84      0.77        45
           1       0.76      0.58      0.66        38

    accuracy                           0.72        83
   macro avg       0.73      0.71      0.71        83
weighted avg       0.73      0.72      0.72        83


Confusion Matrix: 
[[38  7]
 

# XGBoost

In [56]:
XGBModel = XGBClassifier(max_depth = 1, min_child_weight = 18, eval_metric = 'rmse', learning_rate = 0.03, max_leaves = 2, n_estimators = 10, n_jobs = 1, random_state = 42)
XGBModel.fit(X_train_scaled, y_train)

In [57]:
y_predXGB = XGBModel.predict(X_test_scaled)
y_predXGB

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1])

In [58]:
print_score(XGBModel, X_train_scaled, X_test_scaled, y_train, y_test, train = True)
print('\n *************************************************************************\n')
print_score(XGBModel, X_train_scaled, X_test_scaled, y_train, y_test, train = False)

Train Results

Accuracy Score: 0.7402 

Classification Report: 
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       185
           1       0.73      0.64      0.69       146

    accuracy                           0.74       331
   macro avg       0.74      0.73      0.73       331
weighted avg       0.74      0.74      0.74       331


Confusion Matrix: 
[[151  34]
 [ 52  94]]

ROC AUC: 0.7300 

Average Accuracy: 	 0.7159
Accuracy SD: 		 0.0658

 *************************************************************************

Test Results

Accuracy Score: 0.7952 

Classification Report: 
              precision    recall  f1-score   support

           0       0.80      0.82      0.81        45
           1       0.78      0.76      0.77        38

    accuracy                           0.80        83
   macro avg       0.79      0.79      0.79        83
weighted avg       0.79      0.80      0.79        83


Confusion Matrix: 
[[37  8]
 

# Neural Network Classifier

In [60]:
NNCModel = Sequential()
NNCModel.add(Dense(64, input_dim = X_train_scaled.shape[1], activation = 'relu'))
NNCModel.add(Dense(32, activation = 'relu'))
NNCModel.add(Dense(1, activation = 'sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [63]:
NNCModel.compile(optimizer = 'Nadam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [64]:
history = NNCModel.fit(X_train_scaled, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.4533 - loss: 0.7550
Epoch 2/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5174 - loss: 0.6914 
Epoch 3/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6279 - loss: 0.6696 
Epoch 4/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7181 - loss: 0.6309 
Epoch 5/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7173 - loss: 0.6166 
Epoch 6/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7446 - loss: 0.6024 
Epoch 7/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6951 - loss: 0.6112 
Epoch 8/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7134 - loss: 0.5975 
Epoch 9/10
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━

In [65]:
y_pred_prob = NNCModel.predict(X_test_scaled)
y_predNNC = (y_pred_prob > 0.5).astype(int).flatten()
y_predNNC

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1])