In [5]:
# Imports
from data_utils import prepare_data, prep_pca_data
from ml_utils import *

## Reading data

Use `prepare_data` to read, clean up, and normalize the data. After processing, there are 455 training samples and 114 test samples.

In [2]:
# Relative paths to the training and testing CSV files.
TRAIN_DATA_LOC = './data/training_data.csv'
TEST_DATA_LOC = './data/testing_data.csv'

# prepare_data hands back four objects: features and labels for each file.
# The `orig_` prefix marks the feature matrices that later cells reuse as the
# baseline input.
(
    orig_train_features,
    train_labels,
    orig_test_features,
    test_labels,
) = prepare_data(TRAIN_DATA_LOC, TEST_DATA_LOC)

# Sanity check: show the dimensions of both feature matrices.
print(orig_train_features.shape, orig_test_features.shape)

(455, 30) (114, 30)


## Training ML models with original 30 features

In [3]:
# Baseline run: feed the unmodified feature matrices to every classifier.
train_features = orig_train_features
test_features = orig_test_features

# All classify_* helpers take the same four positional arguments, so build
# the argument tuple once and reuse it.
split_args = (train_features, train_labels, test_features, test_labels)

# Each helper returns a (train metrics, test metrics) pair.
lr_train_metrics, lr_test_metrics = classify_logistic_reg(*split_args)
dt_train_metrics, dt_test_metrics = classify_decision_tree(*split_args)
rf_train_metrics, rf_test_metrics = classify_random_forest(*split_args)
svm_train_metrics, svm_test_metrics = classify_svm(*split_args)
knn_train_metrics, knn_test_metrics = classify_knn(*split_args)

# (heading, train metrics, test metrics) per model, in report order.
# Every heading after the first carries a leading newline to separate sections.
report_sections = [
    ('Logistic Regression:', lr_train_metrics, lr_test_metrics),
    ('\nDecision Tree:', dt_train_metrics, dt_test_metrics),
    ('\nRandom Forest:', rf_train_metrics, rf_test_metrics),
    ('\nSVM:', svm_train_metrics, svm_test_metrics),
    ('\nKNN:', knn_train_metrics, knn_test_metrics),
]

# Positions 0, 1 and 2 of each metrics object are reported as Acc, P and R.
for heading, train_m, test_m in report_sections:
    print(heading)
    print(f'Train Metrics: Acc={train_m[0]}, P={train_m[1]}, R={train_m[2]}')
    print(f'Test Metrics: Acc={test_m[0]}, P={test_m[1]}, R={test_m[2]}')

Logistic Regression:
Train Metrics: Acc=0.9736263736263736, P=0.9820359281437125, R=0.9479768786127167
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718

Decision Tree:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9210526315789473, P=0.9166666666666666, R=0.8461538461538461

Random Forest:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9385964912280702, P=0.9210526315789473, R=0.8974358974358975

SVM:
Train Metrics: Acc=0.9846153846153847, P=0.9825581395348837, R=0.976878612716763
Test Metrics: Acc=0.9736842105263158, P=1.0, R=0.9230769230769231

KNN:
Train Metrics: Acc=0.9846153846153847, P=0.9940476190476191, R=0.9653179190751445
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975


## Prepare PCA data

In [6]:
# Always build the PCA representation from the untouched 30-feature matrices.
# `train_features` / `test_features` are rebound to PCA slices by the cells
# further down, so passing them here would feed already-projected (and
# truncated) data back into prep_pca_data whenever this cell is re-run after
# one of those cells. Using the `orig_*` matrices keeps this cell idempotent.
pca_train_features, pca_test_features = prep_pca_data(orig_train_features, orig_test_features)


## Training ML models with PCA - Top 5 features

In [7]:
# Keep only the first 5 columns of the PCA-transformed matrices.
train_features = pca_train_features[:, :5]
test_features = pca_test_features[:, :5]

# All classify_* helpers take the same four positional arguments, so build
# the argument tuple once and reuse it.
split_args = (train_features, train_labels, test_features, test_labels)

# Each helper returns a (train metrics, test metrics) pair.
lr_train_metrics, lr_test_metrics = classify_logistic_reg(*split_args)
dt_train_metrics, dt_test_metrics = classify_decision_tree(*split_args)
rf_train_metrics, rf_test_metrics = classify_random_forest(*split_args)
svm_train_metrics, svm_test_metrics = classify_svm(*split_args)
knn_train_metrics, knn_test_metrics = classify_knn(*split_args)

# (heading, train metrics, test metrics) per model, in report order.
# Every heading after the first carries a leading newline to separate sections.
report_sections = [
    ('Logistic Regression:', lr_train_metrics, lr_test_metrics),
    ('\nDecision Tree:', dt_train_metrics, dt_test_metrics),
    ('\nRandom Forest:', rf_train_metrics, rf_test_metrics),
    ('\nSVM:', svm_train_metrics, svm_test_metrics),
    ('\nKNN:', knn_train_metrics, knn_test_metrics),
]

# Positions 0, 1 and 2 of each metrics object are reported as Acc, P and R.
for heading, train_m, test_m in report_sections:
    print(heading)
    print(f'Train Metrics: Acc={train_m[0]}, P={train_m[1]}, R={train_m[2]}')
    print(f'Test Metrics: Acc={test_m[0]}, P={test_m[1]}, R={test_m[2]}')

Logistic Regression:
Train Metrics: Acc=0.9692307692307692, P=0.9818181818181818, R=0.9364161849710982
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718

Decision Tree:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.8947368421052632, P=0.813953488372093, R=0.8974358974358975

Random Forest:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9385964912280702, P=0.9210526315789473, R=0.8974358974358975

SVM:
Train Metrics: Acc=0.9802197802197802, P=0.9767441860465116, R=0.9710982658959537
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975

KNN:
Train Metrics: Acc=0.9824175824175824, P=0.9881656804733728, R=0.9653179190751445
Test Metrics: Acc=0.9473684210526315, P=0.9714285714285714, R=0.8717948717948718


## Training ML models with PCA - Top 10 features

In [8]:
# Keep only the first 10 columns of the PCA-transformed matrices.
train_features = pca_train_features[:, :10]
test_features = pca_test_features[:, :10]

# All classify_* helpers take the same four positional arguments, so build
# the argument tuple once and reuse it.
split_args = (train_features, train_labels, test_features, test_labels)

# Each helper returns a (train metrics, test metrics) pair.
lr_train_metrics, lr_test_metrics = classify_logistic_reg(*split_args)
dt_train_metrics, dt_test_metrics = classify_decision_tree(*split_args)
rf_train_metrics, rf_test_metrics = classify_random_forest(*split_args)
svm_train_metrics, svm_test_metrics = classify_svm(*split_args)
knn_train_metrics, knn_test_metrics = classify_knn(*split_args)

# (heading, train metrics, test metrics) per model, in report order.
# Every heading after the first carries a leading newline to separate sections.
report_sections = [
    ('Logistic Regression:', lr_train_metrics, lr_test_metrics),
    ('\nDecision Tree:', dt_train_metrics, dt_test_metrics),
    ('\nRandom Forest:', rf_train_metrics, rf_test_metrics),
    ('\nSVM:', svm_train_metrics, svm_test_metrics),
    ('\nKNN:', knn_train_metrics, knn_test_metrics),
]

# Positions 0, 1 and 2 of each metrics object are reported as Acc, P and R.
for heading, train_m, test_m in report_sections:
    print(heading)
    print(f'Train Metrics: Acc={train_m[0]}, P={train_m[1]}, R={train_m[2]}')
    print(f'Test Metrics: Acc={test_m[0]}, P={test_m[1]}, R={test_m[2]}')

Logistic Regression:
Train Metrics: Acc=0.9714285714285714, P=0.9878048780487805, R=0.9364161849710982
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718

Decision Tree:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.8947368421052632, P=0.813953488372093, R=0.8974358974358975

Random Forest:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9210526315789473, P=0.8947368421052632, R=0.8717948717948718

SVM:
Train Metrics: Acc=0.9846153846153847, P=0.9825581395348837, R=0.976878612716763
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975

KNN:
Train Metrics: Acc=0.9868131868131869, P=0.9940828402366864, R=0.9710982658959537
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718


## Training ML models with PCA - Top 15 features

In [9]:
# Keep only the first 15 columns of the PCA-transformed matrices.
train_features = pca_train_features[:, :15]
test_features = pca_test_features[:, :15]

# All classify_* helpers take the same four positional arguments, so build
# the argument tuple once and reuse it.
split_args = (train_features, train_labels, test_features, test_labels)

# Each helper returns a (train metrics, test metrics) pair.
lr_train_metrics, lr_test_metrics = classify_logistic_reg(*split_args)
dt_train_metrics, dt_test_metrics = classify_decision_tree(*split_args)
rf_train_metrics, rf_test_metrics = classify_random_forest(*split_args)
svm_train_metrics, svm_test_metrics = classify_svm(*split_args)
knn_train_metrics, knn_test_metrics = classify_knn(*split_args)

# (heading, train metrics, test metrics) per model, in report order.
# Every heading after the first carries a leading newline to separate sections.
report_sections = [
    ('Logistic Regression:', lr_train_metrics, lr_test_metrics),
    ('\nDecision Tree:', dt_train_metrics, dt_test_metrics),
    ('\nRandom Forest:', rf_train_metrics, rf_test_metrics),
    ('\nSVM:', svm_train_metrics, svm_test_metrics),
    ('\nKNN:', knn_train_metrics, knn_test_metrics),
]

# Positions 0, 1 and 2 of each metrics object are reported as Acc, P and R.
for heading, train_m, test_m in report_sections:
    print(heading)
    print(f'Train Metrics: Acc={train_m[0]}, P={train_m[1]}, R={train_m[2]}')
    print(f'Test Metrics: Acc={test_m[0]}, P={test_m[1]}, R={test_m[2]}')

Logistic Regression:
Train Metrics: Acc=0.9714285714285714, P=0.9819277108433735, R=0.9421965317919075
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718

Decision Tree:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.8859649122807017, P=0.8095238095238095, R=0.8717948717948718

Random Forest:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9035087719298246, P=0.8333333333333334, R=0.8974358974358975

SVM:
Train Metrics: Acc=0.9868131868131869, P=0.9883040935672515, R=0.976878612716763
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975

KNN:
Train Metrics: Acc=0.9846153846153847, P=0.9940476190476191, R=0.9653179190751445
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975


## Training ML models with PCA - Top 20 features

In [13]:
# Keep only the first 20 columns of the PCA-transformed matrices.
train_features = pca_train_features[:, :20]
test_features = pca_test_features[:, :20]

# All classify_* helpers take the same four positional arguments, so build
# the argument tuple once and reuse it.
split_args = (train_features, train_labels, test_features, test_labels)

# Each helper returns a (train metrics, test metrics) pair.
lr_train_metrics, lr_test_metrics = classify_logistic_reg(*split_args)
dt_train_metrics, dt_test_metrics = classify_decision_tree(*split_args)
rf_train_metrics, rf_test_metrics = classify_random_forest(*split_args)
svm_train_metrics, svm_test_metrics = classify_svm(*split_args)
knn_train_metrics, knn_test_metrics = classify_knn(*split_args)

# (heading, train metrics, test metrics) per model, in report order.
# Every heading after the first carries a leading newline to separate sections.
report_sections = [
    ('Logistic Regression:', lr_train_metrics, lr_test_metrics),
    ('\nDecision Tree:', dt_train_metrics, dt_test_metrics),
    ('\nRandom Forest:', rf_train_metrics, rf_test_metrics),
    ('\nSVM:', svm_train_metrics, svm_test_metrics),
    ('\nKNN:', knn_train_metrics, knn_test_metrics),
]

# Positions 0, 1 and 2 of each metrics object are reported as Acc, P and R.
for heading, train_m, test_m in report_sections:
    print(heading)
    print(f'Train Metrics: Acc={train_m[0]}, P={train_m[1]}, R={train_m[2]}')
    print(f'Test Metrics: Acc={test_m[0]}, P={test_m[1]}, R={test_m[2]}')

Logistic Regression:
Train Metrics: Acc=0.9736263736263736, P=0.9820359281437125, R=0.9479768786127167
Test Metrics: Acc=0.956140350877193, P=1.0, R=0.8717948717948718

Decision Tree:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9035087719298246, P=0.8333333333333334, R=0.8974358974358975

Random Forest:
Train Metrics: Acc=1.0, P=1.0, R=1.0
Test Metrics: Acc=0.9035087719298246, P=0.8333333333333334, R=0.8974358974358975

SVM:
Train Metrics: Acc=0.9868131868131869, P=0.9883040935672515, R=0.976878612716763
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975

KNN:
Train Metrics: Acc=0.9846153846153847, P=0.9940476190476191, R=0.9653179190751445
Test Metrics: Acc=0.9649122807017544, P=1.0, R=0.8974358974358975
