In [1]:
import hnswlib
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
import sweetviz as sv
import xgboost as xgb
import category_encoders as ce
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import FastICA
from sklearn.naive_bayes import GaussianNB
from pandas_profiling import ProfileReport
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# Never truncate DataFrame output: show every column and every row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [3]:
# Load the modelling table (cleaned / encoded / SMOTE-resampled, going by the file name).
SMOTE_CSV = "heart_cleaned_encoded_smote.csv"
heart_dis_pred = pd.read_csv(SMOTE_CSV)

In [4]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for testing with a fixed seed so the split is reproducible.
# NOTE(review): the input file name suggests SMOTE was applied before this
# split, in which case synthetic rows can end up in the test set -- confirm.
TARGET_COLUMN = "HeartDisease_No"
target = heart_dis_pred[TARGET_COLUMN]
predictors = heart_dis_pred.drop(columns=TARGET_COLUMN)

X_train, X_test, Y_train, Y_test = train_test_split(
    predictors,
    target,
    test_size=0.20,
    random_state=42,
)

In [5]:
# Sanity-check the split sizes: train features, train target, test features, test target.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

(374300, 25)
(374300,)
(93576, 25)
(93576,)


# ANN (approximate nearest neighbours, HNSW) with Scaling + SMOTE

In [6]:
# Put the target next to its features (target becomes the last column) and
# convert both splits to plain NumPy arrays.
train = np.array(pd.concat([X_train, Y_train], axis="columns"))
data = pd.concat([X_test, Y_test], axis="columns")
test = np.array(data)
# Width of one row of `train`, i.e. feature columns plus the target column.
dim = len(train[0])

In [7]:
%%time
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=len(train), ef_construction=10000, M=dim, random_seed = 100)
p.set_ef(1000)
p.add_items(train,train[:,25])

# prediction for train and evaluate
labels, distances = p.knn_query(train[:,:25], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == train[:,25]), "\n")

# prediction for test and evaluate
labels, distances = p.knn_query(np.array(test)[:,:25], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.array(test)[:,25]), "\n")

[0 1 0 ... 1 1 1]
Recall for two batches: 0.9490488912636922 

[0 1 0 ... 1 0 0]
Recall for two batches: 0.9501795332136446 

Wall time: 5.24 s


In [8]:
# Persist the fitted index. The context manager guarantees the file is
# flushed and closed; the bare open(...) passed to dump was never closed.
with open('model_ann.pkl', 'wb') as model_file:
    pkl.dump(p, model_file)

# ANN (approximate nearest neighbours, HNSW) with Scaling + SMOTE + PCA

In [9]:
# Read the PCA-transformed train/test tables straight into NumPy arrays.
train = np.array(pd.read_csv("heart_cleaned_encoded_trained_pca.csv"))
test = np.array(pd.read_csv("heart_cleaned_encoded_tested_pca.csv"))
# Width of one row of `train` (every column in the file).
dim = len(train[0])

In [10]:
%%time
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=len(train), ef_construction=10000, M=dim, random_seed = 100)
p.set_ef(1000)
p.add_items(train,train[:,21])

# prediction for train and evaluate
labels, distances = p.knn_query(train[:,:21], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == train[:,21]), "\n")

# prediction for test and evaluate
labels, distances = p.knn_query(np.array(test)[:,:21], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.array(test)[:,21]), "\n")

[1 1 1 ... 0 0 0]
Recall for two batches: 0.7267117783344305 

[1 1 1 ... 1 1 1]
Recall for two batches: 0.9143982864022264 

Wall time: 6.01 s


In [11]:
# Persist the fitted index. The context manager guarantees the file is
# flushed and closed; the bare open(...) passed to dump was never closed.
with open('model_ann_pca.pkl', 'wb') as model_file:
    pkl.dump(p, model_file)

# ANN (approximate nearest neighbours, HNSW) with Scaling + SMOTE + ICA

In [12]:
# Read the ICA-transformed train/test tables, keeping the DataFrames around
# and exposing NumPy copies for the nearest-neighbour index.
heart_dis_train_pred = pd.read_csv("heart_cleaned_encoded_trained_ica.csv")
heart_dis_test_pred = pd.read_csv("heart_cleaned_encoded_tested_ica.csv")
train, test = (np.array(frame) for frame in (heart_dis_train_pred, heart_dis_test_pred))
# Width of one row of `train` (every column in the file).
dim = len(train[0])

In [13]:
%%time
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=len(train), ef_construction=10000, M=dim, random_seed = 100)
p.set_ef(1000)
p.add_items(train,train[:,21])

# prediction for train and evaluate
labels, distances = p.knn_query(train[:,:21], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == train[:,21]), "\n")

# prediction for test and evaluate
labels, distances = p.knn_query(np.array(test)[:,:21], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.array(test)[:,21]), "\n")

[1 1 1 ... 0 0 0]
Recall for two batches: 0.7266433841445169 

[1 1 1 ... 1 1 1]
Recall for two batches: 0.9143982864022264 

Wall time: 5.99 s


In [14]:
# Persist the fitted index. The context manager guarantees the file is
# flushed and closed; the bare open(...) passed to dump was never closed.
with open('model_ann_ica.pkl', 'wb') as model_file:
    pkl.dump(p, model_file)

# ANN (approximate nearest neighbours, HNSW) with Scaling + SMOTE + LDA

In [15]:
# Read the LDA-transformed train/test tables, keeping the DataFrames around
# and exposing NumPy copies for the nearest-neighbour index.
heart_dis_train_pred = pd.read_csv("heart_cleaned_encoded_trained_lda.csv")
heart_dis_test_pred = pd.read_csv("heart_cleaned_encoded_tested_lda.csv")
train, test = (np.array(frame) for frame in (heart_dis_train_pred, heart_dis_test_pred))
# Width of one row of `train` (every column in the file).
dim = len(train[0])

In [16]:
%%time
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=len(train), ef_construction=10000, M=dim, random_seed = 100)
p.set_ef(1000)
p.add_items(train,train[:,2])

# prediction for train and evaluate
labels, distances = p.knn_query(train[:,:2], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == train[:,2]), "\n")

# prediction for test and evaluate
labels, distances = p.knn_query(np.array(test)[:,:2], k=1)
print(labels.reshape(-1))
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.array(test)[:,2]), "\n")

[1 1 1 ... 0 0 0]
Recall for two batches: 0.7267737605690396 

[1 1 1 ... 1 1 0]
Recall for two batches: 0.9143826513860441 

Wall time: 14.8 s


In [17]:
# Persist the fitted index. The context manager guarantees the file is
# flushed and closed; the bare open(...) passed to dump was never closed.
with open('model_ann_lda.pkl', 'wb') as model_file:
    pkl.dump(p, model_file)