In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import nltk 
import re 
import string
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tag.stanford import StanfordNERTagger
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import seaborn as sns
import datetime
# importing neural network libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential ,Model, load_model
from tensorflow.keras.layers import Embedding, Dense, Dropout, Embedding, LSTM, Input, Bidirectional, Conv1D, MaxPooling1D, AveragePooling1D, GlobalAveragePooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
data = pd.read_csv('heart_dataset.csv')
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,condition
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
data.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'condition'],
      dtype='object')

In [4]:
data=data.dropna()

In [5]:
num_class = len(np.unique(data.condition.values))

In [6]:
feature_data = np.asarray(data[['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose']])

In [7]:
feature_data

array([[  1.  ,  39.  ,   4.  , ...,  26.97,  80.  ,  77.  ],
       [  0.  ,  46.  ,   2.  , ...,  28.73,  95.  ,  76.  ],
       [  1.  ,  48.  ,   1.  , ...,  25.34,  75.  ,  70.  ],
       ...,
       [  1.  ,  50.  ,   1.  , ...,  25.97,  66.  ,  86.  ],
       [  1.  ,  51.  ,   3.  , ...,  19.71,  65.  ,  68.  ],
       [  0.  ,  52.  ,   2.  , ...,  21.47,  80.  , 107.  ]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(feature_data, data['condition'], test_size=0.1,random_state=42)

In [9]:
MAX_LENGTH = 500
tokenizer = Tokenizer()
#tokenizer.fit_on_texts(X_train)

In [10]:
#X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH)
X_train

array([[  0,   0,   0, ...,  27,  75,  70],
       [  0,   0,   0, ...,  25,  75,  83],
       [  0,   0,   0, ...,  35,  73,  75],
       ...,
       [  0,   0,   0, ...,  28,  48,  76],
       [  0,   0,   0, ...,  27,  67, 104],
       [  0,   0,   0, ...,  30,  60,  69]])

In [11]:
X_train.shape

(3290, 500)

In [12]:
#X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH)
X_test

array([[  0,   0,   0, ...,  20,  80,  76],
       [  0,   0,   0, ...,  20,  67,  83],
       [  0,   0,   0, ...,  19,  70, 111],
       ...,
       [  0,   0,   0, ...,  28,  70,  78],
       [  0,   0,   0, ...,  21,  75,  73],
       [  0,   0,   0, ...,  39,  85,  90]])

In [13]:
y_train = to_categorical(y_train, num_classes=num_class)
y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [14]:
y_train.shape

(3290, 2)

In [15]:
y_test = to_categorical(y_test, num_classes=2)
y_test.shape

(366, 2)

In [16]:
vocab_size = len(tokenizer.word_index) + 1

In [None]:
vocab_size

In [None]:
#X_train.reshape((3290, 500, 1))

In [17]:
sequence_inputs = Input(shape=(MAX_LENGTH,))
# embedding_layer = Embedding(vocab_size,
#                             128,
#                             input_length=MAX_LENGTH, trainable=False)(sequence_inputs) 

x = Dense(32, activation='relu')(sequence_inputs)
x = Dense(16, activation='relu')(x)
x = Dropout(0.2)(x)

predictions = Dense(num_class, activation='softmax')(x)
model = Model(inputs=[sequence_inputs], outputs=predictions)
model.compile(optimizer='adamax',
              loss='binary_crossentropy',
              metrics=['acc'])

print(model.summary())

filepath = 'model2.hdf5'
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=10, shuffle=True, callbacks=[checkpointer])

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 500)]             0         
_________________________________________________________________
dense (Dense)                (None, 32)                16032     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 16,594
Trainable params: 16,594
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
 1/47 [..............................] - ETA: 0s - loss: 2.0821 - acc: 0.7969
Epoch 00001: val_acc i

<tensorflow.python.keras.callbacks.History at 0x2cfdb836ca0>

In [18]:
model = load_model('model2.hdf5')

In [None]:
test_data = pd.read_csv('test_data.csv')

In [None]:
test_data=test_data.dropna()

In [None]:
test_feature_data = np.asarray(test_data[['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose']])

In [None]:
X_test = pad_sequences(test_feature_data, maxlen=MAX_LENGTH)
X_test

In [None]:
y_test = to_categorical(test_data['condition'], num_classes=num_class)
y_test

In [19]:
preds = model.predict(X_test)

In [20]:
preds.shape

(366, 2)

In [21]:
preds = np.argmax(preds, axis=1)

In [22]:
y_test = np.argmax(y_test, axis=1)

In [23]:
print(metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       310
           1       0.00      0.00      0.00        56

    accuracy                           0.85       366
   macro avg       0.42      0.50      0.46       366
weighted avg       0.72      0.85      0.78       366



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
preds