### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras import Model, Input, optimizers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, GRU, concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

pd.options.mode.chained_assignment = None

### Load data

In [None]:
# Read the extracted dataset from a CSV file located in the working directory.
df = pd.read_csv(r'data_extracted.csv')
# Show the first rows as the cell output for a quick sanity check.
df.head()

### Text and Numeric data

In [None]:
# Columns of `df` that are fed to the models as the numeric input.
numeric_columns = [
    "NUMBER_OF_PROCEDURES",
    "NUMBER_OF_DIAGNOSIS",
    "ICU_LOS",
    "MARRIED",
    "DIVORCED",
    "SINGLE",
    "SEPARATED",
    "WIDOWED",
    "UNKNOWN (DEFAULT)",
    "LIFE PARTNER",
]

# Raw text as a NumPy array; numeric features stay a DataFrame so that the
# column names remain available later on.
text_input = df["TEXT"].values
numeric_input = df[numeric_columns]

# Peek at the first document, then display the numeric matrix as cell output.
print(text_input[0])
numeric_input.values

### Tokenize the text data

In [None]:
# Every document becomes a sequence of exactly `max_sequence` token ids;
# the tokenizer vocabulary is capped through `num_words = max_words`.
max_sequence = 300
max_words = 50000

tokenizer = Tokenizer(
    num_words = max_words,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower = True,
)

# Build the vocabulary from the raw text, encode each document as token ids,
# then pad/truncate every sequence to the common length.
tokenizer.fit_on_texts(text_input)
sequences = tokenizer.texts_to_sequences(text_input)
X = pad_sequences(sequences, maxlen = max_sequence)
X

### Input and Output Arrays

In [None]:
x = X                 # text input: padded token ids, (n_samples, max_sequence = 300)
z = numeric_input.values     # numeric input, (n_samples, 10)
y = df["LENGTH_OF_STAY"].values  # regression target taken from the LENGTH_OF_STAY column

# Print the three shapes to confirm the arrays are row-aligned.
print(x.shape,y.shape,z.shape)

### Exploratory analysis

In [None]:
def _describe(values):
    """Return the one-line mean/median/max/min/range summary of `values`."""
    return 'Mean : {}, Median : {}, Max : {}, Min : {}, Range: {}'.format(
        np.mean(values),
        np.median(values),
        np.max(values),
        np.min(values),
        np.max(values) - np.min(values),
    )


# Summary statistics of the target, then of every numeric feature column.
print('Output (Length of stay):\n', _describe(y), '\n')
for name, column in zip(numeric_input.columns, z.T):
    print('{}:'.format(name))
    print(_describe(column), '\n')

# Left panel: target values in row order with the mean drawn as a horizontal line.
plt.subplot(1,3,1)
plt.scatter(range(len(y)), y, s= 1, color = 'g', label = 'scatter')
plt.hlines(y = np.mean(y), xmin = 0, xmax = len(y), colors = 'b', label = 'mean')
plt.title('Output : scatter plot & mean')
plt.legend(loc = 1)

# Right panel: 6-bin histogram of the target with the mean as a vertical line.
plt.subplot(1,3,3)
plt.hist(y, bins = 6, label = 'hist')
plt.vlines(x = np.mean(y), ymin = 0, ymax = len(y), colors = 'g', label = 'mean')
plt.title('Output : histogram & mean')
plt.legend(loc = 1)
plt.show()

### Train/Test Split

In [None]:
# Hold out 20% of the rows for testing. The text, numeric and target arrays are
# split in a single call so that their rows stay aligned; random_state fixes the
# shuffle so the split is reproducible.
test_size = 0.2
x_train, x_test, z_train, z_test, y_train, y_test = train_test_split(x,z,y, test_size = test_size, random_state = 0)
print('\nNumber of training data:',x_train.shape[0])
# The second count is the size of the *test* split (the label previously said "text").
print('\nNumber of test data:',x_test.shape[0])
z_train.shape

### Model structure

#### Linear Regression Models

In [None]:
# Baseline: ordinary least squares on the training split, once with the numeric
# features (z_train) and once with the padded token ids (x_train).
lr = LinearRegression()

# Out-of-fold predictions from 4-fold cross-validation.
yhat_cv = cross_val_predict(lr, z_train, y_train, cv = 4)
yhat_cv_text = cross_val_predict(lr, x_train, y_train, cv = 4)

# Per-fold scores from the same 4-fold scheme.
score_cv = cross_val_score(lr, z_train, y_train, cv = 4)
score_cv_text = cross_val_score(lr, x_train, y_train, cv = 4)

# Metrics computed on the cross-validated predictions.
R2 = r2_score(y_train, yhat_cv)
MSE = mean_squared_error(y_train, yhat_cv)
MSE_text = mean_squared_error(y_train, yhat_cv_text)

# Training-set fit on the numeric features. The feature count in the message is
# read from the array instead of being hard-coded.
lr.fit(z_train, y_train)
yhat_train = lr.predict(z_train)
score_train = lr.score(z_train, y_train)
print('\nLinear regression training score (R squared) of numeric data ( , {}) is'.format(z_train.shape[1]),score_train)

# Training-set fit on the text features. The old message claimed 250 columns,
# while the sequences are padded to max_sequence; report the real width.
lr.fit(x_train, y_train)
yhat_train_text = lr.predict(x_train)
score_train_text = lr.score(x_train, y_train)
print('\nLinear regression training score (R squared) of text data ( , {}) is'.format(x_train.shape[1]), score_train_text)

#### Polynomial linear regression

In [None]:
# Degree-2 polynomial expansion followed by linear regression.
# Each feature set gets its own transformer, fitted on the training split only;
# the test split is expanded with `transform` so it always goes through the
# transformer that was fitted on the training data (previously a single shared
# object was re-fitted on every array, including the test arrays).
pf_num = PolynomialFeatures(degree = 2)
z_train_poly = pf_num.fit_transform(z_train)
z_test_poly = pf_num.transform(z_test)

pf_text = PolynomialFeatures(degree = 2)
x_train_poly = pf_text.fit_transform(x_train)
x_test_poly = pf_text.transform(x_test)

# Model on the expanded numeric features.
poly_num = LinearRegression()
poly_num.fit(z_train_poly, y_train)
yhat_poly_num = poly_num.predict(z_train_poly)

# Model on the expanded text (token id) features.
poly_text = LinearRegression()
poly_text.fit(x_train_poly, y_train)
yhat_poly_text = poly_text.predict(x_train_poly)

# Training-set score plus the first five actual/predicted values for each model.
print('Numeric:')
print('\nNumeric: R squared error for polynomial regression is:',poly_num.score(z_train_poly, y_train))
print('Actual output:', y_train[:5])
print('\nPredicted output:', yhat_poly_num[:5])
print('\nText:')
print('\nR squared error for polynomial regression is:',poly_text.score(x_train_poly, y_train))
print('Actual output:', y_train[:5])
print('\nPredicted output:', yhat_poly_text[:5])

#### DNN : Numeric data only

In [None]:
# Feed-forward regressor on the numeric features, built with the functional API.
# (The former `m_num = Sequential()` was dead code: it was overwritten by the
# `Model(...)` below before ever being used.)
# The input width is taken from the training matrix rather than hard-coded.
num_input = Input(shape = (z_train.shape[1], ))

# Four Dense layers of decreasing width (512 -> 256 -> 128 -> 64), each followed
# by 20% dropout. The first layer is a linear projection, the others use ReLU.
numl = Dense(512, activation = 'linear')(num_input)
d1 = Dropout(0.2)(numl)
numl2 = Dense(256, activation = 'relu')(d1)
d2 = Dropout(0.2)(numl2)
numl3 = Dense(128, activation = 'relu')(d2)
d3 = Dropout(0.2)(numl3)
numl4 = Dense(64, activation = 'relu')(d3)
d4 = Dropout(0.2)(numl4)

# Single linear unit: unbounded regression output.
output = Dense(1, activation = "linear")(d4)
m_num = Model(inputs = num_input, outputs = output)
m_num.summary()

In [None]:
# Stop training when the validation loss has not improved for 5 epochs.
callback = EarlyStopping(monitor='val_loss', patience=5)
# `learning_rate` is the current name of the argument (the old `lr` alias is
# deprecated) and matches the other optimizers created in this notebook.
adam = optimizers.Adam(learning_rate = 0.00001)
# Optimise mean absolute error; additionally track MSE and MAE as metrics.
m_num.compile(optimizer = adam, loss = 'mae', metrics=['mse', 'mae'])

# Fit the model, holding out 20% of the training split for validation.
m_num_history = m_num.fit(z_train, y_train, batch_size=100, epochs=10, validation_split = 0.2, callbacks = [callback])

In [None]:
# Predictions of the numeric-only network on both splits.
yhat_train_num = m_num.predict(z_train)
yhat_test_num = m_num.predict(z_test)

# R squared on both splits plus a small sample of actual vs. predicted values.
print('\nRsquared score on train is:',r2_score(y_train, yhat_train_num))
print('\nRsquared score on test is:',r2_score(y_test, yhat_test_num))
print('\nActual output:\n', y_train[:5])
print('\nPredicted output:\n', yhat_train_num[:5])

# Actual (green) and predicted (blue) training targets plotted against the
# numeric feature in column 1.
for series, colour, tag in ((y_train, 'g', 'actual'), (yhat_train_num, 'b', 'predicted')):
    plt.scatter(z_train[:,1], series, s = 5, color = colour, label = tag)
plt.legend(loc = 1)
plt.show()

In [None]:
# Learning curves of the numeric-only network.
num_hist = m_num_history.history

# Top panel: the optimised loss. The model is compiled with loss='mae', so the
# axis is labelled accordingly (it was previously mislabelled as MSE).
plt.subplot(2,1,1)
plt.plot(range(len(num_hist['val_loss'])), num_hist['val_loss'], '-o', label = 'val_loss')
plt.plot(range(len(num_hist['loss'])), num_hist['loss'], '-o', label = 'train_loss')
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('Loss (MAE)')
plt.title('DNN: Numeric data')

# Bottom panel: the MAE metric on the validation and training data.
plt.subplot(2,1,2)
plt.plot(range(len(num_hist['val_mae'])), num_hist['val_mae'], '-o', label = 'validation MAE')
plt.plot(range(len(num_hist['mae'])), num_hist['mae'], '-o', label = 'train MAE')
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('MAE')
plt.show()

#### DNN : Text only

In [None]:
# Feed-forward regressor fed directly with the padded token-id sequences.
# (The former `m_text = Sequential()` was dead code, overwritten by `Model(...)`.)
# The input tensor gets its own name so it no longer overwrites `text_input`,
# the raw text array that the tokenizer cell reads; its width follows
# `max_sequence` instead of repeating the literal 300.
text_in = Input(shape = (max_sequence, ))

# NOTE(review): softmax is used as the activation of the first hidden layer;
# confirm this is intended rather than e.g. ReLU.
numl = Dense(1000, activation = 'softmax')(text_in)
d1 = Dropout(0.4)(numl)
numl2 = Dense(500, activation = 'relu')(d1)
d2 = Dropout(0.4)(numl2)
numl3 = Dense(100, activation = 'relu')(d2)
d3 = Dropout(0.3)(numl3)

# Single linear unit: unbounded regression output.
output = Dense(1, activation = "linear")(d3)
m_text = Model(inputs = text_in, outputs = output)
# m_text.summary()

#### Train the model

In [None]:
# Stop training when the validation loss has not improved for 5 epochs.
callback = EarlyStopping(monitor='val_loss', patience=5)
# `learning_rate` is the current name of the argument (the old `lr` alias is
# deprecated) and matches the other optimizers created in this notebook.
# NOTE(review): 0.1 is a large step size for Adam -- confirm it is intended.
adam = optimizers.Adam(learning_rate = 0.1)
# Optimise mean squared error; additionally track MSE and MAE as metrics.
m_text.compile(optimizer = adam, loss = 'mse', metrics=['mse', 'mae'])

# Fit the model, holding out 20% of the training split for validation.
m_text_history = m_text.fit(x_train, y_train, batch_size=100, epochs=100, validation_split = 0.2, callbacks = [callback])

#### Score

In [None]:
# Predictions of the text-only dense network on both splits.
yhat_train_text = m_text.predict(x_train)
yhat_test_text = m_text.predict(x_test)

# R squared on both splits plus a small sample of actual vs. predicted values.
print('\nRsquared score on train is:',r2_score(y_train, yhat_train_text))
print('\nRsquared score on test is:',r2_score(y_test, yhat_test_text))
print('\nActual output:\n', y_train[:5])
print('\nPredicted output:\n', yhat_train_text[:5])

# Actual (green) and predicted (blue) training targets plotted by row position.
for series, colour, tag in ((y_train, 'g', 'actual'), (yhat_train_text, 'b', 'predicted')):
    plt.scatter(range(len(y_train)), series, s = 5, color = colour, label = tag)
plt.legend(loc = 1)
plt.show()

In [None]:
# Learning curves of the text-only dense network.
text_hist = m_text_history.history

# Top panel: training and validation loss per epoch.
plt.subplot(2,1,1)
for metric_key, curve_label in (('val_loss', 'val_loss'), ('loss', 'train_loss')):
    plt.plot(range(len(text_hist[metric_key])), text_hist[metric_key], '-o', label = curve_label)
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('MSE')
plt.title('DNN: Text data')

# Bottom panel: MAE metric per epoch.
plt.subplot(2,1,2)
for metric_key, curve_label in (('val_mae', 'validation MAE'), ('mae', 'train MAE')):
    plt.plot(range(len(text_hist[metric_key])), text_hist[metric_key], '-o', label = curve_label)
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('MAE')
plt.show()

#### RNN : Text only

In [None]:
# Recurrent regressor on the text input only, built with the functional API.
embedding_size = 250

# One integer token id per position of the padded sequence.
nlp_input = Input(shape=(max_sequence,))
emb = Embedding(max_words, embedding_size, input_length = max_sequence)(nlp_input)  # token ids -> embedding vectors of size embedding_size
# Dense layer applied to the embedding output before the recurrent stack.
l1 = Dense(100, activation = 'elu')(emb)
# Three stacked LSTMs; return_sequences=True keeps the full sequence so the
# next recurrent layer receives one vector per position.
# NOTE(review): the first two LSTMs use relu instead of the default tanh -- confirm intended.
nl2 = LSTM(100, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True)(l1)
d1 = Dropout(0.2)(nl2)
nl3 = LSTM(50, activation = 'relu', kernel_initializer = 'glorot_uniform', return_sequences=True)(d1)
d2 = Dropout(0.2)(nl3)
nl4 = LSTM(50, activation = 'tanh', dropout = 0.2, recurrent_dropout = 0.15, return_sequences = True)(d2)
# Final GRU is created without return_sequences, so it hands a single vector to the output layer.
nl5 = GRU(50, activation = 'relu', recurrent_activation = 'tanh')(nl4)

# Single linear unit: unbounded regression output.
output = Dense(1, activation = "linear")(nl5)

rnn_text = Model(inputs = [nlp_input], outputs = [output])
rnn_text.summary()

In [None]:
# NOTE(review): an Adam learning rate of 1 is very large -- confirm this is
# intended and not a placeholder.
solver = optimizers.Adam(learning_rate = 1)
# NOTE(review): with epochs = 1 below, a patience of 5 can never trigger.
callback = EarlyStopping(monitor='val_loss', patience=5)
# Optimise mean squared error; additionally track MAE and MSE as metrics.
rnn_text.compile(optimizer = solver,loss = 'mse', metrics = ['mae','mse'])
# Single training epoch with 20% of the training split held out for validation.
rnn_text_history = rnn_text.fit(x_train, y_train, batch_size = 100, epochs = 1, validation_split = 0.2, callbacks = [callback])

In [None]:
# Predictions of the text-only recurrent network on both splits.
yhat_train_rnn_text = rnn_text.predict(x_train)
yhat_test_rnn_text = rnn_text.predict(x_test)

# R squared on both splits plus a small sample of actual vs. predicted values.
print('\nRsquared score on train is:',r2_score(y_train, yhat_train_rnn_text))
print('\nRsquared score on test is:',r2_score(y_test, yhat_test_rnn_text))
print('\nActual output:\n', y_train[:5])
print('\nPredicted output:\n', yhat_train_rnn_text[:5])

# Actual (green) and predicted (blue) training targets plotted against the
# token id stored in column 1 of the padded sequences.
for series, colour, tag in ((y_train, 'g', 'actual'), (yhat_train_rnn_text, 'b', 'predicted')):
    plt.scatter(x_train[:,1], series, s = 5, color = colour, label = tag)
plt.legend(loc = 1)
plt.show()

In [None]:
# Learning curves of the text-only recurrent network.
rnn_hist = rnn_text_history.history

# Top panel: training and validation loss per epoch.
plt.subplot(2,1,1)
for metric_key, curve_label in (('val_loss', 'val_loss'), ('loss', 'train_loss')):
    plt.plot(range(len(rnn_hist[metric_key])), rnn_hist[metric_key], '-o', label = curve_label)
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('MSE')
plt.title('RNN : Text data')

# Bottom panel: MAE metric per epoch.
plt.subplot(2,1,2)
for metric_key, curve_label in (('val_mae', 'validation MAE'), ('mae', 'train MAE')):
    plt.plot(range(len(rnn_hist[metric_key])), rnn_hist[metric_key], '-o', label = curve_label)
plt.legend(loc = 1, fontsize = 10)
plt.ylabel('MAE')
plt.show()

#### RNN : Text & DNN : Numeric

In [None]:
# Two-branch regressor: a dense branch for the numeric features and a recurrent
# branch for the text, concatenated before a single linear output unit.
# (The former `m = Sequential()` was dead code, overwritten by `Model(...)`, and
# the bare triple-quoted strings used as section labels are now real comments.)
embedding_size = 100

# Numeric branch. The input width is read from the training matrix instead of
# being hard-coded.
num_input = Input(shape = (z_train.shape[1], ))
numl = Dense(100, activation = 'relu')(num_input)
# Named `num_drop` (formerly `dl`) to avoid confusion with `d1` in the text branch.
num_drop = Dropout(0.2)(numl)

# Text branch: embedding, three stacked LSTMs that keep the full sequence
# (return_sequences=True), then a GRU that reduces it to a single vector.
nlp_input = Input(shape=(max_sequence,))
emb = Embedding(max_words, embedding_size, input_length = max_sequence)(nlp_input)
nl2 = LSTM(200, activation = 'tanh', kernel_initializer = 'glorot_uniform', return_sequences=True)(emb)
d1 = Dropout(0.2)(nl2)
nl3 = LSTM(100, activation = 'tanh', kernel_initializer = 'glorot_uniform', return_sequences=True)(d1)
d2 = Dropout(0.2)(nl3)
nl4 = LSTM(50, activation = 'tanh', dropout = 0.2, recurrent_dropout = 0.2, return_sequences = True)(d2)
nl5 = GRU(50, activation = 'tanh', recurrent_activation = 'tanh')(nl4)

# Join the outputs of the numeric and text branches.
merge = concatenate([num_drop, nl5])

# Single linear unit: unbounded regression output.
output = Dense(1, activation = "linear")(merge)

# Input order is [numeric, text]; fit/predict calls must pass data in that order.
m = Model(inputs = [num_input, nlp_input], outputs = [output])

m.summary()

In [None]:
# NOTE(review): an Adam learning rate of 1 is very large -- confirm this is
# intended and not a placeholder.
solver = optimizers.Adam(learning_rate = 1)
# NOTE(review): with epochs = 1 below, a patience of 5 can never trigger.
callback = EarlyStopping(monitor='val_loss', patience=5)
# Optimise mean absolute error; additionally track MAE and MSE as metrics.
m.compile(optimizer = solver,loss = 'mae', metrics = ['mae','mse'])
# Inputs are passed as [numeric, text], matching the order given to Model(inputs=...).
history = m.fit([z_train, x_train], y_train, batch_size = 100, epochs = 1, validation_split = 0.2, callbacks = [callback])

In [None]:
# Predictions of the combined model; inputs are passed as [numeric, text].
train_inputs = [z_train, x_train]
test_inputs = [z_test, x_test]
yhat_train = m.predict(train_inputs)
yhat_test = m.predict(test_inputs)

# R squared on both splits.
print('Rsquared score on train is:',r2_score(y_train, yhat_train))
print('Rsquared score on test is:',r2_score(y_test, yhat_test))

# Actual (green) and predicted (blue) training targets plotted against the
# numeric feature in column 1.
for series, colour, tag in ((y_train, 'g', 'actual'), (yhat_train, 'b', 'predicted')):
    plt.scatter(z_train[:,1], series, s = 5, color = colour, label = tag)
plt.legend(loc = 1)
plt.show()

In [None]:
# One single-entry dict per layer of the combined model, mapping the layer
# object to the list of weight arrays returned by `get_weights()`.
Weights = [{layer : layer.get_weights()} for layer in m.layers]
Weights