# Diabetic patients readmission rates prediction


In [70]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import optimize
from sklearn import datasets as skdataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier


import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
from learn2learn.algorithms.maml import MAML
from learn2learn.data import TaskDataset

from tqdm import tqdm

# Project introduction

- Overview: <br>
This project is focusing on developing a predictive model to ascertain the likelihood of readmission for diabetes patients.
<br>

- Target:<br>
The main goal of this project is to develop a powerful machine learning model that can predict the readmission rate of patients.

# Data loading
The following cells are used to load training and testing data for our prediction

In [71]:
# Load the training split, the test split and the ID lookup table.
# The module is imported under the alias `pd`; the bare name `pandas` is not
# defined in this notebook and raised a NameError.
train_data = pd.read_csv("Dataset/diabetic_data_training.csv")
test_data = pd.read_csv("Dataset/diabetic_data_test.csv")
# The mapping file is read without a header row, so its columns are numbered.
mapping_info = pd.read_csv("Dataset/IDS_mapping.csv", header=None)

In [None]:
def plot_column(ax, column, df):
    """Draw one column of ``df`` on ``ax``, picking the chart from the column dtype.

    Parameters:
    ax: Axes-like object that receives the plot.
    column: Label of the column to draw; also used as the panel title.
    df (pandas.DataFrame): Frame that contains ``column``.

    Chart selection:
    - ``object`` dtype with exactly two distinct values -> bar chart of counts
    - any other ``object`` dtype -> pie chart of counts
    - ``int64`` / ``float64`` -> 20-bin histogram
    - anything else -> a text note saying the dtype is not handled
    """
    series = df[column]
    dtype = series.dtype

    if dtype == 'object':
        # Two distinct values are shown as bars, more (or fewer) as a pie.
        if series.nunique() == 2:
            chart_options = {'kind': 'bar'}
        else:
            chart_options = {'kind': 'pie', 'autopct': '%1.1f%%'}
        series.value_counts().plot(ax=ax, **chart_options)
    elif dtype in ('int64', 'float64'):
        series.plot(kind='hist', bins=20, ax=ax)
    else:
        message = f"Unhandled data type for column: {column}"
        ax.text(0.5, 0.5, message, fontsize=12, ha='center')

    ax.set_title(column)

# Lay the per-column plots out on a grid with 4 panels per row and as many
# rows as the training frame needs (previously hard-coded to 13 rows, which
# fails with an IndexError once there are more than 52 columns).
n_plot_cols = 4
n_plot_rows = max(1, -(-len(train_data.columns) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(20, 5 * n_plot_rows), squeeze=False)
fig.tight_layout(pad=5.0)

# Draw one panel per column, filling the grid row by row.
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, train_data.columns):
    plot_column(ax, col, train_data)

# Hide the panels left over when the column count is not a multiple of 4.
for ax in flat_axes[len(train_data.columns):]:
    ax.set_visible(False)

plt.show()


# Data preprocessing
The following cells are used to preprocess the training and testing data. There are two main goals in our preprocessing data section of the code
- Change the string type data in our dataset to integer type data 
- Apply a suitable method to fill in the missing values

In [None]:
# Build the working frames for preprocessing: remove the columns that are not
# used as features and turn the '?' placeholder into NaN.
# Missing values are only marked here; they are handled in later steps.
dropped_columns = ['weight', 'encounter_id', 'patient_nbr', 'examide', 'citoglipton',
                   'glimepiride-pioglitazone']

# drop()/replace() return new frames, so the raw inputs stay untouched.
df = train_data.drop(columns=dropped_columns).replace('?', np.nan)
df_test = test_data.drop(columns=dropped_columns).replace('?', np.nan)

In [None]:
# Replace each age bracket string ("<low>-<high>") with the integer midpoint
# of its two bounds, for both the training and the test frame.
age_pattern = r'(\d+)-(\d+)'
for frame in (df, df_test):
    age_bounds = frame['age'].str.extract(age_pattern).astype(int)
    frame['age'] = (age_bounds[0] + age_bounds[1]) // 2


In [None]:
# Remove training rows with an unknown race.
# The '?' placeholder was converted to NaN earlier in the preprocessing, so a
# plain comparison with '?' no longer matches anything (NaN != '?' is True for
# every row). Test for NaN as well so the rows are actually removed.
df = df[df['race'].notna() & (df['race'] != '?')]
# Number of remaining rows that contain at least one truthy value.
print(df.any(axis=1).sum())

One-Hot Encoding for race:
cons: One-hot encoding can significantly increase the dataset's dimensionality (a problem known as the "curse of dimensionality"), especially if the categorical feature has many unique values. This can increase the computational cost and may require more data to achieve good performance.
Dimensionality reduction will be applied afterwards, so this cost matters less here.
pros: Mapping each category to a fixed number would imply an ordinal relationship between the categories that may not exist; one-hot encoding avoids this, so it is ideal for non-ordinal categorical data. It is suitable for many machine learning models, especially those that assume no ordinal relationship between categories.


1. random forest, remove ?
2. randomly assign ? to a class according to the class distribution

General missing values
1. multiple imputation (to be decided when training: use it if 5 epochs take less than 1h)
2. mean
3. fully remove
4. wrong -> fix ?


Encoding for age:
1. Assuming a normal distribution, map to a random age in the range
2. Map to mean age in the range

In [None]:
def one_hot_encode(feature):
    """
    One-hot encode a single feature column.

    Parameters:
    feature (pandas.Series): One column of the feature matrix.

    Returns:
    np.ndarray: 2-D array with one row per element of ``feature`` and one
    indicator column per distinct value.
    """
    return pd.get_dummies(feature).to_numpy()

In [None]:
# Drop training rows with an invalid gender, then one-hot encode the nominal columns.
df = df[df['gender'] != 'Unknown/Invalid']
Name = ['race','gender','change','diabetesMed']
df_encoded = pd.get_dummies(df, columns=Name, prefix=Name)

# The test frame is encoded separately, so its categories may differ from the
# training ones (e.g. 'Unknown/Invalid' gender is only filtered from the
# training rows). Align it to the training layout: indicator columns missing
# from the test split are added as 0, and columns unknown to training are dropped.
df_test_encoded = pd.get_dummies(df_test, columns=Name, prefix=Name).reindex(
    columns=df_encoded.columns, fill_value=0
)

In [None]:
# Integer-encode medical_specialty.
# Code 0 is reserved for "missing"; observed categories get 1..K in order of
# first appearance in the training data. Previously the code given to NaN
# depended on the row where the first missing value happened to appear, while
# the imputation step treats code 0 as the missing marker.
category_mapping = {
    category: code
    for code, category in enumerate(df_encoded['medical_specialty'].dropna().unique(), start=1)
}
df_encoded['medical_specialty'] = (
    df_encoded['medical_specialty'].map(category_mapping).fillna(0).astype(int)
)
# Categories that occur only in the test split have no code; treat them as missing (0).
df_test_encoded['medical_specialty'] = (
    df_test_encoded['medical_specialty'].map(category_mapping).fillna(0).astype(int)
)

In [None]:
# Integer-encode payer_code.
# Code 0 is reserved for "missing"; observed categories get 1..K in order of
# first appearance in the training data. Previously the code given to NaN
# depended on the row where the first missing value happened to appear, while
# the imputation step treats code 0 as the missing marker.
category_mapping = {
    category: code
    for code, category in enumerate(df_encoded['payer_code'].dropna().unique(), start=1)
}
df_encoded['payer_code'] = (
    df_encoded['payer_code'].map(category_mapping).fillna(0).astype(int)
)
# Categories that occur only in the test split have no code; treat them as missing (0).
df_test_encoded['payer_code'] = (
    df_test_encoded['payer_code'].map(category_mapping).fillna(0).astype(int)
)

In [None]:
# Ordinal codes for the medication columns.
medicion_mapping = {'No': 0, 'Down': 1, 'Steady': 2, 'Up': 3}

# Numeric stand-ins for the lab-result categories.
# NOTE(review): the published UCI CSV spells the normal result as 'Norm'; that
# spelling is accepted next to 'normal' so those rows are not turned into NaN
# by Series.map. Confirm against the values present in the local files.
max_glu_serum_mapping = {'>200': 201, '>300': 301, 'normal': 0, 'Norm': 0}
A1Cresult_mapping = {'>8': 9, '>7': 7.5, 'normal': 6, 'Norm': 6}

# Target classes.
readmitted_mapping = {'NO': 0, '<30': 1, '>30': 2}

In [None]:
# Convert the max_glu_serum categories to numbers in both frames; values
# without an entry in the mapping become NaN.
for frame in (df_encoded, df_test_encoded):
    frame['max_glu_serum'] = frame['max_glu_serum'].map(max_glu_serum_mapping)

In [None]:
# Convert the A1Cresult categories to numbers in both frames; values without
# an entry in the mapping become NaN.
for frame in (df_encoded, df_test_encoded):
    frame['A1Cresult'] = frame['A1Cresult'].map(A1Cresult_mapping)

In [None]:
# Convert the readmitted target labels to their integer class codes in both frames.
for frame in (df_encoded, df_test_encoded):
    frame['readmitted'] = frame['readmitted'].map(readmitted_mapping)

In [None]:
# Apply the medication mapping to the columns at positions 19..38 (inclusive)
# of each frame.
# NOTE(review): the selection is purely positional; presumably these are the
# medication columns. Verify whenever the column layout changes.
for frame in (df_encoded, df_test_encoded):
    for med_column in frame.columns[19:39]:
        frame[med_column] = frame[med_column].map(medicion_mapping)

Encode payer_code, medical_specialty
1. Find correlations with the other features within the group of rows that have no missing values
2. Use the identified features to predict payer_code and medical_specialty
3. Prediction algorithm to be decided; could be KNN

encode diag_1,diag_2,diag_3
1. one hot
2. ????????????? TBD
3. remove missing

encoding all medicine:
map to 0-3

In [None]:

# Replace each ID column of the training frame by the mean `readmitted` value
# of its category, stored as '<column>_readmitted_Mean'.
names = ['admission_type_id','discharge_disposition_id','admission_source_id']
for id_column in names:
    mean_column = id_column+'_readmitted_Mean'
    # One row per category: the category value and its mean target.
    category_means = (
        df_encoded.groupby(id_column)['readmitted']
        .mean()
        .rename(mean_column)
        .reset_index()
    )
    # The left merge attaches the category mean to every row (and renumbers
    # the row index); the original ID column is then removed.
    df_encoded = df_encoded.merge(category_means, on=id_column, how='left')
    df_encoded = df_encoded.drop(columns=id_column)

In [None]:
# Replace each ID column of the test frame by the mean `readmitted` value of
# its category, stored as '<column>_readmitted_Mean'.
# The means are computed from the TRAINING rows only: deriving them from the
# test labels (as before) leaks the target into the test features.
# `df` still holds the filtered training rows with the raw ID columns and the
# raw `readmitted` labels, so the per-category means are rebuilt from it.
names = ['admission_type_id','discharge_disposition_id','admission_source_id']
train_target = df['readmitted'].map(readmitted_mapping)
train_target_mean = train_target.mean()
for name in names:
    category_means = train_target.groupby(df[name]).mean()
    # Categories never seen in training fall back to the overall training mean.
    df_test_encoded[name+'_readmitted_Mean'] = (
        df_test_encoded[name].map(category_means).fillna(train_target_mean)
    )
    df_test_encoded = df_test_encoded.drop(name, axis=1)

In [None]:
# Remove the diagnosis columns from both frames; they are not encoded and are
# not used as features.
unused_diagnosis_columns = ['diag_1', 'diag_2', 'diag_3', 'number_diagnoses']
for frame in (df_encoded, df_test_encoded):
    frame.drop(columns=unused_diagnosis_columns, inplace=True)

In [None]:
# Build the row subsets used to fill in `medical_specialty` and `payer_code`
# with a classifier: for each of the two columns, a set of rows with a known
# value (features + labels) and a set of rows whose value is to be predicted.
# NOTE(review): code 0 is treated as "missing" for both columns below; this
# only holds if the encoding step gave 0 to the missing category. Confirm.
# Candidate methods that were discussed for filling the gaps:
#   delete
#   mean, median or mode
#   kNN prediction
#   multiple imputation
predict_nan = df_encoded.copy()


# Row labels whose medical_specialty code is not 0.
df_real_ms_index = predict_nan.index[predict_nan['medical_specialty'] != 0]
# Row labels whose medical_specialty code occurs more than once in the column
# (duplicated(keep=False) flags every occurrence of a repeated value).
unique_rows_index = predict_nan.index[predict_nan['medical_specialty'].duplicated(keep=False)]
# Rows that satisfy both conditions: non-zero code that is not a one-off.
df_real_ms_index_total = df_real_ms_index.join(unique_rows_index,how = 'inner')

# Same selection for payer_code (`unique_rows_index` is reused).
df_real_pc_index = predict_nan.index[predict_nan['payer_code'] != 0]
unique_rows_index = predict_nan.index[predict_nan['payer_code'].duplicated(keep=False)]
df_real_pc_index_total = df_real_pc_index.join(unique_rows_index,how = 'inner')

# Labels for the classifier: single-column frames with the known codes.
df_real_ms = predict_nan.loc[df_real_ms_index_total,['medical_specialty']]
df_real_pc = predict_nan.loc[df_real_pc_index_total,['payer_code']]

# Neither target column is used as a feature for the imputation.
predict_nan.drop(columns = ['medical_specialty','payer_code'],inplace = True)

# Features of the rows with a known medical_specialty, and of the rows whose
# code is 0. Rows with a non-zero code that occurs only once are in neither set.
df_data_train_ms = predict_nan.loc[df_real_ms_index_total]
df_data_predict_ms = predict_nan.loc[~predict_nan.index.isin(df_real_ms_index)]

# Same split for payer_code.
df_data_train_pc = predict_nan.loc[df_real_pc_index_total]
df_data_predict_pc = predict_nan.loc[~predict_nan.index.isin(df_real_pc_index)]

In [None]:
# Fill in the missing medical_specialty / payer_code codes with a
# gradient-boosted classifier trained on the rows whose code is known.

# --- medical_specialty ---
missing_value_predict_model = HistGradientBoostingClassifier(max_iter=100)
missing_value_predict_model.fit(df_data_train_ms, df_real_ms['medical_specialty'])
missing_ms_rows = ~df_encoded.index.isin(df_real_ms_index)
# predict() rejects an empty frame, so skip the step when nothing is missing.
if missing_ms_rows.any():
    # A single .loc[rows, column] assignment writes into df_encoded itself;
    # the chained form df[col].loc[rows] = ... may only modify a temporary copy.
    df_encoded.loc[missing_ms_rows, 'medical_specialty'] = (
        missing_value_predict_model.predict(df_data_predict_ms)
    )

# --- payer_code ---
missing_value_predict_model = HistGradientBoostingClassifier(max_iter=100)
# Pass the labels as a 1-D Series rather than a single-column frame.
missing_value_predict_model.fit(df_data_train_pc, df_real_pc['payer_code'])
missing_pc_rows = ~df_encoded.index.isin(df_real_pc_index)
if missing_pc_rows.any():
    df_encoded.loc[missing_pc_rows, 'payer_code'] = (
        missing_value_predict_model.predict(df_data_predict_pc)
    )

Dimensionality reduction
1. PCA/PPCA
2. LDA/QDA
3. followed by t-SNE
4. Autoencoders
5. Unsupervised algorithm

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


# Dimensionality-reduction demo on simulated data: PCA followed by t-SNE.

# Simulating Data
np.random.seed(0)
num_samples = 1000
num_features = 5

# Numerical data
numeric_data = np.random.randn(num_samples, num_features)

# Categorical data (let's say, colors)
colors = ['Red', 'Green', 'Blue']
categorical_data = np.random.choice(colors, size=num_samples)

# Convert categorical data to one-hot encoding.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn; converting the default sparse result works with either version.
encoder = OneHotEncoder()
categorical_encoded = encoder.fit_transform(categorical_data.reshape(-1, 1)).toarray()

# Combining numerical and categorical data
combined_data = np.hstack((numeric_data, categorical_encoded))

# Standardize every column of the combined matrix (numeric and one-hot)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(combined_data)

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of the variance
pca_result = pca.fit_transform(scaled_data)

# Apply t-SNE.
# The iteration keyword was renamed from `n_iter` to `max_iter` in
# scikit-learn; try the new name first and fall back to the old one.
tsne_options = dict(n_components=2, perplexity=30)
try:
    tsne = TSNE(max_iter=300, **tsne_options)
except TypeError:
    tsne = TSNE(n_iter=300, **tsne_options)
tsne_result = tsne.fit_transform(pca_result)

# Plotting the results with matplotlib (seaborn's `sns` was used here before,
# but it is never imported in this notebook).
fig, ax = plt.subplots(figsize=(10, 8))
for color in colors:
    is_color = categorical_data == color
    ax.scatter(tsne_result[is_color, 0], tsne_result[is_color, 1],
               color=color.lower(), label=color, s=20)
ax.set_title('t-SNE plot of the dataset')
ax.set_xlabel('t-SNE Axis 1')
ax.set_ylabel('t-SNE Axis 2')
ax.legend(title='Category')
plt.show()


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Single-hidden-layer autoencoder trained to reconstruct `combined_data`.

input_shape = combined_data.shape[1]  # combined data from previous steps
encoding_dim = 32  # example of encoding dimension
# NOTE(review): with the simulated data (5 numeric + 3 one-hot columns) the
# encoding is wider than the input, so nothing is compressed. Pick a value
# below `input_shape` when using this for dimensionality reduction.

# This is our input placeholder
input_data = Input(shape=(input_shape,))

# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_data)

# "decoded" is the lossy reconstruction of the input.
# The inputs are unbounded real values (standard-normal columns), so the
# output layer is linear: a sigmoid can only produce values in (0, 1).
decoded = Dense(input_shape, activation='linear')(encoded)

# This model maps an input to its reconstruction
autoencoder = Model(input_data, decoded)

# Mean squared error is the reconstruction loss for real-valued targets;
# binary cross-entropy requires targets in [0, 1].
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(combined_data, combined_data, epochs=50, batch_size=256, shuffle=True)


# Model Building
We will build two models: 
1. A traditional machine learning model using Random Forest.
2. A deep learning model using PyTorch.


At the first step we will try to use the Random Forest method to get the result

In [None]:
# the code for Random Forest algorithm

In [None]:
# the code for Neural Network

# Model Training


In [None]:
# Fit the classifier on the standardized training features.
# The variable is called `rf_classifier`, but the model is a
# HistGradientBoostingClassifier.
Y = df_encoded['readmitted']

# Standardize every column, then separate the features from the target column.
scaled_values = scaler.fit_transform(df_encoded)
df_normalized = pd.DataFrame(scaled_values, columns=df_encoded.columns)
X = df_normalized.drop(columns='readmitted')

rf_classifier = HistGradientBoostingClassifier(max_iter=100, random_state=42).fit(X, Y)

# Neural Network

# Model Evaluation

In [None]:
# Evaluate the trained classifier on the test split.
Y_test = df_test_encoded['readmitted']

# Give the test frame exactly the columns (and order) the scaler and the
# classifier were fitted on; columns absent from the test split are filled with 0.
test_aligned = df_test_encoded.reindex(columns=df_encoded.columns, fill_value=0)

# Reuse the statistics learned from the training data (`transform`, not
# `fit_transform`): refitting the scaler on the test set would put the test
# features on a different scale than the one the model was trained on.
df_test_normalized = pd.DataFrame(scaler.transform(test_aligned), columns=test_aligned.columns)
X_test = df_test_normalized.drop('readmitted', axis=1)

Y_pred = rf_classifier.predict(X_test)
print(Y_pred)
print(Y_test)
print(accuracy_score(Y_test,Y_pred))