In [None]:
import pandas as pd
import numpy as np

In [None]:
# Identifier / constant columns that carry no predictive signal.
id_columns = ['encounter_id', 'hospital_id', 'patient_id', 'icu_id', 'readmission_status']

# Labelled data: tag every row as 'train' so the two sets can be separated
# again after they are processed together.
train_ = (
    pd.read_csv("training_v2.csv")
    .assign(label='train')
    .drop(columns=id_columns)
)

# Unlabelled data: tagged 'test'; its hospital_death column is removed as well.
predict_ = (
    pd.read_csv("unlabeled.csv")
    .assign(label='test')
    .drop(columns=id_columns + ['hospital_death'])
)

for frame in (train_, predict_):
    print(frame.shape)

In [None]:
# Fill missing height / weight in the training set with a fixed default chosen
# by gender (female, male, gender unknown).
#
# The missing-value mask is computed per column and *before* anything is
# filled. Previously the weight fills tested `height.isna()`, which is always
# False once height has been filled, so missing weights were never imputed.
for column, female_default, male_default, unknown_default in (
    ("height", 160, 180, 170),
    ("weight", 65, 82, 74),
):
    missing = train_[column].isna()
    train_.loc[missing & (train_.gender == 'F'), column] = female_default
    train_.loc[missing & (train_.gender == 'M'), column] = male_default
    train_.loc[missing & train_.gender.isna(), column] = unknown_default

In [None]:
# Fill missing height / weight in the unlabelled set with the same fixed
# gender-specific defaults used for the training set.
#
# The missing-value mask is computed per column and *before* anything is
# filled. Previously the weight fills tested `height.isna()`, which is always
# False once height has been filled, so missing weights were never imputed.
for column, female_default, male_default, unknown_default in (
    ("height", 160, 180, 170),
    ("weight", 65, 82, 74),
):
    missing = predict_[column].isna()
    predict_.loc[missing & (predict_.gender == 'F'), column] = female_default
    predict_.loc[missing & (predict_.gender == 'M'), column] = male_default
    predict_.loc[missing & predict_.gender.isna(), column] = unknown_default

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Side-by-side density plots of weight and height in the training set,
# one curve per gender.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# (gender code in the data, legend label, line colour)
gender_styles = (('F', 'Female', 'salmon'), ('M', 'Male', 'dodgerblue'))
# (column to plot, panel title) - one entry per subplot, left to right
panels = (('weight', 'Weight [kg]'), ('height', 'Height [cm]'))

for axis, (column, title) in zip(ax, panels):
    for code, legend_label, colour in gender_styles:
        sns.kdeplot(train_.loc[train_.gender == code, column],
                    label=legend_label, color=colour, ax=axis)
    axis.set_title(title, fontsize=14)
    # The curves carry labels, but they are only rendered once a legend is
    # requested explicitly; without it the two genders cannot be told apart.
    axis.legend()

plt.show()

In [None]:
# Share of missing values per column, measured on the training set only.
missing_share = train_.isna().sum() / len(train_)

# Names of the columns where more than 40% of the training rows are missing.
todelete = missing_share.index[missing_share > 0.4].values

# Remove those columns from both sets so they keep the same features.
for frame in (train_, predict_):
    frame.drop(columns=todelete, inplace=True)

train_.shape

In [None]:
# Discard training rows that have fewer than 54 populated (non-NaN) columns.
train_.dropna(axis='index', thresh=54, inplace=True)
train_.shape

In [None]:
# Stack the two sets into one frame so they are processed together; the outer
# index level ('x' = training rows, 'y' = unlabelled rows) records the origin.
frames_by_key = {'x': train_, 'y': predict_}
train_test = pd.concat(frames_by_key)
train_test.shape

In [None]:
# Load the dictionary file; later cells read its 'Variable Name' and
# 'Data Type' columns.
dictionary_ = pd.read_csv(filepath_or_buffer="WiDS Datathon 2020 Dictionary.csv")
print(dictionary_.shape)

In [None]:
# Group the variable names listed in the dictionary by their declared
# 'Data Type'. Each list keeps the row order of the dictionary file.
declared_type = dictionary_['Data Type']
variable_name = dictionary_['Variable Name']

integer_cols = variable_name[declared_type == 'integer'].tolist()
binary_cols = variable_name[declared_type == 'binary'].tolist()
numeric_cols = variable_name[declared_type == 'numeric'].tolist()
string_cols = variable_name[declared_type == 'string'].tolist()


In [None]:
# Fill gaps in every numeric column with the mean of the row's
# (ethnicity, gender) group.
#
# `transform('mean')` returns one value per original row, aligned on the
# original index, so it can be handed straight to `fillna`. The previous
# `groupby(...).apply(lambda x: x.fillna(x.mean()))` is not index-safe: newer
# pandas prepends the group keys to the result index, and rows whose group key
# is NaN are left out of the result, so the assignment could overwrite values
# that were present with NaN.
# Rows with a NaN ethnicity or gender have no group mean; their gaps are left
# untouched here.
group_keys = ['ethnicity', 'gender']

for col_name in numeric_cols:
    if col_name in train_test.columns:
        group_mean = train_test.groupby(group_keys, sort=False)[col_name].transform('mean')
        train_test[col_name] = train_test[col_name].fillna(group_mean)

In [None]:
from sklearn.impute import SimpleImputer

# Replace every remaining NaN with the most frequent value of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# fit_transform returns a bare array, so the column labels must be restored.
# They are taken from the frame that is actually being imputed (not from
# train_), which keeps labels and data in step even if the two ever differ.
# The row index is intentionally not restored: the result gets a fresh
# 0..n-1 index, as before.
column_names = train_test.columns
train_test = pd.DataFrame(imputer.fit_transform(train_test), columns=column_names)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# Only the dictionary's string variables that are still present in the frame.
present_string_cols = [name for name in string_cols if name in train_test.columns]

# Each column is fitted and encoded on its own, replacing its values with
# the encoder's output.
for col_name in present_string_cols:
    single_column = train_test[[col_name]]
    train_test[col_name] = enc.fit_transform(single_column)

In [None]:
def get_bmi_category(bmi):
    """Map a BMI value to its weight-class label.

    Returns 'Underweight' (< 18.5), 'Healthy weight' (< 25), 'Overweight'
    (< 30) or 'Obese' (everything else). A NaN input yields NaN.
    """
    if bmi != bmi:  # only NaN is unequal to itself
        return np.nan

    # (exclusive upper bound, label), checked from the lowest class upwards
    classes = ((18.5, 'Underweight'), (25, 'Healthy weight'), (30, 'Overweight'))
    for upper_bound, name in classes:
        if bmi < upper_bound:
            return name
    return 'Obese'
    
# Weight class of every row, derived from its BMI.
bmi_labels = train_test["bmi"].apply(get_bmi_category)

# value_counts() sorts by frequency in descending order, so index[0] is the
# most common class; rows without a class receive it.
bmi_labels = bmi_labels.fillna(bmi_labels.value_counts().index[0])

# Store the class as an ordered integer code instead of its text label. This
# column is created after the string columns were encoded, so a text label
# would survive untouched and make the later cast of the split frames to
# float64 fail ("could not convert string to float").
bmi_codes = {'Underweight': 1, 'Healthy weight': 2, 'Overweight': 3, 'Obese': 4}
train_test["bmi_cat"] = bmi_labels.map(bmi_codes)

def get_blood_pressure_category(sysbp, diasbp):
    """Return a blood-pressure class code (1-5) for a systolic/diastolic pair.

    A class applies only when BOTH readings are below its limits; the first
    matching class, checked from lowest to highest, wins:
        1 Low blood pressure   (sys <  90 and dia <  60)
        2 Normal               (sys < 120 and dia <  80)
        3 Pre-Hypertension     (sys < 140 and dia <  90)
        4 Stage 1 Hypertension (sys < 160 and dia < 100)
        5 Stage 2 Hypertension (anything else)
    """
    # (systolic limit, diastolic limit, class code)
    stages = ((90, 60, 1), (120, 80, 2), (140, 90, 3), (160, 100, 4))
    for sys_limit, dias_limit, code in stages:
        if (sysbp < sys_limit) & (diasbp < dias_limit):
            return code
    return 5

# Blood-pressure class of every row, from its two d1_*_max pressure columns.
bp_codes = [
    get_blood_pressure_category(systolic, diastolic)
    for systolic, diastolic in zip(train_test['d1_sysbp_max'], train_test['d1_diasbp_max'])
]
train_test['bp_cat'] = bp_codes

# Any row left without a class receives the most common one.
most_common_bp = train_test["bp_cat"].value_counts().index[0]
train_test["bp_cat"] = train_test["bp_cat"].fillna(most_common_bp)


In [None]:
# Separate the combined frame back into training and unlabelled rows using the
# 'label' marker, then remove the marker.
#
# Each frame is built in one chain that returns a new object, so nothing is
# modified in place on a filtered slice of train_test.
# reset_index(drop=True) renumbers the rows WITHOUT keeping the old index;
# previously the old index was inserted as an extra 'index' column and ended
# up among the model features.
is_train = train_test['label'] == "train"
is_test = train_test['label'] == 'test'

train = train_test[is_train].drop(columns=['label']).reset_index(drop=True)
predict = train_test[is_test].drop(columns=['label']).reset_index(drop=True)

# NOTE(review): predict still contains a 'hospital_death' column whose values
# were produced by the most-frequent imputation, not observed - confirm that
# nothing downstream treats it as a real label.

# Every remaining column is expected to be numeric at this point.
train = train.astype('float64')
predict = predict.astype('float64')

print(train.shape)
print(predict.shape)

In [None]:
# Numeric variables (per the dictionary) that are still columns of the
# combined frame, in dictionary order.
num_feature = [
    col_name
    for col_name in numeric_cols
    if col_name in train_test.columns.to_list()
]

In [None]:
# Absolute pairwise correlation between the numeric features of the training set.
corr_matrix = train[num_feature].corr().abs()

# Keep only the strict upper triangle so each pair is looked at once and the
# diagonal (self-correlation) is ignored.
# The mask uses the builtin `bool`: the `np.bool` alias no longer exists in
# current NumPy releases and raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A feature is dropped when it correlates above 0.8 with any feature that
# precedes it in the matrix.
to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]

# Remove the same features from both sets.
train.drop(to_drop, inplace=True, axis=1)
predict.drop(to_drop, inplace=True, axis=1)

print(train.shape)
print(predict.shape)

In [None]:
from sklearn.decomposition import PCA

# Project the training features onto their first two principal components.
# The target column is left out of the projection: the resulting plot is
# coloured by 'hospital_death', so the target must not also be an input.
# random_state pins the result in case a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
components = pca.fit_transform(train.drop(columns=['hospital_death']))

In [None]:
import plotly.express as px

# Scatter of the two projected components (array columns 0 and 1), coloured
# by the hospital_death value of each training row.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train['hospital_death'],
    width=1000,
    height=1000,
)
fig.show()

In [None]:
from sklearn.ensemble import IsolationForest

# NOTE(review): contamination=0.5 asks for half of the rows to be flagged as
# outliers, and `train` still contains the target column here - confirm both
# are intended.
# random_state makes the flagged rows identical from run to run.
iso = IsolationForest(contamination=0.5, random_state=42)
yhat = iso.fit_predict(train)

# select all rows that are not outliers
mask = yhat != -1

# Number of rows kept by the mask. (len(mask) is always the total row count,
# whatever the detector decided.)
int(mask.sum())

In [None]:
from sklearn.covariance import EllipticEnvelope

# NOTE(review): contamination=0.5 asks for half of the rows to be flagged as
# outliers, and `train` still contains the target column here - confirm both
# are intended.
# random_state makes the flagged rows identical from run to run.
ee = EllipticEnvelope(contamination=0.5, random_state=42)
yhat = ee.fit_predict(train)

# Rows that were not flagged as outliers (-1).
mask = yhat != -1

# Number of rows kept by the mask. (len(mask) is always the total row count,
# whatever the detector decided.)
int(mask.sum())

In [None]:
# Share of training rows per hospital_death value.
train['hospital_death'].value_counts().div(len(train))

In [None]:
# Take the target out of the feature frame: pop() returns the column and
# removes it from `train` in the same step.
y_train = train.pop('hospital_death')

# Every remaining column that the dictionary does not declare as numeric is
# treated as categorical.
cat_feature = [
    col_name
    for col_name in train.columns.to_list()
    if col_name not in numeric_cols
]

In [None]:
from imblearn.over_sampling import SMOTENC

# Pass the categorical features as column positions: integer indices are
# accepted by every imblearn release, whereas column names are only
# understood by newer ones.
cat_positions = [train.columns.get_loc(name) for name in cat_feature]

# random_state makes the synthetic samples reproducible.
sm = SMOTENC(categorical_features=cat_positions, random_state=42)
train_res, y_res = sm.fit_resample(train, y_train)

# Class shares before and after oversampling, side by side. (The previous
# final line only re-displayed the balance of the un-resampled target.)
resampled_target = pd.Series(y_res)
pd.DataFrame({
    'before': y_train.value_counts() / len(y_train),
    'after': resampled_target.value_counts() / len(resampled_target),
})