In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory, one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install --upgrade pip

In [None]:
#!pip install dabl

In [None]:
import dabl
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, classification_report

import gc #python garbage collection
import matplotlib.pyplot as plt
import seaborn as sns

# Widen pandas' display limits so wide frames and long cell values are not truncated,
# and print floats with thousands separators and two decimals.
_DISPLAY_OPTIONS = {
    'display.max_columns': 300,
    'display.max_rows': 200,
    'display.width': 1000,
    'display.max_colwidth': None,
    'display.float_format': '{:20,.2f}'.format,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Seed NumPy's global random generator.
np.random.seed(123)

In [None]:
# Name of the label column in the training data.
TARGET_COL = 'diabetes_mellitus'

# Labelled rows used for fitting, and the unlabeled rows that will be scored.
train_csv = "/kaggle/input/widsdatathon2021/TrainingWiDS2021.csv"
unlabeled_csv = "/kaggle/input/widsdatathon2021/UnlabeledWiDS2021.csv"
df = pd.read_csv(train_csv)
test = pd.read_csv(unlabeled_csv)

# Report (rows, columns) of both frames.
for caption, frame in (('The size of training set is: ', df),
                       ('The size of the test set is: ', test)):
    print(caption, frame.shape)

In [None]:
# Second, untouched read of the unlabeled data: `test` has columns dropped later on,
# while this copy supplies the 'encounter_id' column attached to the predictions for the submission.
test_full = pd.read_csv("/kaggle/input/widsdatathon2021/UnlabeledWiDS2021.csv")

In [None]:
df.head(5)

In [None]:
#get data info
df[TARGET_COL].value_counts()

In [None]:
# Summary statistics from describe(), transposed so that each feature is one row.
df.describe().transpose()

In [None]:
#quick data profiling
#train_profile = ProfileReport(df, 'EDA')
#train_profile
#output the profiling file
#train_profile.to_file("output.html")

In [None]:
"""
choose columns to drop before the importance selection, based on the nature of the data
readmission_status:unique value to drop
hospital_id
icu_id


"""
# Columns removed from both frames before the importance-based selection.
# 'Unnamed: 0' is presumably the row index written into the CSV — TODO confirm.
cols_drop = [
    'readmission_status',
    'hospital_id',
    'icu_id',
    'Unnamed: 0',
]

In [None]:
# Remove the id-like / single-valued columns listed in cols_drop from both frames.
# The drop mutates each frame in place, so re-running this cell raises KeyError.
for frame in (df, test):
    frame.drop(columns=cols_drop, inplace=True)

* Find feature importance <br>
Based on the data profiling, we noticed many missing values in multiple columns, so we will use a random forest to estimate feature importance.<br>
Then drop the unimportant features and impute the remaining missing values.

In [None]:
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
h2o.init()

In [None]:
h2o_df = h2o.H2OFrame(df)

In [None]:
#h2o_df.head(5)

In [None]:
# Define model: a random forest with 10-fold cross-validation.
# A fixed seed keeps the resulting importance ranking reproducible between runs.
model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10, seed=123)

# Cast the target to a categorical column so H2O fits a classifier rather than a
# regressor (assumes the label is parsed as a numeric 0/1 column — TODO confirm).
# This matches the asfactor() call made before the AutoML run further down.
h2o_df[TARGET_COL] = h2o_df[TARGET_COL].asfactor()

# Train model. No predictor list (x) is passed, so H2O uses every column except the target.
model.train(y=TARGET_COL, training_frame=h2o_df)

In [None]:
#get feature importance
importance_df = model.varimp(use_pandas=True)

In [None]:
# Read the feature importance table from a CSV file.
# NOTE(review): this reassigns importance_df and therefore discards the table obtained
# from model.varimp(). Presumably the file is a saved copy of such a table, but it is
# named DataDictionaryWiDS2021.csv — confirm it really holds importances.
importance_df = pd.read_csv('/kaggle/input/featureimportance/DataDictionaryWiDS2021.csv')

In [None]:
importance_df.head(3)

In [None]:
# Names of the features whose share of the total importance is exactly zero.
# The names are taken from the 'variable' column: 'relative_importance' holds the numeric
# scores, which are not valid column labels for the drop that follows.
# Assumes the table uses H2O's varimp layout
# (variable, relative_importance, scaled_importance, percentage) — TODO confirm for the CSV.
feature_to_drop = importance_df.loc[importance_df['percentage'] == 0, 'variable'].tolist()

In [None]:
# Discard the features listed in feature_to_drop from the training and the unlabeled frame alike.
# Both frames are modified in place.
for frame in (df, test):
    frame.drop(columns=feature_to_drop, inplace=True)

In [None]:
print(df.shape)
print(test.shape)

In [None]:
# Separate the label (as an independent copy) from the predictor columns.
y_train = df[TARGET_COL].copy()
X_train = df.drop(columns=[TARGET_COL])

In [None]:
y_train.value_counts()

* Check missing values

In [None]:
# Fraction of missing values in each predictor column that is left after the drops above.
missing_pct = X_train.isnull().mean()
missing_pct

In [None]:
# Names of the columns in which at least 25% of the values are missing.
high_missing = missing_pct[missing_pct >= 0.25]
missing_cols = high_missing.index.tolist()

In [None]:
# Categorical (non-numeric) columns among the high-missing ones that take more than one value.
# is_numeric_dtype replaces the former `dtype != np.number` / `dtype != int` comparisons:
# whether a concrete dtype such as float64 compares equal to the abstract np.number
# depends on the NumPy version, so that test could classify numeric columns as categorical.
missing_cat = [
    c for c in missing_cols
    if X_train[c].nunique() > 1 and not pd.api.types.is_numeric_dtype(X_train[c])
]
missing_cat

In [None]:
def categorical_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', verbose=True):
    '''
    Print quick statistics for one categorical column and draw its count plot.

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. column whose labels go on the horizontal axis, y would be the count
    y: str. column whose labels go on the vertical axis, x would be the count
       (when both x and y are given, the printed summary describes x)
    hue: str. if you want to compare it another variable (usually the target variable)
    palette: array-like. Colour of the plot
    verbose: bool. when True, also print a separator line and the value counts
    Returns
    =======
    None. The statistics are printed and the count plot is shown.
    Raises
    ======
    ValueError: if neither x nor y names a column
    '''
    # Fail early with a clear message instead of the KeyError that dataframe[None] would raise.
    if x is None and y is None:
        raise ValueError('categorical_summarized needs a column name in either x or y')

    column_interested = y if x is None else x
    series = dataframe[column_interested]
    print(series.describe())
    print('mode: ', series.mode())
    if verbose:
        print('='*80)
        print(series.value_counts())

    sns.countplot(x=x, y=y, hue=hue, data=dataframe, palette=palette)
    plt.show()

In [None]:
# Numeric columns among the high-missing ones that take more than one value.
# is_numeric_dtype replaces `dtype == np.number`: whether a concrete dtype such as float64
# compares equal to the abstract np.number depends on the NumPy version, so the old
# test could silently return an empty list.
missing_num = [
    c for c in missing_cols
    if X_train[c].nunique() > 1 and pd.api.types.is_numeric_dtype(X_train[c])
]
missing_num

In [None]:
#missing_num.append(TARGET_COL)
#missing_num

In [None]:
#missing_corr = df[missing_num].corr(method='pearson')

In [None]:
#for some features, even the corr is 0, but feature importance is high, so can't use corr
#missing_corr[TARGET_COL].sort_values(ascending = False)

In [None]:
len(missing_cols)

In [None]:
"""
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', size = 15)
colormap = sns.diverging_palette(10, 220, as_cmap = True)
sns.heatmap(missing_corr,
            cmap = colormap,
            square = True,
            annot = True,
            linewidths=0.1,vmax=1.0, linecolor='white',
            annot_kws={'fontsize':12 })
plt.show()
"""

* Missing Data Imputation

In [None]:
#MICE imputation
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.imputation import mice
from statsmodels.imputation.bayes_mi import BayesGaussMI, MI

https://www.kaggle.com/residentmario/simple-techniques-for-missing-data-imputation

In [None]:
#imp_data = mice.MICEData(df)

https://github.com/shineloveyc/fancyimpute

In [None]:
"""Matrix completion by iterative soft thresholding of SVD decompositions. 
Inspired by the softImpute package for R, which is based on Spectral 
Regularization Algorithms for Learning Large Incomplete Matrices by Mazumder et.
al."""
from fancyimpute import SoftImpute, BiScaler,IterativeImputer

In [None]:
#X_train_normalized = BiScaler().fit_transform(X_train.values)
#X_train_complete_soft = SoftImpute().fit_transform(X_train_normalized)
X_train_complete_mice = IterativeImputer().fit_transform(X_train)

In [None]:
#impute test set
#X_test_normalized = BiScaler().fit_transform(test.values)
#X_test_complete_soft = SoftImpute().fit_transform(X_test_normalized)
# Fit the imputer on the training predictors and only transform the unlabeled rows, so
# both sets are filled in by the same model instead of one fitted on the test data itself.
# The imputer is fitted again here because the one used for the training set was not kept.
# Assumes `test` has the same columns, in the same order, as X_train — TODO confirm.
X_test_complete_mice = IterativeImputer().fit(X_train).transform(test)

In [None]:
X_test_df = pd.DataFrame(X_test_complete_mice, columns = test.columns)

* Build the baseline model

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
type(X_train_complete_mice)

In [None]:
train_df_h2o = pd.DataFrame(X_train_complete_mice, columns = X_train.columns)

In [None]:
train_df_h2o.shape

In [None]:
train_df_h2o.head(5)

In [None]:
#check missing value==>all imputed
train_df_h2o.info()

In [None]:
# Combine the imputed predictors with the target before converting to an H2O frame.
# The values are assigned by position: train_df_h2o was rebuilt from a bare array and so has
# a fresh 0..n-1 index, and index-aligned assignment would insert NaN targets if y_train's
# index ever differed. A length mismatch now raises instead of passing silently.
train_df_h2o[TARGET_COL] = y_train.to_numpy()

In [None]:
h2o_train_df = h2o.H2OFrame(train_df_h2o)

In [None]:
h2o_train_df[TARGET_COL] = h2o_train_df[TARGET_COL].asfactor()

In [None]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
# NOTE(review): whether a default runtime cap still applies once max_models is set
# may depend on the installed H2O version — confirm.
# A fixed seed is passed; no predictor list (x) is given, only the target column.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(y=TARGET_COL, training_frame=h2o_train_df)

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

* save model

There are two ways to save the leader model -- binary format and MOJO format. If you're taking your leader model to production, then we'd suggest the MOJO format since it's optimized for production use.

In [None]:
h2o.save_model(aml.leader, path = "./product_backorders_model_bin")

In [None]:
aml.leader.download_mojo(path = "./")

* Predict on the test set

In [None]:
# Get model ids for all models in the AutoML Leaderboard
# (first leaderboard column, converted to a plain Python list).
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
# Get the "All Models" Stacked Ensemble model.
# The [0] takes the first id containing "StackedEnsemble_AllModels" and raises IndexError
# if the leaderboard has no such model.
se = h2o.get_model([mid for mid in model_ids if "StackedEnsemble_AllModels" in mid][0])
# Get the Stacked Ensemble metalearner model
# NOTE(review): indexing with ['name'] assumes se.metalearner() returns a mapping —
# confirm against the installed H2O version. `metalearner` is not used in the cells visible below.
metalearner = h2o.get_model(se.metalearner()['name'])

In [None]:
h2o_test_df = h2o.H2OFrame(X_test_df)

In [None]:
predict = se.predict(h2o_test_df)

In [None]:
predict[:20]

In [None]:
predict_df = h2o.as_list(predict)

In [None]:
predict_df['encounter_id'] = test_full['encounter_id']

In [None]:
predict_df.head(5)

In [None]:
predict_df.rename(columns = {'p1':'diabetes_mellitus'}, inplace = True)

In [None]:
predict_df[['encounter_id', 'diabetes_mellitus']].to_csv('/kaggle/working/submission.csv', index=False)