In [None]:
import sys
import pandas as pd
import numpy as np

import seaborn as sns

#from scipy.stats import entropy
from matplotlib import pyplot as plt
%matplotlib inline

from scipy.stats import skew
from sklearn.model_selection import  KFold , GridSearchCV, train_test_split
from sklearn.ensemble import  RandomForestClassifier
import json
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.metrics import confusion_matrix, log_loss, make_scorer, accuracy_score, f1_score

from sklearn.preprocessing import scale

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
def getColumnsDataTypesForH2o(df_test):
    """
       This function gets pandas dataframe
       column types and returns
       a dictionary with keys as
       dataframe column names and values
       as corresponding h2o dataframe column type.

       Mapping (by dtype family, not by exact dtype name):
           bool                         -> 'factor'
           any integer or float width   -> 'float'
           object / string / category   -> 'factor'
       Any other dtype (e.g. datetime) is returned unchanged so the
       caller can decide how to handle it.

       Args:
           df_test: pandas dataframe
       Returns:
           col_types: dictionary
    """
    dtype_checks = pd.api.types
    col_types = {}
    for column, dtype in df_test.dtypes.items():
        # bool is tested first: it must become a factor, not a number.
        if dtype_checks.is_bool_dtype(dtype):
            col_types[column] = 'factor'
        elif dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
            # Covers int32/float32/unsigned/nullable widths, not just the 64-bit ones.
            col_types[column] = 'float'
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
        ):
            col_types[column] = 'factor'
        else:
            # Unrecognised dtype: leave the pandas dtype in place.
            col_types[column] = dtype

    return col_types

In [None]:
# Training inputs: feature matrix plus the scored and non-scored target tables.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')

# Inference inputs: test feature matrix and the sample submission file.
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# Columns that get label-encoded before modelling.
catList = ['cp_type', 'cp_dose']

# Every other train_features column except the row id 'sig_id' (this keeps
# cp_time, which later cells use to split the data).
# The columns are taken in file order rather than via a set difference: set
# iteration order for strings can change between kernel runs, which would
# change the feature order handed to the seeded models.
nonCountColumns = set(catList) | {'sig_id'}
countList = [column for column in train_features.columns if column not in nonCountColumns]

In [None]:
# A single encoder object is reused; it is refitted for each categorical column.
lb = LabelEncoder()

for catCol in catList:
    # Learn the class -> integer mapping from the training values only ...
    lb.fit(train_features[catCol])
    # ... then apply that same mapping to both train and test.
    train_features[catCol] = lb.transform(train_features[catCol])
    test_features[catCol] = lb.transform(test_features[catCol])

In [None]:
# Initialise h2o before the later cells build H2OFrame objects and train
# H2ORandomForestEstimator models.
h2o.init()

In [None]:
# Target names: every column of the scored-targets table after the first one.
labelList = train_targets_scored.columns[1:].tolist()

# Cast all target columns to strings in a single call; the other columns are
# left untouched.
train_targets_scored = train_targets_scored.astype(dict.fromkeys(labelList, 'str'))

# Features and targets side by side (column-wise concat, aligned on the row index).
train_features_all = pd.concat(
    [train_features[countList + catList], train_targets_scored[labelList]],
    axis=1,
)



In [None]:
# Display the distinct dtypes present in train_features (inspected after the
# label-encoding cell has rewritten the catList columns).
set(train_features.dtypes.values)

In [None]:
# Split train and test by treatment duration (cp_time is 24, 48 or 72 in the
# filters below). Each subset is an explicit copy: the test subsets receive new
# prediction columns later, and writing into a bare .loc selection raises
# SettingWithCopyWarning. The original row index is intentionally kept.
train_features_all_24 = train_features_all.loc[train_features_all.cp_time == 24].copy()

train_features_all_48 = train_features_all.loc[train_features_all.cp_time == 48].copy()

train_features_all_72 = train_features_all.loc[train_features_all.cp_time == 72].copy()


test_features_24 = test_features.loc[test_features.cp_time == 24].copy()

test_features_48 = test_features.loc[test_features.cp_time == 48].copy()

test_features_72 = test_features.loc[test_features.cp_time == 72].copy()

In [None]:

h2o_train_data_24 = h2o.H2OFrame(train_features_all, column_types=getColumnsDataTypesForH2o(train_features_all_24))
h2o_test_data_24 = h2o.H2OFrame(test_features[countList + catList],  column_types=getColumnsDataTypesForH2o(test_features_24[countList + catList]))

h2o_train_data_48 = h2o.H2OFrame(train_features_all, column_types=getColumnsDataTypesForH2o(train_features_all_48))
h2o_test_data_48 = h2o.H2OFrame(test_features[countList + catList],  column_types=getColumnsDataTypesForH2o(test_features_48[countList + catList]))

h2o_train_data_72 = h2o.H2OFrame(train_features_all, column_types=getColumnsDataTypesForH2o(train_features_all_72))
h2o_test_data_72 = h2o.H2OFrame(test_features[countList + catList],  column_types=getColumnsDataTypesForH2o(test_features_72[countList + catList]))

training_columns = countList + catList

In [None]:
#mask = test_features.cp_type == test_features.cp_type.value_counts().index[-1]

for label in labelList: 
    

    rf = H2ORandomForestEstimator(ntrees=30,
                                     max_depth=15,
                                     nfolds=2,
                                     seed=1234)
    # Train model
    rf.train(x=training_columns,
             y=label,
             training_frame=h2o_train_data_24)
        
    test_features_24[label] =  rf.predict(test_data=h2o_test_data_24).as_data_frame()['p1'] 
    
    
    rf = H2ORandomForestEstimator(ntrees=30,
                                     max_depth=15,
                                     nfolds=2,
                                     seed=1234)
    # Train model
    rf.train(x=training_columns,
             y=label,
             training_frame=h2o_train_data_48)
        
    test_features_48[label] =  rf.predict(test_data=h2o_test_data_48).as_data_frame()['p1'] 
    
    rf = H2ORandomForestEstimator(ntrees=30,
                                     max_depth=15,
                                     nfolds=2,
                                     seed=1234)
    # Train model
    rf.train(x=training_columns,
             y=label,
             training_frame=h2o_train_data_72)
        
    test_features_72[label] =  rf.predict(test_data=h2o_test_data_72).as_data_frame()['p1'] 

    #test_features.loc[mask][label] = 0
    
    print('label:', label)
    

## Saving predictions

In [None]:
 # Stack the three per-cp_time test frames row-wise, in 24 / 48 / 72 order.
 test_features_all = pd.concat(
     objs=[test_features_24, test_features_48, test_features_72],
     axis=0,
 )

In [None]:
# Write sig_id plus one column per label. test_features_all is grouped by
# cp_time, so rows are put back into the order of the sample submission first.
# .loc with an explicit id list raises KeyError if any expected sig_id has no
# prediction row, instead of silently writing an incomplete file.
submission = (
    test_features_all.set_index('sig_id')[labelList]
    .loc[sub['sig_id'].tolist()]
    .rename_axis('sig_id')
    .reset_index()
)
submission.to_csv('submission.csv', index=False)