In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from catboost import CatBoostClassifier

import os
# List every file available under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Show frames without truncating columns or rows in cell outputs.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

In [None]:
df = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
df.target.value_counts()/df.shape[0]

In [None]:
df.head()

## Look at the distribution at each group

In [None]:
df0 = df.loc[df.target==0,:]
df1 = df.loc[df.target==1,:]

In [None]:
df0.describe()

In [None]:
df1.describe()

Findings:

- the variable `f_28` has the biggest difference in the mean between label 0 and 1.
- the difference of mean of `f_20`, `f_21` and `f_25` for both groups is quite big too.
- the mean of `f_19`, `f_23`, `f_24`, and `f_26` is also a bit different for both groups.
- the means of `f_00`, `f_01`, `f_02`, and `f_22` have the same magnitude in both groups, but opposite signs.

In [None]:
# One KDE panel per column (positions 1..32 of df), split by target.
plt.figure(figsize=(16,12), dpi=300)
for i in range(1, 33):
    plt.subplot(8,4,i)
    # A density estimate needs numeric data, so non-numeric columns are skipped
    # explicitly instead of hiding every possible error behind a bare `except:`.
    if not pd.api.types.is_numeric_dtype(df.iloc[:,i]):
        print(f' variable {df.columns[i]} might be categorical. Continue')
        continue
    sns.kdeplot(data=df, x=df.iloc[:,i], hue='target', alpha=0.8);

Well... **the good news** is that all the numeric variables appear to be normally distributed. **The bad news** is that no single variable has non-overlapping distributions between the two groups. However, the sums of some variables separate the groups better, as shown below.

In [None]:
sns.kdeplot(data=df, x=df['f_02']+df['f_21'], hue='target');

In [None]:
sns.scatterplot(x=df['f_01'], y=df['f_28'], hue=df['target'], alpha=0.4);

In [None]:
# Groups of three features whose row-wise sums are plotted here; the same list is
# reused further down to build the engineered features f_34 ... f_41.
triplets = [['f_00', 'f_01', 'f_02'], ['f_06', 'f_07', 'f_08'],
            ['f_02', 'f_21', 'f_23'], ['f_11', 'f_12', 'f_13'],
            ['f_18', 'f_19', 'f_20'], ['f_21', 'f_22', 'f_23'],
            ['f_22', 'f_23', 'f_24'], ['f_23', 'f_24', 'f_25']]

plt.figure(figsize=(8,8), dpi=250)
for panel, triplet in enumerate(triplets, start=1):
    plt.subplot(4,2,panel)
    sns.kdeplot(data=df, x=df[triplet].sum(axis=1), hue='target', alpha=0.8)
    plt.title(f'the combination of {triplet}')
# Lay out the grid once, after all panels exist, rather than on every iteration.
plt.tight_layout();

# Split data

In [None]:
# Seeded shuffle of the row positions: the first 720000 form the training set,
# the remainder the validation set.
rng = np.random.default_rng(99)
n = np.arange(df.shape[0])
rng.shuffle(n)
trainid, validid = n[:720000], n[720000:]
len(trainid)

In [None]:
df.target[trainid].value_counts()/len(trainid) #to make sure the class weight has not changed

# Variable `f_27`

In [None]:
f27sort = df.loc[:, ['f_27', 'target']].sort_values(by='f_27')
f27sort.head()

In [None]:
# For each capital letter, count the f_27 strings that contain it at least once.
x = [chr(i) for i in range(65,91)]  # 'A' .. 'Z'
# The vectorised .sum() on the boolean mask replaces a Python-level sum() over every
# row; regex=False matches each letter as a literal substring.
y = [f27sort.f_27.str.contains(char, regex=False).sum() for char in x]
plt.figure(dpi=75)
plt.bar(x, y);

The most frequent letters used in `f_27` are **B**, **A**, then C through T (in this order). Furthermore, only 20 letters appear in this feature. Perhaps the letters represent a department or a section in a manufacturing process; if that's the case, then we have 20 departments.

In [None]:
# Number of f_27 strings that begin with each capital letter.
for char in map(chr, range(65, 91)):
    # Vectorised count of the boolean mask instead of a Python-level sum() over rows.
    n_starting = f27sort.f_27.str.startswith(char).sum()
    print(f'there are {n_starting} observations starts with character {char}')

In [None]:
#for char in [chr(i) for i in range(65, 91)]:
#    print(f"there are {sum(f27sort.f_27.str.startswith(f'A{char}'))} observations starts with character A{char}")

In [None]:
#for char in [chr(i) for i in range(65, 91)]:
#    print(f"there are {sum(f27sort.f_27.str.startswith(f'B{char}'))} observations starts with character B{char}")

In [None]:
del f27sort

# Preprocessing
## Tokenize `f_27` (unigram)
I tried using bigram and trigram, but they didn't improve the score a lot.

In [None]:
# Character-level tokenizer for f_27: with char_level=True each character is a token.
tokenizer_uni = Tokenizer(char_level=True, split='') 
# Learn the character index from the training strings; the same fitted tokenizer is
# reused later to encode the test set.
tokenizer_uni.fit_on_texts(df.f_27)

In [None]:
tokenizer_uni.word_index

In [None]:
df_letters = pd.DataFrame(tokenizer_uni.texts_to_sequences(df.f_27))
df_letters.shape

In [None]:
# Name the position columns f_270 ... f_279 (position i of f_27 -> 'f_27{i}').
position_names = {pos: f'f_27{pos}' for pos in range(10)}
df_letters.rename(columns=position_names, inplace=True)
df_letters.head(3)

## Bag of Letters

In [None]:
countvec = CountVectorizer(analyzer='char')
bol = countvec.fit_transform(df.f_27)

In [None]:
bol = bol.toarray()
bol.shape

In [None]:
# Name each count column after the character CountVectorizer actually assigned to it,
# rather than assuming the vocabulary is exactly the 20 letters A..T in that order.
# `vocabulary_` maps every character to its column index in the count matrix.
bol_colnames = [f'count_{char.upper()}'
                for char in sorted(countvec.vocabulary_, key=countvec.vocabulary_.get)]
bol = pd.DataFrame(bol, columns=bol_colnames)
bol.head(3)

## Total unique letters
Add a new feature that counts the number of unique letters occurring in `f_27`.

In [None]:
df['f_31'] = df_letters.nunique(axis=1)
#df.head(3)

## More new features
`f_32` : the addition of `f_21` and `f_02` <br>
`f_33` : the addition of `f_01` and `f_28`

Furthermore, eight more features, each of which is an addition of selected three variables, are added.

In [None]:
df['f_32'] = df['f_21'] + df['f_02']
df['f_33'] = df['f_01'] + df['f_28']

In [None]:
for i in range(8):
    df[f'f_{34+i}'] = df[triplets[i]].sum(axis=1)

In [None]:
df.shape

# Modeling using `df0`
Even though `df0` was a subset of the original train data labeled 0, `df0` is now the concatenation of `df_letters`, `df`, and `bol`. We have three base models fitted on this dataset.

In [None]:
# Combine letter tokens, letter counts and the original/engineered features.
# Sorting the column names alphabetically leaves 'target' as the last column, which
# the `[:, :-1]` / `[:, -1]` slices in the modelling cells rely on.
df0 = pd.concat([df_letters, bol, df], axis=1)
cols = sorted(df0.columns.tolist())
df0 = df0[cols].drop(['f_27', 'id'], axis=1)
df0.head(3)

In [None]:
df0.shape

## GBM

In [None]:
model = lgbm.LGBMClassifier(n_estimators=1500, random_state=9)
model.fit(df0.iloc[trainid,:-1], df0.iloc[trainid, -1],
            eval_set=[(df0.iloc[validid,:-1], df0.iloc[validid, -1])],
            eval_metric='AUC')

In [None]:
roc_auc_score(df.target[trainid], model.predict_proba(df0.iloc[trainid,:-1])[:,1])

In [None]:
model.booster_.save_model('baseLGBM.txt')

prediction with LGBM base:

In [None]:
pred_g_v = model.predict_proba(df0.iloc[validid,:-1])[:,1]

## Catboost

In [None]:
model = CatBoostClassifier(iterations=1500, random_seed=8, verbose=True, eval_metric='AUC')
model.fit(df0.iloc[trainid,:-1], df0.iloc[trainid, -1],
          eval_set=(df0.iloc[validid,:-1], df0.iloc[validid,-1]))

In [None]:
roc_auc_score(df0.target[trainid], model.predict_proba(df0.iloc[trainid,:-1])[:,1])

In [None]:
model.save_model('cat_model')

predicting using this base only:

In [None]:
pred_c_v = model.predict_proba(df0.iloc[validid,:-1])[:,1]

## Neural Network (Sequential)
just a basic neural network with 142 neurons in one hidden layer.

In [None]:
# Fix TensorFlow's global seed before any layer is created.
tf.random.set_seed(95)
# Feed-forward net: batch normalisation -> one sigmoid hidden layer -> sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(),
    # Hidden width is twice the number of df0 feature columns (all columns but the last).
    tf.keras.layers.Dense((df0.shape[1]-1)*2, activation='sigmoid'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
# Train on the trainid rows and validate on the validid rows; the last df0 column is the label.
# NOTE(review): Keras documents `workers` / `use_multiprocessing` for generator or
# Sequence inputs - they presumably have no effect on tensor inputs; confirm.
history = model.fit(x=tf.convert_to_tensor(df0.iloc[trainid,:-1]), y=tf.convert_to_tensor(df0.iloc[trainid,-1]), 
                    validation_data=(tf.convert_to_tensor(df0.iloc[validid,:-1]), tf.convert_to_tensor(df0.iloc[validid,-1])), 
                    epochs=70, batch_size=400, workers=3, use_multiprocessing=True)

In [None]:
# Learning curves from the last fit: loss first, then AUC, each on its own figure.
for train_key, train_label, valid_key, valid_label in (
        ('loss', 'train loss', 'val_loss', 'valid loss'),
        ('auc', 'AUC', 'val_auc', 'valid AUC')):
    plt.figure(dpi=200)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[valid_key], label=valid_label)
    plt.legend();

In [None]:
model.save('nn_model.h5')

prediction using this base only:

In [None]:
pred_nn_v = model.predict(tf.convert_to_tensor(df0.iloc[validid,:-1]))[:,0]

# Modeling using `df1`
`df1` used to be the subset of the original train dataset whose label is 1, but it is now the concatenation of `df` and `bol` only. The models trained in this section take `df_letters` and `df1` as separate inputs.

In [None]:
# Numeric/engineered features plus letter counts (the letter tokens stay in df_letters).
# Alphabetical column order leaves 'target' last, so `[:, :-1]` selects the features.
df1 = pd.concat([df, bol], axis=1)
cols = sorted(df1.columns.tolist())
df1 = df1[cols].drop(['f_27', 'id'], axis=1)
df1.head(3)

In [None]:
n = df1.shape[1]-1
n

In [None]:
def pred_dnn_mod(model_, letters=None, numeric_and_bol=None):
    '''Predict with a two-input DNN model and return its scores as a 1-D array.

    Parameters
    ----------
    model_ : model exposing `predict` and taking the named inputs
        'letters' and 'numeric_and_bol'.
    letters : optional array-like of tokenised f_27 letters. Defaults to the
        validation rows (`validid`) of the global `df_letters`.
    numeric_and_bol : optional array-like of numeric and bag-of-letters features.
        Defaults to the validation rows of the global `df1` without its last column.

    Returns
    -------
    Column 0 of `model_.predict(...)`.
    '''
    # Fall back to the validation slices so existing `pred_dnn_mod(model)` calls
    # behave exactly as before; explicit inputs avoid depending on globals that
    # are overwritten later in the notebook.
    if letters is None:
        letters = df_letters.iloc[validid,:]
    if numeric_and_bol is None:
        numeric_and_bol = df1.iloc[validid,:-1]
    inputs = {'letters': tf.convert_to_tensor(letters),
              'numeric_and_bol': tf.convert_to_tensor(numeric_and_bol)}
    return model_.predict(inputs)[:,0]

## CNN and LSTM
My intuition is that CNNs can recognize shapes in images, and I noticed that many sequences share the same letters in their first five positions. Thus, I want to use a CNN to recognize patterns in the string sequence. I try three variations of this architecture; in particular, I vary the filter size, the strides, and the padding.

In [None]:
# Two-input model 'cl_model': the tokenised f_27 letters pass through four parallel
# Conv1D -> LSTM -> global-max-pool branches; their concatenation is merged with the
# numeric / bag-of-letters features before the final sigmoid output.
from tensorflow.keras.layers import Bidirectional
tf.random.set_seed(99)
letters_input = tf.keras.Input(shape=(10,), name='letters')
# Add a trailing axis so the Conv1D layers receive a 3-D tensor.
letters_input1 = tf.expand_dims(letters_input, axis=-1)
# `n` is the number of df1 feature columns (df1.shape[1]-1).
other_input = tf.keras.Input(shape=(n,), name='numeric_and_bol')

# Branch 1: kernel 5 / stride 5, followed by a bidirectional LSTM.
# NOTE(review): input_shape is presumably redundant when the layer is called on a tensor - confirm.
x1 = tf.keras.layers.Conv1D(16, 5, strides=5, padding='same', input_shape=(None, 10, 1))(letters_input1)
x1 = Bidirectional(tf.keras.layers.LSTM(5, return_sequences=True))(x1)
x1 = tf.keras.layers.GlobalMaxPool1D()(x1)

# Branch 2: kernel 2 / stride 2.
x2 = tf.keras.layers.Conv1D(16, 2, strides=2, padding='same')(letters_input1)
x2 = tf.keras.layers.LSTM(9, return_sequences=True)(x2)
x2 = tf.keras.layers.GlobalMaxPool1D()(x2)

# Branch 3: kernel 3 / stride 3.
x3 = tf.keras.layers.Conv1D(16, 3, strides=3, padding='same')(letters_input1)
x3 = tf.keras.layers.LSTM(8, return_sequences=True)(x3)
x3 = tf.keras.layers.GlobalMaxPool1D()(x3)

# Branch 4: kernel 4 / stride 4.
x4 = tf.keras.layers.Conv1D(16, 4, strides=4, padding='same')(letters_input1)
x4 = tf.keras.layers.LSTM(7, return_sequences=True)(x4)
x4 = tf.keras.layers.GlobalMaxPool1D()(x4)

# Merge the four letter branches and compress them with a dense layer.
concat1 = tf.keras.layers.concatenate([x1, x2, x3, x4])
concat1 = tf.keras.layers.Dense(80, activation='sigmoid')(concat1)

# Join the letter representation with the numeric / bag-of-letters input.
concat = tf.keras.layers.concatenate([concat1, other_input])
x = tf.keras.layers.BatchNormalization()(concat)
x = tf.keras.layers.Dense(128, activation='sigmoid')(x)
final = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[letters_input, other_input], outputs=final, name='cl_model')
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
tf.keras.utils.plot_model(model, 'cnn_lstm.png', show_shapes=True)

In [None]:
history = model.fit({'letters':tf.convert_to_tensor(df_letters.iloc[trainid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[trainid,:-1])}, 
                    tf.convert_to_tensor(df1.target[trainid]), epochs=30, batch_size=300, workers=3, use_multiprocessing=True,
                    validation_data=({'letters':tf.convert_to_tensor(df_letters.iloc[validid,:]), 
                                      'numeric_and_bol':tf.convert_to_tensor(df1.iloc[validid,:-1])}, 
                                     tf.convert_to_tensor(df1.target[validid])))

In [None]:
model.save('CNN_LSTM.h5')

prediction with this base model:

In [None]:
pred_cl_v = pred_dnn_mod(model)

## CNN (filter size = 7) and LSTM

In [None]:
# Two-input model 'cl7_model': three Conv1D (kernel size 7) -> LSTM -> global-max-pool
# branches over the letters, merged with the numeric / bag-of-letters features.
from tensorflow.keras.layers import Bidirectional
tf.random.set_seed(99)
letters_input = tf.keras.Input(shape=(10,), name='letters')
# Add a trailing axis so the Conv1D layers receive a 3-D tensor.
letters_input1 = tf.expand_dims(letters_input, axis=-1)
other_input = tf.keras.Input(shape=(n,), name='numeric_and_bol')

# Branch 1: stride 1 with 'same' padding, followed by a bidirectional LSTM.
x1 = tf.keras.layers.Conv1D(16, 7, strides=1, padding='same', input_shape=(None, 10, 1))(letters_input1)
x1 = Bidirectional(tf.keras.layers.LSTM(10, return_sequences=True))(x1)
x1 = tf.keras.layers.GlobalMaxPool1D()(x1)

# Branch 2: stride 2 with 'same' padding.
x2 = tf.keras.layers.Conv1D(16, 7, strides=2, padding='same')(letters_input1)
x2 = tf.keras.layers.LSTM(10, return_sequences=True)(x2)
x2 = tf.keras.layers.GlobalMaxPool1D()(x2)

# Branch 3: stride 1 with the layer's default padding.
x3 = tf.keras.layers.Conv1D(16, 7, strides=1)(letters_input1)
x3 = tf.keras.layers.LSTM(4, return_sequences=True)(x3)
x3 = tf.keras.layers.GlobalMaxPool1D()(x3)

# Merge the letter branches and compress them with a dense layer.
concat1 = tf.keras.layers.concatenate([x1, x2, x3])
concat1 = tf.keras.layers.Dense(80, activation='sigmoid')(concat1)

# Join the letter representation with the numeric / bag-of-letters input.
concat = tf.keras.layers.concatenate([concat1, other_input])
x = tf.keras.layers.BatchNormalization()(concat)
x = tf.keras.layers.Dense(128, activation='sigmoid')(x)
final = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[letters_input, other_input], outputs=final, name='cl7_model')
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
tf.keras.utils.plot_model(model, 'cnn7_lstm.png', show_shapes=True)

In [None]:
history = model.fit({'letters':tf.convert_to_tensor(df_letters.iloc[trainid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[trainid,:-1])}, 
                    tf.convert_to_tensor(df1.target[trainid]), epochs=30, batch_size=300, workers=3, use_multiprocessing=True,
                    validation_data=({'letters':tf.convert_to_tensor(df_letters.iloc[validid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[validid,:-1])}, 
                                     tf.convert_to_tensor(df1.target[validid])))

In [None]:
model.save('cnn7_lstm.h5')

In [None]:
pred_cl7_v = pred_dnn_mod(model)

## CNN (filter size = 6) and LSTM
I also tweak the architecture here a little bit: I concatenate all the string processing with the numeric input, then put them as an input to the next layer altogether.

In [None]:
# Two-input model 'cl6_model': three Conv1D (kernel size 6) -> LSTM -> global-max-pool
# branches over the letters. Unlike the previous variants, the branch outputs are
# concatenated directly with the numeric input (no intermediate dense layer).
from tensorflow.keras.layers import Bidirectional
tf.random.set_seed(99)
letters_input = tf.keras.Input(shape=(10,), name='letters')
# Add a trailing axis so the Conv1D layers receive a 3-D tensor.
letters_input1 = tf.expand_dims(letters_input, axis=-1)
other_input = tf.keras.Input(shape=(n,), name='numeric_and_bol')

# Branch 1: stride 1 with 'same' padding, followed by a bidirectional LSTM.
x1 = tf.keras.layers.Conv1D(16, 6, strides=1, padding='same', input_shape=(None, 10, 1))(letters_input1)
x1 = Bidirectional(tf.keras.layers.LSTM(10, return_sequences=True))(x1)
x1 = tf.keras.layers.GlobalMaxPool1D()(x1)

# Branch 2: stride 2 with 'same' padding.
x2 = tf.keras.layers.Conv1D(16, 6, strides=2, padding='same')(letters_input1)
x2 = tf.keras.layers.LSTM(5, return_sequences=True)(x2)
x2 = tf.keras.layers.GlobalMaxPool1D()(x2)

# Branch 3: stride 1 with the layer's default padding.
x3 = tf.keras.layers.Conv1D(16, 6, strides=1)(letters_input1)
x3 = tf.keras.layers.LSTM(3, return_sequences=True)(x3)
x3 = tf.keras.layers.GlobalMaxPool1D()(x3)

# Letter branches and numeric features are merged in a single concatenation.
concat = tf.keras.layers.concatenate([x1, x2, x3, other_input])
x = tf.keras.layers.BatchNormalization()(concat)
x = tf.keras.layers.Dense(128, activation='sigmoid')(x)
final = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[letters_input, other_input], outputs=final, name='cl6_model')
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
tf.keras.utils.plot_model(model, 'cnn6_lstm.png', show_shapes=True)

In [None]:
history = model.fit({'letters':tf.convert_to_tensor(df_letters.iloc[trainid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[trainid,:-1])}, 
                    tf.convert_to_tensor(df1.target[trainid]), epochs=30, batch_size=300, workers=3, use_multiprocessing=True,
                    validation_data=({'letters':tf.convert_to_tensor(df_letters.iloc[validid,:]), 
                                      'numeric_and_bol':tf.convert_to_tensor(df1.iloc[validid,:-1])}, 
                                     tf.convert_to_tensor(df1.target[validid])))

In [None]:
model.save('cnn6_lstm.h5')

In [None]:
pred_cl6_v = pred_dnn_mod(model)

## CRAN
Another alternative: CNN and LSTM, but the CNN results are used as weights for an Attention layer. The hidden state from the LSTM is also used in the Attention layer. More details are available [here](https://www.researchgate.net/publication/322247966_A_Convolutional_Attention_Model_for_Text_Classification).

In [None]:
# CRAN-style model: a pooled Conv1D summary of the letters and the LSTM's final hidden
# state are combined by an Attention layer, reduced to a single unit, and merged with
# the numeric / bag-of-letters features. (Bidirectional is not used in this model, so
# its import is dropped from this cell.)
tf.random.set_seed(99)
letters_input = tf.keras.Input(shape=(10,), name='letters')
# Trailing axis added so both the Conv1D and the LSTM receive a 3-D tensor.
letters_input1 = tf.expand_dims(letters_input, axis=-1)
other_input = tf.keras.Input(shape=(n,), name='numeric_and_bol')

# Convolutional path: kernel 2 / stride 2, averaged over the sequence axis.
x1 = tf.keras.layers.Conv1D(16, 2, strides=2)(letters_input1)
x1 = tf.keras.layers.GlobalAveragePooling1D()(x1)

# Recurrent path: only the final hidden state (state_h) is used below.
x2, state_h, state_c = tf.keras.layers.LSTM(16, return_sequences=True, return_state=True)(letters_input1)
# NOTE(review): state_h and the pooled x1 both look rank-2 (batch, 16), while the Keras
# Attention layer documents 3-D inputs [batch, steps, dim]. Confirm that this does not
# compute attention scores across the batch axis (mixing samples within a batch).
s = tf.keras.layers.Attention()([state_h, x1])
midpoint = tf.keras.layers.Dense(1, activation='sigmoid')(s)

# Merge with the numeric input; the hidden layer is twice as wide as the merged vector.
concat = tf.keras.layers.concatenate([midpoint, other_input])
concat = tf.keras.layers.Dense(2*concat.shape[1], activation='sigmoid')(concat)
final = tf.keras.layers.Dense(1, activation='sigmoid')(concat)

model = tf.keras.Model(inputs=[letters_input, other_input], outputs=final, name='cran_model')
# metrics given as a list, consistent with every other model in this notebook.
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
tf.keras.utils.plot_model(model, 'cran.png', show_shapes=True)

In [None]:
history = model.fit({'letters':tf.convert_to_tensor(df_letters.iloc[trainid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[trainid,:-1])}, 
                    tf.convert_to_tensor(df1.target[trainid]), epochs=30, batch_size=200, workers=3,
                    validation_data=({'letters':tf.convert_to_tensor(df_letters.iloc[validid,:]), 
                                      'numeric_and_bol':tf.convert_to_tensor(df1.iloc[validid,:-1])}, 
                                     tf.convert_to_tensor(df1.target[validid])))

In [None]:
model.save('cran.h5')

pred_a_v = pred_dnn_mod(model)

In [None]:
print(pred_a_v[:5])

## A variation of CRAN
CRAN model again, but this time the filter size for convolutional is 5.

In [None]:
# CRAN variation 'cran5_model': same layout as the CRAN model, with a kernel-5 /
# stride-5 convolution and 10 units in both the convolution and the LSTM.
tf.random.set_seed(99)
letters_input = tf.keras.Input(shape=(10,), name='letters')
# Trailing axis added so both the Conv1D and the LSTM receive a 3-D tensor.
letters_input1 = tf.expand_dims(letters_input, axis=-1)
other_input = tf.keras.Input(shape=(n,), name='numeric_and_bol')

# Convolutional path, averaged over the sequence axis.
x1 = tf.keras.layers.Conv1D(10, 5, strides=5, padding='same')(letters_input1)
x1 = tf.keras.layers.GlobalAveragePooling1D()(x1)

# Recurrent path: only the final hidden state (state_h) is used below.
x2, state_h, state_c = tf.keras.layers.LSTM(10, return_sequences=True, return_state=True)(letters_input1)
# NOTE(review): state_h and the pooled x1 both look rank-2 (batch, 10), while the Keras
# Attention layer documents 3-D inputs [batch, steps, dim]. Confirm that this does not
# compute attention scores across the batch axis (mixing samples within a batch).
s = tf.keras.layers.Attention()([state_h, x1])
midpoint = tf.keras.layers.Dense(1, activation='sigmoid')(s)

concat = tf.keras.layers.concatenate([midpoint, other_input])
concat = tf.keras.layers.Dense(102, activation='sigmoid')(concat)
final = tf.keras.layers.Dense(1, activation='sigmoid')(concat)

model = tf.keras.Model(inputs=[letters_input, other_input], outputs=final, name='cran5_model')
# metrics given as a list, consistent with the other models in this notebook.
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
tf.keras.utils.plot_model(model, 'cran5.png', show_shapes=True)

In [None]:
# Train on the trainid rows and validate on the validid rows. The validation target is
# converted to a tensor like every other input, matching the other fit calls.
history = model.fit({'letters':tf.convert_to_tensor(df_letters.iloc[trainid,:]), 'numeric_and_bol':tf.convert_to_tensor(df1.iloc[trainid,:-1])},
                    tf.convert_to_tensor(df1.target[trainid]), epochs=30, batch_size=400, workers=3, use_multiprocessing=True,
                    validation_data=({'letters':tf.convert_to_tensor(df_letters.iloc[validid,:]),
                                      'numeric_and_bol':tf.convert_to_tensor(df1.iloc[validid,:-1])},
                                     tf.convert_to_tensor(df1.target[validid])))

In [None]:
model.save('cran5.h5')

In [None]:
pred_a5_v = pred_dnn_mod(model)

# Blending (Stacking)
Stacking is done with Random Forest

In [None]:
print('-----in the blending/stacking stage-----')

In [None]:
preds = np.vstack((pred_g_v, pred_c_v, pred_nn_v, pred_cl_v, pred_cl7_v, pred_cl6_v, pred_a_v, pred_a5_v)).T
y = df.target.copy()

In [None]:
del df
del pred_g_v, pred_c_v, pred_nn_v, pred_cl_v, pred_cl7_v, pred_cl6_v, pred_a_v, pred_a5_v

In [None]:
# Cross-validated ROC AUC of the random-forest stacker, using the base models'
# validation predictions as features. (cross_val_score is imported in the top import cell.)
rf = RandomForestClassifier(random_state=9990)
cross_val_score(rf, preds, y[validid], scoring='roc_auc', verbose=1, n_jobs=-1)

In [None]:
rf.fit(preds, y[validid])

# Test dataset
## Preprocessing

In [None]:
print('-----preprocessing the test dataset-----')

In [None]:
df = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv')
df.head()

In [None]:
df.shape

In [None]:
#for char in [chr(x) for x in range(65, 91)]:
#    print(f"there are {sum(test.f_27.str.contains(char))} observations contains {char}")

In [None]:
# tokenize f_27
df_letters = pd.DataFrame(tokenizer_uni.texts_to_sequences(df.f_27))
df_letters.rename(columns={0:'f_270', 1:'f_271', 2:'f_272', 3:'f_273', 4:'f_274',
                           5:'f_275', 6:'f_276', 7:'f_277', 8:'f_278', 9:'f_279'},
                  inplace=True)
df_letters.head(3)

In [None]:
# bol
bol = countvec.transform(df.f_27)
bol = bol.toarray()
bol = pd.DataFrame(bol, columns=bol_colnames)
bol.head(3)

In [None]:
df['f_31'] = df_letters.nunique(axis=1)
df['f_32'] = df['f_21'] + df['f_02']
df['f_33'] = df['f_01'] + df['f_28']
for i in range(8):
    df[f'f_{34+i}'] = df[triplets[i]].sum(axis=1)

df.head(3)

In [None]:
# Test-set counterpart of df0: features, letter tokens and letter counts, with the
# columns in alphabetical order and the f_27 / id columns removed.
df0 = pd.concat([df, df_letters, bol], axis=1)
cols = sorted(df0.columns.tolist())
df0 = df0[cols].drop(['f_27', 'id'], axis=1)
df0.head()

In [None]:
df0.shape

In [None]:
# Test-set counterpart of df1: features and letter counts only, with the columns in
# alphabetical order and the f_27 / id columns removed.
df1 = pd.concat([df, bol], axis=1)
cols = sorted(df1.columns.tolist())
df1 = df1[cols].drop(['f_27', 'id'], axis=1)
df1.head()

# Submission

In [None]:
subs = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')
subs.head()

In [None]:
print('GBM prediction-----')
model = lgbm.Booster(model_file='baseLGBM.txt')
pred_g = model.predict(df0)

In [None]:
print('CAT prediction-----')
model = CatBoostClassifier()
model.load_model('cat_model')

pred_c = model.predict_proba(df0)[:,1]

In [None]:
print('NN prediction-----')
model = tf.keras.models.load_model('nn_model.h5')
pred_nn = model.predict(tf.convert_to_tensor(df0), workers=3)[:,0]

In [None]:
#before moving on...
df_letters = tf.convert_to_tensor(df_letters)
df1 = tf.convert_to_tensor(df1)

In [None]:
print('CL prediction-----')
model = tf.keras.models.load_model('CNN_LSTM.h5')
pred_cl = model.predict({'letters':df_letters, 'numeric_and_bol':df1}, workers=3)[:,0]

In [None]:
print('CL7 prediction-----')
model = tf.keras.models.load_model('cnn7_lstm.h5')
pred_cl7 = model.predict({'letters': df_letters, 'numeric_and_bol':df1}, workers=3)[:,0]

In [None]:
# Reload the saved kernel-size-6 CNN+LSTM model and score the test set.
print('CL6 prediction-----')  # progress message (typo 'prrdiction' fixed)
model = tf.keras.models.load_model('cnn6_lstm.h5')
pred_cl6 = model.predict({'letters':df_letters, 'numeric_and_bol':df1}, workers=3)[:,0]

In [None]:
print('CRAN prediction-----')
model = tf.keras.models.load_model('cran.h5')
pred_a = model.predict({'letters':df_letters, 'numeric_and_bol':df1}, workers=3)[:,0]

In [None]:
print('CRAN5 prediction-----')
model = tf.keras.models.load_model('cran5.h5')
pred_a5 = model.predict({'letters':df_letters, 'numeric_and_bol':df1}, workers=3)[:,0]

In [None]:
preds = np.vstack((pred_g, pred_c, pred_nn, pred_cl, pred_cl7, pred_cl6, pred_a, pred_a5)).T

In [None]:
subs['target'] = rf.predict_proba(preds)[:,1]
subs.head()

In [None]:
subs.to_csv('mysubmission.csv', index=False)