In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import warnings
import gc
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [None]:
# Random seed shared by the row sampling, the train/test splits, the CV splitter and XGBoost.
seed = 47

In [None]:
def evaluate_model(model, x, y):
    """Score a fitted classifier on a labelled data set.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Features passed to ``model.predict``.
        y: True labels aligned with ``x``.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is ``accuracy_score``
        of the true labels against the predictions.
    """
    predictions = model.predict(x)
    return {'accuracy': accuracy_score(y, predictions)}

In [None]:
def get_xgboost_model(params=None):
    """Create an unfitted ``XGBClassifier`` with the notebook's fixed settings.

    Args:
        params: Optional dict of keyword arguments for ``XGBClassifier``.
            When ``None`` the baseline hyper-parameters below are used.
            A supplied dict replaces the baseline entirely; the two are
            not merged.

    Returns:
        XGBClassifier: Classifier built from ``params`` plus the fixed
        objective, seed, GPU tree method/predictor, early-stopping rounds
        and verbosity.
    """
    baseline = {'colsample_bytree': 0.1,
                'eta': 0.12,
                'gamma': 5,
                'max_depth': 2,
                'min_child_weight': 9,
                'n_estimators': 1000,
                'subsample': 0.9}
    chosen = baseline if params is None else params

    # NOTE(review): early_stopping_rounds is set here, but no fit() call visible
    # in this notebook passes an eval_set -- confirm the installed XGBoost
    # version accepts that combination.
    return XGBClassifier(objective='multi:softmax',
                         random_state=seed,
                         tree_method='gpu_hist',
                         predictor='gpu_predictor',
                         early_stopping_rounds=200,
                         verbosity=0,
                         **chosen)

In [None]:
def get_nn_model(n_layers=None, n_units=32, activation='swish'):
    """Build and compile a dense classifier with a 7-unit softmax output.

    Args:
        n_layers: Number of hidden ``Dense`` layers. ``None`` or a value
            ``<= 0`` produces a model with only the output layer.
        n_units: Units in each hidden layer.
        activation: Activation used by each hidden layer.

    Returns:
        tf.keras.Sequential: Model compiled with Adam, sparse categorical
        cross-entropy and sparse categorical accuracy.
    """
    model = tf.keras.Sequential()

    if n_layers is not None and n_layers > 0:
        for _ in range(n_layers):
            model.add(tf.keras.layers.Dense(units=n_units, activation=activation))
    model.add(tf.keras.layers.Dense(units=7, activation='softmax'))

    # The output layer already applies softmax, so the loss receives
    # probabilities. from_logits=True would apply softmax a second time inside
    # the loss and distort the gradients.
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=[tf.metrics.SparseCategoricalAccuracy()])

    return model

In [None]:
def get_pipelines(model):
    """Wrap ``model`` in one pipeline per preprocessing recipe.

    Args:
        model: Estimator used as the final ``'m'`` step of every pipeline.
            The same object is placed in all pipelines; it is not copied.

    Returns:
        list: ``(name, Pipeline)`` tuples in the order norm, std, quan,
        pca, svd, std-power, min-max-power, power.
    """
    recipes = [
        # normalize
        ('norm', [('s', MinMaxScaler())]),
        # standardize
        ('std', [('s', StandardScaler())]),
        # quantile
        ('quan', [('s', QuantileTransformer(n_quantiles=100, output_distribution='normal'))]),
        # pca
        ('pca', [('s', PCA())]),
        # svd
        ('svd', [('s', TruncatedSVD())]),
        # standardize, then power transform
        ('std-power', [('s', StandardScaler()), ('p', PowerTransformer())]),
        # min-max scale, then power transform
        ('min-max-power', [('s', MinMaxScaler()), ('p', PowerTransformer())]),
        # power transform only
        ('power', [('p', PowerTransformer())]),
    ]
    return [(name, Pipeline(steps + [('m', model)])) for name, steps in recipes]

In [None]:
def score_model(x, y, model):
    """Cross-validate ``model`` on ``(x, y)`` and return the accuracy scores.

    Uses repeated stratified k-fold (10 splits, 3 repeats) seeded with the
    module-level ``seed`` and runs the folds with ``n_jobs=-1``.

    Args:
        x: Feature matrix.
        y: Target labels.
        model: Estimator (or pipeline) to evaluate.

    Returns:
        The array of scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
    return cross_val_score(model, x, y, scoring='accuracy', cv=splitter, n_jobs=-1)

In [None]:
def geomean(x, axis):
    """Geometric mean of ``x`` along ``axis``.

    Computed as ``exp(mean(log(x)))``, so it is only meaningful for strictly
    positive inputs: the logarithm of zero is ``-inf`` and of a negative
    value is NaN.
    """
    return np.exp(np.mean(np.log(x), axis=axis))


def harmonic_mean(x, axis):
    """Harmonic mean of ``x`` along ``axis``.

    The numerator is the number of elements reduced along ``axis`` (all
    elements when ``axis`` is ``None``). Using ``len(x)`` instead would always
    count rows, which is wrong for ``axis=1``.
    """
    count = np.size(x) if axis is None else np.shape(x)[axis]
    return count / np.sum(1.0 / x, axis=axis)


# Row-wise aggregations tried as an extra synthetic feature. The dict key is
# used as the new column name; ``None`` means "add no extra feature".
funcs = {'mean' : np.mean,
         'std' : np.std,
         'var' : np.var,
         'geo_mean' : geomean,
         'harmonic_mean' : harmonic_mean,
         'median' : np.median,
         'None_feature_engineering' : None}

In [None]:
# train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',')
# Read a reproducible random subset of train.csv instead of the whole file:
# draw n - s distinct line numbers from 1..n-1 and tell pandas to skip them.
random.seed(seed)
n = 4000000  # upper bound of the sampled line numbers -- presumably the row count of train.csv; TODO confirm
s = 400000  # n - s line numbers are skipped
# The range starts at 1, so line 0 is never part of the skip list.
skip = sorted(random.sample(range(1, n),n-s))

train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)

# XGBoost Baseline

In [None]:
# Hold out 20% of the sampled rows; features and labels are passed as NumPy arrays.
x_train, x_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type']).values,
    train_df['Cover_Type'].values,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)

In [None]:
# Baseline hyper-parameters (the same values get_xgboost_model uses by default).
params = dict(colsample_bytree=0.1,
              eta=0.12,
              gamma=5,
              max_depth=2,
              min_child_weight=9,
              n_estimators=1000,
              subsample=0.9)

# get_xgboost_model adds the fixed objective / seed / GPU / early-stopping settings.
model = get_xgboost_model(params)

model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

# Feature Engineering

Here we will experiment with creating synthetic features from row-wise summary statistics (mean, standard deviation, variance, geometric mean, harmonic mean and median).

<h3>Feature Engineering XGBoost</h3>


In [None]:
names, results = [], []

# Train one default XGBoost model per candidate synthetic feature and compare
# hold-out accuracy; a ``None`` entry trains on the raw columns only.
for key, func in funcs.items():
    x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
    if func is not None:
        x_train[key] = func(x_train, axis=1)
    y_train = train_df['Cover_Type']
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
    model = get_xgboost_model()
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['accuracy'])

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

# XGBoost - Testing different configurations

In [None]:
# Tuning data: raw columns plus the row-wise mean as an extra feature, 80/20 split.
x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
x_train['mean'] = np.mean(x_train, axis=1)
y_train = train_df['Cover_Type']
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

# 1 - Testing different number of estimators

In [None]:
def get_models_n_estimators():
    """Build one XGBoost classifier per candidate number of trees.

    Returns:
        dict: Maps ``str(n_trees)`` to an unfitted classifier created with
        only ``n_estimators`` set.
    """
    tree_counts = [10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500,
                   1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    return {str(count): get_xgboost_model({'n_estimators': count})
            for count in tree_counts}

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_n_estimators()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# np.argmax returns the first maximum, so ties go to the earliest candidate.
index = np.argmax(results)
n_estimators = int(names[index])
print("Best number of estimators", n_estimators)

# 2 - Testing different max_depth

In [None]:
def get_models_n_depths():
    """Build one XGBoost classifier per candidate ``max_depth`` (1 to 19).

    Reads the module-level ``n_estimators`` chosen by the previous step.

    Returns:
        dict: Maps ``str(depth)`` to an unfitted classifier.
    """
    return {
        str(depth): get_xgboost_model({'n_estimators': n_estimators, 'max_depth': depth})
        for depth in range(1, 20)
    }

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_n_depths()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the depth with the highest hold-out accuracy for the following steps.
index = np.argmax(results)
max_depth = int(names[index])
print("Best max depth", max_depth)

# 3 - Testing different subsamples


In [None]:
def get_models_subsamples():
    """Build one XGBoost classifier per candidate ``subsample`` fraction.

    Candidates come from ``np.arange(0.1, 1.1, 0.1)``. Reads the module-level
    ``n_estimators`` and ``max_depth`` chosen by the previous steps.

    Returns:
        dict: Maps the fraction formatted as ``'%.1f'`` to an unfitted classifier.
    """
    candidates = {}
    for fraction in np.arange(0.1, 1.1, 0.1):
        settings = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': fraction}
        candidates['%.1f' % fraction] = get_xgboost_model(settings)
    return candidates

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_subsamples()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Names are the fractions formatted with one decimal, so parse them back to float.
index = np.argmax(results)
subsample = float(names[index])
print("Best subsample", subsample)

# 4 - Testing different learning rates

In [None]:
def get_models_lr():
    """Build one XGBoost classifier per candidate learning rate (``eta``).

    Reads the module-level ``n_estimators``, ``max_depth`` and ``subsample``
    chosen by the previous steps.

    Returns:
        dict: Maps the rate formatted as ``'%.4f'`` to an unfitted classifier.
    """
    learning_rates = [0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.12, 0.13, 0.3, 0.5, 1.0]
    candidates = {}
    for rate in learning_rates:
        settings = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'eta': rate}
        candidates['%.4f' % rate] = get_xgboost_model(settings)
    return candidates

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_lr()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the learning rate with the highest hold-out accuracy.
index = np.argmax(results)
eta = float(names[index])
print("Best learning rate", eta)

# 5 - Testing different fractions of features (colsample_bytree)

In [None]:
def get_models_nfeatures():
    """Build one XGBoost classifier per candidate ``colsample_bytree`` fraction.

    Candidates come from ``np.arange(0.1, 1.1, 0.1)``. Reads the module-level
    ``n_estimators``, ``max_depth``, ``subsample`` and ``eta`` chosen by the
    previous steps.

    Returns:
        dict: Maps the fraction formatted as ``'%.1f'`` to an unfitted classifier.
    """
    candidates = {}
    for fraction in np.arange(0.1, 1.1, 0.1):
        settings = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'eta': eta,
                    'colsample_bytree': fraction}
        candidates['%.1f' % fraction] = get_xgboost_model(settings)
    return candidates

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_nfeatures()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the column fraction with the highest hold-out accuracy.
index = np.argmax(results)
colsample_bytree = float(names[index])
print("Best colsample_bytree", colsample_bytree)

# 6 - Testing different gamma values

In [None]:
def get_models_n_gamma():
    """Build one XGBoost classifier per candidate ``gamma``.

    Candidates come from ``np.arange(0.0, 1.1, 0.1)``. Reads the module-level
    ``n_estimators``, ``max_depth``, ``subsample``, ``eta`` and
    ``colsample_bytree`` chosen by the previous steps.

    Returns:
        dict: Maps ``str(candidate)`` to an unfitted classifier.
    """
    candidates = {}
    for value in np.arange(0.0, 1.1, 0.1):
        settings = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'eta': eta,
                    'colsample_bytree': colsample_bytree,
                    'gamma': value}
        candidates[str(value)] = get_xgboost_model(settings)
    return candidates

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_n_gamma()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the gamma with the highest hold-out accuracy.
index = np.argmax(results)
gamma = float(names[index])
print("Best gamma", gamma)

# 7 - Testing different min_child_weight values

In [None]:
def get_models_n_min_child_weight():
    """Build one XGBoost classifier per candidate ``min_child_weight`` (1 to 19).

    Reads the module-level ``n_estimators``, ``max_depth``, ``subsample``,
    ``eta``, ``colsample_bytree`` and ``gamma`` chosen by the previous steps.

    Returns:
        dict: Maps ``str(weight)`` to an unfitted classifier.
    """
    candidates = {}
    for weight in range(1, 20):
        settings = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'eta': eta,
                    'colsample_bytree': colsample_bytree,
                    'gamma': gamma,
                    'min_child_weight': weight}
        candidates[str(weight)] = get_xgboost_model(settings)
    return candidates

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
models = get_models_n_min_child_weight()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    names.append(name)
    results.append(score['accuracy'])
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the min_child_weight with the highest hold-out accuracy.
index = np.argmax(results)
min_child_weight = int(names[index])
print("Best min_child_weight", min_child_weight)

# 8 - Testing different reg_alpha values

In [None]:
def get_models_n_reg_alpha():
    """Build one XGBoost classifier per candidate ``reg_alpha``.

    Reads the module-level ``n_estimators``, ``max_depth``, ``subsample``,
    ``eta``, ``colsample_bytree``, ``gamma`` and ``min_child_weight`` chosen
    by the previous steps.

    Returns:
        dict: Maps ``str(reg_alpha)`` to an unfitted classifier.
    """
    models = dict()
    # 0.01 used to be listed twice (as 1e-2 and as 0.01); both produce the key
    # '0.01', so the second entry only rebuilt a model for the same dict slot.
    for reg_alpha in [0, 1e-5, 1e-2, 0.1, 0.001, 0.003, 1, 10, 100]:
        params = {'n_estimators' : n_estimators,
                  'max_depth' : max_depth,
                  'subsample' : subsample,
                  'eta' : eta,
                  'colsample_bytree' : colsample_bytree,
                  'gamma' : gamma,
                  'min_child_weight' : min_child_weight,
                  'reg_alpha': reg_alpha}
        models[str(reg_alpha)] = get_xgboost_model(params)
    return models

In [None]:
# Evaluate the reg_alpha candidates. This cell previously re-ran the
# min_child_weight candidates, so reg_alpha was never actually tuned.
models = get_models_n_reg_alpha()
results, names = list(), list()

for i, (name, model) in enumerate(models.items()):
    model.fit(x_train, y_train, verbose=True)
    score = evaluate_model(model, x_test, y_test)
    results.append(score['accuracy'])
    names.append(name)
    print(name, 'accuracy: %.3f' % (results[i]))

index = np.argmax(results)
# Candidate names include fractional values such as '0.1' and '1e-05', which
# int() cannot parse; float() handles every candidate.
reg_alpha = float(names[index])
print("reg_alpha", reg_alpha)

In [None]:
# Collect the value selected by each tuning step into one parameter set.
params = dict(n_estimators=n_estimators,
              max_depth=max_depth,
              subsample=subsample,
              eta=eta,
              colsample_bytree=colsample_bytree,
              gamma=gamma,
              min_child_weight=min_child_weight,
              reg_alpha=reg_alpha)

print('Best Params: ', params)

In [None]:
# Retrain with the combined best parameters and report hold-out accuracy.
model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

In [None]:
# NOTE(review): this cell rebuilds the same `params` dict and repeats the same
# fit/evaluate as the two cells before it -- likely redundant; confirm before removing.
params = {'n_estimators' : n_estimators,
          'max_depth' : max_depth,
          'subsample' : subsample,
          'eta' : eta, 
          'colsample_bytree' : colsample_bytree,
          'gamma' : gamma,
          'min_child_weight' : min_child_weight,
          'reg_alpha' : reg_alpha}
model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

# Submission

In [None]:
# Reload the complete training file (no skiprows) for the submission model.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',')

x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
x_train['mean'] = np.mean(x_train, axis=1)
y_train = train_df['Cover_Type']

# 20% of the rows are still held out, so the submission model trains on 80%.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

In [None]:
# Train the submission model with the tuned parameters and report hold-out accuracy.
params = dict(n_estimators=n_estimators,
              max_depth=max_depth,
              subsample=subsample,
              eta=eta,
              colsample_bytree=colsample_bytree,
              gamma=gamma,
              min_child_weight=min_child_weight,
              reg_alpha=reg_alpha)

model = get_xgboost_model(params)
model.fit(x_train, y_train, verbose=True)
score = evaluate_model(model, x_test, y_test)
print(score)

In [None]:
# Drop the training frames and splits, then run the garbage collector, before loading the test set.
del train_df, x_train, y_train, x_test, y_test
gc.collect()

In [None]:
# Build test features the same way as the training features: drop the unused
# columns and append the row-wise mean.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv', sep=',')
x_test = test_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])
x_test['mean'] = np.mean(x_test, axis=1)

In [None]:
# Pair each test Id with the label predicted by the XGBoost model.
ids = test_df['Id'].values
target = model.predict(x_test).squeeze()
submission_xgboost = pd.DataFrame({'Id': ids, 'Cover_Type': target})

In [None]:
# Preview the first rows of the XGBoost submission.
submission_xgboost.head()

In [None]:
# Write the XGBoost submission to the working directory; the ensemble cell reads this file back.
submission_xgboost.to_csv('submission_xgboost.csv', index=False) # score 0.95378

In [None]:
# Drop the test frames and run the garbage collector before the neural-network section.
del test_df, x_test
gc.collect()

# Neural Network Baseline

In [None]:
# train_df was deleted above; reload the subset selected by the same `skip` list as the XGBoost experiments.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)

In [None]:
scaler = StandardScaler()
le = LabelEncoder()

x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])

# Encode the labels as consecutive integers starting at 0, as the sparse
# categorical loss expects class indices.
y_train = le.fit_transform(train_df['Cover_Type'].values)

x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

# Fit the scaler on the training split only, then apply it to the hold-out split.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Baseline network: a single 7-unit softmax layer, no hidden layers.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=7, activation='softmax'))

# Compile once; an earlier compile call here was immediately overridden by this
# one. The layer already outputs softmax probabilities, so the loss must use
# from_logits=False -- from_logits=True would apply softmax a second time.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=[tf.metrics.SparseCategoricalAccuracy()])

model.fit(x_train, y_train, batch_size=32, epochs=20)
model.evaluate(x_test, y_test)

<h3>Feature Engineering Neural Network Model</h3>

In [None]:
scaler = StandardScaler()
le = LabelEncoder()

names, results = [], []

# Train one default network per candidate synthetic feature and compare
# hold-out accuracy; a ``None`` entry trains on the raw columns only.
for key, func in funcs.items():
    x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
    if func is not None:
        x_train[key] = func(x_train, axis=1)
    y_train = le.fit_transform(train_df['Cover_Type'].values)
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)
    x_train = scaler.fit_transform(x_train.values)
    x_test = scaler.transform(x_test)
    model = get_nn_model()
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    # evaluate() returns [loss, metric]; index 1 is the compiled accuracy metric.
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(key)
    results.append(result)

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

<h3>Neural Network Pipelines</h3>

In [None]:
# Candidate preprocessing steps compared for the neural network. The key is the
# name printed in the results; a None value means the features are used as-is.
transformers = {'Min-Max-Scaler': MinMaxScaler(), 
                'Standard-Scaler': StandardScaler(),
                'QuantileTransformer': QuantileTransformer(n_quantiles=100, output_distribution='normal'),
                'PCA': PCA(),
                'TruncatedSVD': TruncatedSVD(),
                'PowerTransformer': PowerTransformer(),
                'No-transformer': None}

In [None]:
names, results = [], []

# Train one default network per preprocessing candidate and compare hold-out accuracy.
for key, transformer in transformers.items():
    x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
    y_train = le.fit_transform(train_df['Cover_Type'].values)
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

    # Fit the transformer on the training split only, then apply it to the hold-out split.
    if transformer is not None:
        x_train = transformer.fit_transform(x_train.values)
        x_test = transformer.transform(x_test)

    model = get_nn_model()
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(key)
    results.append(result)

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

index = np.argmax(results)
print("Best Result: ", names[index], results[index])

# Neural Network - Testing different configurations

In [None]:
scaler = StandardScaler()
le = LabelEncoder()

# Tuning data: raw columns plus the row-wise standard deviation as an extra feature.
x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
x_train['std'] = np.std(x_train, axis=1)
y_train = le.fit_transform(train_df['Cover_Type'].values)

x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

# Fit the scaler on the training split only, then apply it to the hold-out split.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# 1 - Testing different numbers of layers

In [None]:
def get_models_n_layers():
    """Build one network per candidate hidden-layer count (0 to 10).

    Returns:
        dict: Maps the layer count to a compiled, untrained model.
    """
    return {depth: get_nn_model(n_layers=depth) for depth in range(11)}

In [None]:
# Train every candidate for 15 epochs and record its hold-out accuracy.
models = get_models_n_layers()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    # evaluate() returns [loss, metric]; index 1 is the compiled accuracy metric.
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(name)
    results.append(result)
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the layer count with the highest hold-out accuracy.
index = np.argmax(results)
n_layers = int(names[index])
print("Best number of layers", n_layers)

# 2 - Testing different number of units

In [None]:
def get_models_n_units():
    """Build one network per candidate hidden-layer width.

    Reads the module-level ``n_layers`` chosen by the previous step.

    Returns:
        dict: Maps the unit count to a compiled, untrained model.
    """
    widths = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    return {width: get_nn_model(n_layers=n_layers, n_units=width) for width in widths}

In [None]:
# Train every candidate for 15 epochs and record its hold-out accuracy.
models = get_models_n_units()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(name)
    results.append(result)
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the width with the highest hold-out accuracy.
index = np.argmax(results)
n_units = int(names[index])
print("Best number of units", n_units)

# 3 - Testing different activation functions

In [None]:
def get_models_n_activations():
    """Build one network per candidate hidden-layer activation.

    Reads the module-level ``n_layers`` and ``n_units`` chosen by the
    previous steps.

    Returns:
        dict: Maps the activation name to a compiled, untrained model.
    """
    candidates = ["swish", "relu", "selu", "softplus", "elu"]
    return {
        fn_name: get_nn_model(n_layers=n_layers, n_units=n_units, activation=fn_name)
        for fn_name in candidates
    }

In [None]:
# Train every candidate for 15 epochs and record its hold-out accuracy.
models = get_models_n_activations()
names, results = [], []

for name, model in models.items():
    model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=0)
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    names.append(name)
    results.append(result)
    print(name, 'accuracy: %.3f' % results[-1])

# Keep the activation with the highest hold-out accuracy.
index = np.argmax(results)
activation = names[index]
print("Best activation function", activation)

# 4 - Testing different batch sizes

In [None]:
results, names = list(), list()
batches = [8, 16, 32, 64, 128, 256, 512]

for batch_size in batches:
    # Build a fresh, untrained network for every batch size. The result of
    # get_nn_model used to be discarded, so one leftover `model` from the
    # previous cell kept training across all candidates and the comparison
    # was not like-for-like.
    model = get_nn_model(n_layers=n_layers, n_units=n_units, activation=activation)
    model.fit(x_train, y_train, batch_size=batch_size, epochs=15, verbose=0)
    # evaluate() returns [loss, metric]; index 1 is the compiled accuracy metric.
    result = model.evaluate(x_test, y_test, verbose=0)[1]
    results.append(result)
    names.append(batch_size)
    print(batch_size, 'accuracy: %.3f' % result)

index = np.argmax(results)
batch_size = int(names[index])
print("Best batch_size", batch_size)

In [None]:
# Summarise the neural-network settings selected by the four tuning steps above.
print("Best parameters")
print("n_layers:", n_layers)
print("n_units:", n_units)
print("activation:", activation)
print("batch_size:", batch_size)

# Submission

In [None]:
# Reload the sampled training rows and rebuild the features used during tuning
# (raw columns plus the row-wise standard deviation).
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv', sep=',', skiprows=skip)

scaler = StandardScaler()
le = LabelEncoder()

x_train = train_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type'])
x_train['std'] = np.std(x_train, axis=1)
y_train = le.fit_transform(train_df['Cover_Type'].values)

x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed, shuffle=True)

# This scaler is reused later to transform the competition test set.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Train the network with the selected settings and report hold-out accuracy
# (evaluate() returns [loss, metric]; index 1 is the compiled accuracy metric).
model = get_nn_model(n_layers=n_layers, n_units=n_units, activation=activation)
model.fit(x_train, y_train, batch_size=batch_size, epochs=15)
score = model.evaluate(x_test, y_test, verbose=0)[1]
print(score)

In [None]:
# Drop the training frames and splits; `scaler`, `le` and `model` are kept for the test set.
del train_df, x_train, y_train, x_test, y_test
gc.collect()

In [None]:
# Build test features the same way as the training features, then apply the
# scaler that was fitted on the training split.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv', sep=',')
x_test = test_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])
x_test['std'] = np.std(x_test, axis=1)
x_test = scaler.transform(x_test)

In [None]:
preds = model.predict(x_test)
# Index of the most probable class for every test row.
target = np.argmax(preds, axis=-1)
ids = test_df['Id'].values
# Decode class indices with the fitted LabelEncoder instead of assuming
# "index + 1": that shortcut is only right when the training sample contains
# every label from 1 to 7 with no gaps. inverse_transform raises if an index
# has no corresponding training label.
submission_nn = pd.DataFrame({'Id' : ids, 'Cover_Type' : le.inverse_transform(target)})

In [None]:
# Decode class indices with the fitted LabelEncoder instead of assuming
# "index + 1", which is only right when the training labels are exactly 1..7
# with no gaps.
submission_nn = pd.DataFrame({'Id' : ids, 'Cover_Type' : le.inverse_transform(target)})

In [None]:
# Preview the first rows of the neural-network submission.
submission_nn.head()

In [None]:
# Write the neural-network submission to the working directory; the ensemble cell reads this file back.
submission_nn.to_csv('submission_nn.csv', index=False) # score 0.93079

In [None]:
# Drop the test frames and run the garbage collector.
del test_df, x_test
gc.collect()

# Ensemble XGBoost and Neural Network

In [None]:
# Combine the two saved submissions row by row (rows are assumed to be in the
# same Id order in both files; only the XGBoost file's Ids are used).
df_submission_xgboost = pd.read_csv('submission_xgboost.csv')
df_submission_nn = pd.read_csv('submission_nn.csv')
ids = df_submission_xgboost['Id'].values
# NOTE(review): this takes the integer average of two class labels. When the
# models disagree the result can be a third class neither predicted (e.g. 1 and
# 7 give 4). Cover_Type looks categorical, so a probability-level blend or a
# tie-break rule is probably what was intended -- confirm.
submission_ensemble = pd.DataFrame({'Id' : ids,
                           'Cover_Type' : np.array(df_submission_xgboost['Cover_Type'].values + df_submission_nn['Cover_Type'].values)//2})


In [None]:
# Preview the first rows of the ensemble submission.
submission_ensemble.head()

In [None]:
# Write the ensemble submission to the working directory.
submission_ensemble.to_csv('submission_ensemble.csv', index=False) # 0.93155

# Submission Best Model - XGBoost

In [None]:
# Save the XGBoost predictions again as submission.csv, the final submission file.
submission_xgboost.to_csv('submission.csv', index=False)