In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# I'll be starting with tensorflow 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, datasets

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the competition files, using the first CSV column as the row index.
train = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/train.csv",
    index_col=0,
)
test = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv",
    index_col=0,
)

# Quick size check of both frames.
print("Training data shape: ", train.shape)
print("Test data shape: ", test.shape)

In [None]:
# Distinct column dtypes present in the training frame.
print("Data types:\n", train.dtypes.unique())

# Total number of missing cells in each frame (per-column counts summed up).
print("\nMissing data in train set:\n", train.isna().sum().sum())
print("\nMissing data in test set:\n", test.isna().sum().sum())

# Number of rows that exactly repeat an earlier row.
print("\nDuplicate values in train set:\n", train.duplicated().sum())
print("\nDuplicate values in test set:\n", test.duplicated().sum())

In [None]:
# Keep an untouched copy of the training frame so the target distribution
# can be compared with and without duplicate rows.
train_with_duplicates = train.copy()

# Remove exact duplicate rows from the training frame only, keeping the first
# occurrence. The test set is deliberately left untouched.
train = train.drop_duplicates(keep='first')

print("Shape of new training data: ", train.shape)
print("Checking number of duplicates in new training data: ", train.duplicated().sum())

In [None]:
# CHECKING FOR THE DISTRIBUTION OF THE TARGETS FOR BOTH CASES
def plot(train, title):
    """Show a count plot of the ``target`` column and print each class's share.

    Parameters
    ----------
    train : DataFrame
        Frame that contains a ``target`` column.
    title : str
        Title placed on the figure.

    The printed series holds, per target value (sorted by value), its
    percentage of all rows in ``train``.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xticks(rotation=30, ha='right')
    sns.countplot(x=train['target'], data=train)

    class_share = train['target'].value_counts().sort_index() / len(train) * 100
    print(class_share)
    plt.show()

In [None]:
# Target distribution of the de-duplicated training frame
# (counts are plotted, percentages are printed).
plot(train, "Dataset without duplicates")

In [None]:
# Same view for the copy taken before duplicates were dropped, for comparison.
plot(train_with_duplicates, "Dataset with duplicates")

In [None]:
# Next, checking for memory usage 
# print("Memory usage(train_with_duplicates): {:5.2f} Mb".format(train_with_duplicates.memory_usage().sum()/1024**2))
# print("Memory usage(train): {:5.2f} Mb".format(train.memory_usage().sum()/1024**2))

# Still, the memory usage can be reduced further
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns (int8/16/32/64) are cast to the smallest signed integer
    type whose range contains the column's min and max, limits included.
    Float columns are cast to float32 when their min and max lie within the
    float32 range, otherwise to float64. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool
        When true, print the new memory usage and the percentage saved.

    Returns
    -------
    DataFrame
        The same ``df`` object, after the casts.

    Note: casting float64 to float32 keeps the value range but reduces
    precision.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    initial_mem = df.memory_usage().sum()/1024**2

    for col in df.columns:
        col_dtype = df[col].dtypes
        if col_dtype not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_dtype)[:3] == 'int':
            # Try the integer types from narrowest to widest. The bounds are
            # inclusive, so a column spanning exactly -128..127 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail these comparisons and fall back to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum()/1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero memory does not divide by zero.
        saved_pct = 100*(initial_mem - end_mem)/initial_mem if initial_mem else 0.0
        print("Mem usage decreased to {:5.2f} Mb, ({:4.2f}%)".format(end_mem, saved_pct))
    return df

In [None]:
# Downcast the numeric columns of the de-duplicated training frame.
# reduce_mem_usage modifies the frame it receives and also returns it.
print("New memory usage for train set")
train = reduce_mem_usage(train)

# Same downcast for the test frame.
print("New memory usage for test set")
test = reduce_mem_usage(test)
# Reducing memory usage for the dataset with duplicates
# train_with_duplicates = reduce_mem_usage(train_with_duplicates)

In [None]:
# Now, we can separate the features from the targets.
# The feature columns are picked by name rather than by position, so the
# selection stays correct even if 'target' is not the last column of the frame
# (and fails loudly with a KeyError if the column is missing).
target = 'target'
features = train.columns.drop(target)

print("# of features: ", len(features))

In [None]:
# Target encoding, that is turn the bacteria names into numeric labels.
from sklearn.preprocessing import LabelEncoder
# Only encode while the column still holds the original names: re-running this
# cell on already-encoded integers would refit the encoder on the codes, and
# encoder.inverse_transform would then return integers instead of names.
if not pd.api.types.is_numeric_dtype(train[target]):
    encoder = LabelEncoder()
    train[target] = encoder.fit_transform(train[target])

# sample_weight = train.value_counts().values

In [None]:
# Feature matrix and label vector read by the model code in the later cells.
X, y = train[features], train[target]
X.shape, y.shape

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

In [None]:
def ExtraTreesModel(N_ESTIMATORS=300):
    """Return a new, unfitted ExtraTreesClassifier with the notebook's settings.

    Parameters
    ----------
    N_ESTIMATORS : int
        Number of trees, passed through as ``n_estimators``.

    The remaining settings are fixed: ``n_jobs=-1``, ``verbose=0`` and
    ``random_state=1221``.
    """
    settings = dict(
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,
        verbose=0,
        random_state=1221,
    )
    return ExtraTreesClassifier(**settings)

def DNN(n_features=None, n_classes=10):
    """Build an uncompiled fully connected Keras classifier.

    The network has three ReLU hidden layers (512, 128 and 64 units) followed
    by a softmax output layer.

    Parameters
    ----------
    n_features : int or None
        Width of the input vector. ``None`` (the default) uses the number of
        columns of the module-level ``X``, as before.
    n_classes : int
        Number of units in the softmax output layer (default 10, as before).

    Returns
    -------
    keras.Model
        The model; the caller is responsible for compiling and fitting it.
    """
    if n_features is None:
        n_features = X.shape[1]

    # keras.Input documents `shape` as a tuple that excludes the batch
    # dimension, so pass a 1-tuple rather than a bare integer.
    inputs = keras.Input(shape=(n_features,))
    x = layers.Dense(512, activation='relu')(inputs)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(n_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    return model

In [None]:
def training_loop(model_name = "etc", N_SPLITS=10, sample_weight=None):
    """Cross-validate one of the notebook's models and return the last fold's model.

    Reads the module-level ``X`` (features) and ``y`` (integer-encoded labels),
    splits them with a shuffled ``StratifiedKFold`` and, for every fold, fits a
    fresh model on the training part and prints its accuracy on the held-out
    part. After the last fold the mean and standard deviation of the fold
    accuracies are printed.

    Parameters
    ----------
    model_name : str
        ``"etc"`` for the model built by ``ExtraTreesModel()``;
        ``"dnn"`` or ``"nn"`` for the Keras network built by ``DNN()``.
    N_SPLITS : int
        Number of cross-validation folds.
    sample_weight : array-like or None
        Per-row weights aligned with the rows of ``X``; only used by ``"etc"``.
        ``None`` keeps the previous default, ``train.value_counts().values``.
        An empty sequence disables weighting.

    Returns
    -------
    The model fitted on the final fold (not an ensemble over the folds).

    Raises
    ------
    ValueError
        If ``model_name`` is not recognised, or if the weights do not have
        exactly one entry per row of ``X``.
    """
    # Fail fast: previously an unknown name ran every fold doing nothing and
    # then crashed on the unbound `model` at the return statement.
    if model_name not in ("etc", "dnn", "nn"):
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'etc', 'dnn' or 'nn'."
        )
    use_etc = model_name == "etc"

    weights = None
    if use_etc:
        # Weights are only consumed by the ExtraTrees branch, so the costly
        # value_counts() is skipped for the neural network.
        if sample_weight is None:
            # NOTE(review): value_counts() orders its counts by frequency, not
            # by the row order of `train`, so this default lines up with X only
            # when all counts are equal (e.g. all 1 after drop_duplicates).
            # Pass `sample_weight` explicitly if real per-row weights are wanted.
            sample_weight = train.value_counts().values
        weights = np.asarray(sample_weight)
        if len(weights) == 0:
            weights = None
        elif len(weights) != len(X):
            raise ValueError(
                f"sample_weight has {len(weights)} entries but X has {len(X)} rows."
            )

    folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=1223)
    scores = []
    model = None

    for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X, y), total=N_SPLITS)):
        X_train = X.iloc[train_id]
        y_train = y.iloc[train_id]
        X_valid = X.iloc[valid_id]
        y_valid = y.iloc[valid_id]

        if use_etc:
            # etc stands for ExtraTreesClassifier
            fit_weights = None if weights is None else weights[train_id]
            eval_weights = None if weights is None else weights[valid_id]

            model = ExtraTreesModel()
            model.fit(X_train, y_train, sample_weight=fit_weights)

            valid_pred = model.predict(X_valid)
            # accuracy_score takes (y_true, y_pred).
            valid_score = accuracy_score(y_valid, valid_pred, sample_weight=eval_weights)
        else:
            model = DNN()
            # Fix the one-hot width to the network's output size so it does not
            # depend on which labels happen to fall into a given fold.
            n_classes = model.output_shape[-1]
            y_train_onehot = keras.utils.to_categorical(y_train, num_classes=n_classes)
            y_valid_onehot = keras.utils.to_categorical(y_valid, num_classes=n_classes)

            earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True, verbose=1)
            learningRate = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, mode='min', min_lr=1e-5, verbose=1)
            model.compile(optimizer='rmsprop',
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
            model.fit(X_train, y_train_onehot,
                      validation_data=(X_valid, y_valid_onehot),
                      batch_size=128,
                      epochs=50,
                      callbacks=[earlyStopping, learningRate], verbose=0)
            # evaluate() returns [loss, accuracy] for the metrics compiled above.
            valid_score = model.evaluate(X_valid, y_valid_onehot, verbose=0)[1]

        # Record every fold's score, whichever model and weighting was used.
        scores.append(valid_score)
        print(f"Fold {fold+1} \tAccuracy: {valid_score:.4f}")

    if scores:
        print(f"Mean accuracy over {len(scores)} folds: {np.mean(scores):.4f} (std {np.std(scores):.4f})")

    return model

In [None]:
# dnn_model = training_loop(model_name='nn', N_SPLITS=3)

In [None]:
# Cross-validate with the defaults (model_name="etc", N_SPLITS=10).
# The value returned is the model fitted in the last fold.
etc_model = training_loop()

In [None]:
# Neural network outputs are probabilistic. 
# Doing the same for the ExtraTreeClassifier
# dnn_probas = dnn_model.predict(test)
# Select the training feature columns explicitly so the test matrix is
# guaranteed to have the same columns, in the same order, as the data the
# model was fitted on.
etc_probas = etc_model.predict_proba(test[features])

In [None]:
# # averaging/weighting the predictions
# preds1 = 0.5*(dnn_probas + etc_probas)
# preds2 = 0.65*dnn_probas + 0.35*etc_probas

In [None]:
# print(np.argmax(dnn_probas, axis=1))
# print(np.argmax(etc_probas, axis=1))
# print(np.argmax(preds1, axis=1))
# print(np.argmax(preds2, axis=1))

# Column index of the highest predicted probability for every test row.
predictions = etc_probas.argmax(axis=1)
predictions

In [None]:
# Convert the predicted integer codes back into the respective names of the
# bacteria, using the encoder that was fitted on the training target column.
labels = encoder.inverse_transform(predictions)
labels

In [None]:
# Load the sample submission file; its 'target' column is filled in a later cell.
sample = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

In [None]:
# Put the predicted labels into the 'target' column and write the result to
# submission.csv without the DataFrame index.
sample['target'] = labels
sample.to_csv("submission.csv", index=False)

In [None]:
# Read the written file back and show its first rows as a sanity check.
pd.read_csv("submission.csv").head()