In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import random

from IPython import display as ipd
from tqdm import tqdm
import lightgbm as lgb

from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold

from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme to every figure drawn after this point.
sns.set_theme(style="whitegrid")

import warnings
# Suppress all warnings for the rest of the session. Note that this also hides
# deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

### Utils

In [None]:
def seeding(SEED, use_tf=False):
    """Seed the random number generators used by this notebook.

    Parameters
    ----------
    SEED : int
        Value used to seed NumPy's global RNG and Python's ``random`` module.
        It is also exported as the ``PYTHONHASHSEED`` environment variable.
    use_tf : bool, default False
        When true, TensorFlow is imported and its global seed is set too.

    Raises
    ------
    ImportError
        If ``use_tf`` is true and TensorFlow is not installed.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this export only influences processes launched from here on.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # This variable is an on/off switch, not a seed: it must be '1' (or 'true')
    # to request deterministic cuDNN kernels.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported lazily so the notebook does not need TensorFlow unless it
        # is explicitly requested.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')

### Data load

In [None]:
# Run-wide settings. DEBUG is defined here but not read anywhere else in the
# visible notebook.
RANDOM_SEED, DEBUG = 42, True
seeding(RANDOM_SEED)

# Read the competition files in the order: train, test, sample submission.
train, test, submission = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
        '../input/tabular-playground-series-feb-2022/sample_submission.csv',
    ),
)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
## display missing data
# Per-column count and fraction of null cells, largest first.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

### Encode target

In [None]:
# Move the label column out of the feature frame: `pop` returns the column
# and removes it from `train` in place. Re-running this cell without reloading
# `train` fails because the column is already gone.
target = train.pop('target')

In [None]:
# Map the raw labels to integer codes. The fitted encoder is kept so that
# predictions can be translated back to the original labels later on.
encoder = LabelEncoder()
target = pd.DataFrame({'target': encoder.fit_transform(target)}).astype(int)

In [None]:
# Bar chart of how many rows fall into each encoded target class.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x='target', data=target, ax=ax)
# Rotate the labels after plotting so the setting applies to the ticks that
# countplot creates. Matplotlib expects a number here (or 'vertical' /
# 'horizontal'), not the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Target', fontsize=15)
plt.show()

### Getting more info about features after EDA:

https://www.kaggle.com/vladlee/tps-feb-2022-simple-eda

Please be patient: the linked notebook takes a while to load. :)

In [None]:
# Hand-maintained column lists, presumably copied from the EDA notebook linked
# above (verify against it when the data changes). The train and test lists
# are kept separately and merged in the next cell.

# Training-set columns flagged as skewed.
train_skewed_cols = ['A0T0G0C10','A0T0G1C9','A0T0G2C8','A0T0G8C2','A0T0G9C1','A0T0G10C0','A0T1G1C8','A0T1G8C1','A0T1G9C0','A0T2G0C8',
    'A0T2G8C0','A0T3G0C7','A0T3G7C0','A0T10G0C0','A1T0G0C9','A1T0G8C1','A1T0G9C0','A1T1G0C8','A1T1G8C0','A2T0G0C8','A2T0G8C0',
    'A2T1G0C7','A3T0G0C7','A3T0G7C0','A10T0G0C0']

# Training-set columns to be treated as categorical.
train_categorical_cols = ['A0T0G9C1', 'A0T0G10C0', 'A0T1G0C9', 'A0T1G9C0', 'A0T2G0C8', 'A0T2G8C0', 'A0T10G0C0', 'A1T0G0C9', 'A1T0G9C0',
    'A1T1G8C0','A2T0G0C8','A2T0G8C0','A10T0G0C0']

# Test-set columns flagged as skewed.
test_skewed_cols = ['A0T0G0C10','A0T0G1C9','A0T0G2C8','A0T0G8C2','A0T0G9C1','A0T0G10C0','A0T1G1C8','A0T1G8C1',
    'A0T1G9C0','A0T2G0C8','A0T2G1C7','A0T2G8C0','A0T3G0C7','A0T3G7C0','A0T9G1C0','A1T0G0C9',
    'A1T0G8C1','A1T0G9C0','A1T1G0C8','A2T0G0C8','A2T0G8C0','A3T0G0C7','A3T0G7C0','A9T0G0C1','A10T0G0C0']

# Test-set columns to be treated as categorical.
test_categorical_cols = [ 'A0T0G0C10','A0T0G1C9','A0T0G9C1','A0T0G10C0','A0T1G0C9','A0T1G9C0','A0T10G0C0',
    'A1T0G0C9','A1T0G9C0','A10T0G0C0']

In [None]:
# Union of the train and test lists. dict.fromkeys drops duplicates while
# keeping each name at the position of its first occurrence (train entries
# first, then any names that only appear in the test list).
skewed_cols = list(dict.fromkeys(train_skewed_cols + test_skewed_cols))
categorical_cols = list(dict.fromkeys(train_categorical_cols + test_categorical_cols))

print(skewed_cols)
print()
print(categorical_cols)

In [None]:
# Square every flagged column exactly once. Squaring also removes negatives,
# which is the stated reason for transforming the categorical columns. Several
# names appear in both lists, so the two lists are merged first (order kept,
# duplicates dropped) to avoid squaring those columns a second time.
cols_to_square = list(dict.fromkeys(skewed_cols + categorical_cols))

train[cols_to_square] = train[cols_to_square]**2
test[cols_to_square] = test[cols_to_square]**2

In [None]:
# Remove the identifier column from both feature frames, in place.
for frame in (train, test):
    frame.drop(columns=['row_id'], inplace=True)

In [None]:
## doublecheck
# Recompute the null summary on the transformed training frame.

total = train.isna().sum().sort_values(ascending=False)
percent = train.isna().sum().div(train.isna().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

### Model

In [None]:
# Settings passed to run_train() further down in this cell.
TOTAL_SPLITS = 10  # number of cross-validation folds
N_REPEATS = 1  # not referenced anywhere else in the visible notebook
NUM_BOOST_ROUND = 100  # upper bound on boosting iterations per fold
EARLY_STOPPING_ROUNDS = 50  # patience handed to lgb.early_stopping
VERBOSE_EVAL = 100  # logging period handed to lgb.log_evaluation

def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds):
    """Train one LightGBM booster per stratified fold and score it on its hold-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame or pandas.Series
        Labels aligned with ``X``; also used to stratify the folds.
    run_params : dict
        Parameters forwarded to ``lgb.train``.
    splits : int
        Number of folds for ``StratifiedKFold``.
    num_boost_round : int
        Maximum number of boosting iterations per fold.
    verbose_eval : int
        Logging period passed to ``lgb.log_evaluation``.
    early_stopping_rounds : int
        Patience passed to ``lgb.early_stopping``.

    Returns
    -------
    scores : list of float
        Hold-out accuracy of each fold.
    models : list
        The booster trained on each fold.
    y_preds : list of numpy.ndarray
        Predicted class indices for each fold's hold-out rows.

    Notes
    -----
    The categorical feature names are read from the module-level
    ``categorical_cols`` list.
    """
    scores = []
    models = []
    y_preds = []
    # Filled by the record_evaluation callback; the same dict is shared by all
    # folds and is not part of the return value.
    eval_results = {}
    folds = StratifiedKFold(n_splits=splits)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model = lgb.train(
            run_params,
            train_set=lgb.Dataset(X_train, y_train),
            num_boost_round=num_boost_round,
            valid_sets=[lgb.Dataset(X_valid, y_valid)],
            # One name per entry in valid_sets, so the hold-out metrics are
            # logged under "valid" rather than under a misleading "train".
            valid_names=["valid"],
            categorical_feature=categorical_cols,
            callbacks=[
                lgb.log_evaluation(verbose_eval),
                lgb.early_stopping(early_stopping_rounds, first_metric_only=False, verbose=True),
                lgb.record_evaluation(eval_result=eval_results),
            ],
        )

        # Pick the most probable class straight from the predicted
        # probabilities. Rounding first would zero every row in which no class
        # reaches 0.5 and make argmax fall back to class 0.
        y_hat = np.asarray(model.predict(X_valid)).argmax(axis=1)

        score = accuracy_score(y_valid, y_hat)
        print(f'Accuracy score: {score}')

        y_preds.append(y_hat)
        models.append(model)
        scores.append(score)
    return scores, models, y_preds


# LightGBM settings for the 10-class problem; both metrics are evaluated on
# the validation data of every fold.
run_params = {
    'verbosity': -1,
    'num_class' : 10,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': ['multi_logloss', 'multi_error'],
    'force_col_wise' : True,
    'eta': 0.13,
}


# Cross-validated training; keeps the per-fold scores, boosters and hold-out
# predictions for the cells below.
scores, models, y_preds = run_train(
    X=train,
    y=target,
    run_params=run_params,
    splits=TOTAL_SPLITS,
    num_boost_round=NUM_BOOST_ROUND,
    verbose_eval=VERBOSE_EVAL,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)

### Classification Report

In [None]:
# One report per fold model, computed on the complete training frame.
# NOTE(review): each model was fitted on most of these rows, so the figures
# are in-sample and will look better than the hold-out accuracy printed
# during training.
class_names = encoder.classes_
for fitted in models:
    class_proba = fitted.predict(train)
    predicted_labels = class_proba.argmax(axis = 1)
    print(classification_report(target, predicted_labels, target_names=class_names))

### Submit

In [None]:
# Soft voting: average the class-probability matrices of all fold models and
# only then pick the most probable class for each row. Class indices are
# nominal codes, so averaging the predicted labels themselves (e.g. 2 and 9
# giving 5) would produce classes that no model voted for.
predicted = [model.predict(test) for model in models]

test_pred = np.mean(predicted, axis=0).argmax(axis=1)

In [None]:
# Translate the predicted class indices back to the original labels, store
# them in the submission frame and write it to disk.
submission = submission.assign(target=encoder.inverse_transform(test_pred))
submission.to_csv('submission.csv', index=False, float_format='%.6f')
submission.head(20)