In [None]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from time import time 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from math import sqrt
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
import warnings
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import explained_variance_score
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-aug-2021/train.csv',
        '/kaggle/input/tabular-playground-series-aug-2021/test.csv',
    )
)

In [None]:
# Preview the first three training rows.
train.head(n=3)

In [None]:
# Preview the first three test rows.
test.head(n=3)

In [None]:
# Overview of the training set: shape, per-column null counts, total null
# count, and the dtype/memory report.
train_null_counts = train.isnull().sum()  # computed once, reused for the total

print('training set details:')
print('-' * 21)
print('shape:')
print()
print(train.shape)
print('*' * 50)
print('null columns:')
print()
print(train_null_counts)
print('*' * 50)
print('total nulls:')
print()
print(train_null_counts.sum())
print('*' * 50)
print('info:')
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
train.info()

In [None]:
# Overview of the test set: shape, per-column null counts, total null count,
# and the dtype/memory report.
test_null_counts = test.isnull().sum()  # computed once, reused for the total

print('testing set details:')
print('-' * 20)
print('shape:')
print()
print(test.shape)
print('*' * 50)
print('null columns:')
print()
print(test_null_counts)
print('*' * 50)
print('total nulls:')
print()
print(test_null_counts.sum())
print('*' * 50)
print('info:')
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
test.info()

In [None]:
# Summary statistics for every numeric training column.
train_summary = train.describe()
train_summary

In [None]:
# Summary statistics of the target column, shown as a one-column table.
loss_summary = train['loss'].describe()
loss_summary.to_frame()

In [None]:
# Summary statistics for every numeric test column.
test_summary = test.describe()
test_summary

In [None]:
# Bar chart of how often each distinct `loss` value occurs in the training set.
# The Series is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, which would not count per loss value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x=train['loss'], ax=ax)
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(title='Distribution of loss', xlabel='loss', ylabel='count');

In [None]:
from sklearn.feature_selection import VarianceThreshold as vt
# Fit a variance filter (threshold 0.8) on positional columns 1..100 of `train`
# and report how many of them it keeps.
# NOTE(review): `cols` is not used by any later cell in the visible notebook.
target = train['loss']
features = train.iloc[:, 1:101]

# fit() returns the fitted selector, so construction and fitting can be chained;
# the target is accepted for API compatibility only.
v = vt(threshold=0.8).fit(features, target)

# Integer positions of the columns that pass the filter.
cols = v.get_support(indices=True)
cols.shape

In [None]:
# Model inputs: every test column except the identifier, and positional
# columns 1..100 of the training table.
x_test = test.drop(columns='id')
x_train = train.iloc[:, 1:101]

In [None]:
# Min-max scale both feature sets using ranges learned from the training data only.
mm = MinMaxScaler()
mm.fit(x_train)
x_train_mm, x_test_mm = (mm.transform(part) for part in (x_train, x_test))

In [None]:
# Wrap the min-max scaled arrays in DataFrames; the training frame also carries
# the target column (aligned on the row index).
xtest_data = pd.DataFrame(x_test_mm)
xtrain_data = pd.DataFrame(x_train_mm).assign(loss=train['loss'])

In [None]:
# Report the shapes of the two min-max scaled frames, one item per line.
print(
    'xtrain_data shape:',
    '',
    xtrain_data.shape,
    '*' * 50,
    'xtest_data shape:',
    '',
    xtest_data.shape,
    sep='\n',
)

In [None]:
# Standardise every feature (zero mean, unit variance) with statistics learned
# from the training data only, then attach the target to the training frame.
#
# StandardScaler already works column by column, so a single fit over all
# feature columns replaces one fit/transform per column in a Python loop.
num_cols = x_train.columns
scale = StandardScaler().fit(x_train[num_cols])

# Rebuild DataFrames so column names and the original row index are preserved.
# The test frame is selected with the training column order so both line up.
X_train1 = pd.DataFrame(
    scale.transform(x_train[num_cols]), columns=num_cols, index=x_train.index
)
X_test1 = pd.DataFrame(
    scale.transform(x_test[num_cols]), columns=num_cols, index=x_test.index
)

X_train1["loss"] = train["loss"]

In [None]:

# Randomly tag ~80% of the rows as "train" and ~20% as "valid".
# A dedicated, seeded generator makes the split (and therefore the reported
# validation score) reproducible across kernel restarts.
SPLIT_SEED = 42

# The guard keeps an existing assignment when the cell is re-run.
if "Set" not in X_train1.columns:
    split_rng = np.random.RandomState(SPLIT_SEED)
    X_train1["Set"] = split_rng.choice(
        ["train", "valid"], p=[.8, .2], size=(X_train1.shape[0],)
    )

# Index labels of the rows belonging to each partition.
train_indices = X_train1[X_train1.Set == "train"].index
valid_indices = X_train1[X_train1.Set == "valid"].index

In [None]:
# Model inputs are all columns except the split marker and the target.
target = 'loss'
unused_feature = ['Set']
excluded = set(unused_feature) | {target}
feature = [col for col in X_train1.columns if col not in excluded]

In [None]:
# Materialise the feature matrix and the target (as a single-column 2-D array) once.
# NOTE(review): the split indices are index labels used here as array
# positions; this presumes X_train1 has a default 0..n-1 index -- confirm.
feature_matrix = X_train1[feature].values
target_column = X_train1[target].values.reshape(-1, 1)

# Training partition.
X_train, y_train = feature_matrix[train_indices], target_column[train_indices]
# Validation partition.
X_valid, y_valid = feature_matrix[valid_indices], target_column[valid_indices]

In [None]:
# Shell install of torchvision followed by the torch import.
# NOTE(review): nothing in the visible notebook references torchvision, and
# `torch.optim` is already imported in an earlier cell, so torch has to be
# available before this install runs -- confirm whether this install is needed.
# NOTE(review): the install is unpinned; consider `%pip install` with an
# explicit version in a cell at the top of the notebook.
!pip install torchvision
import torch

In [None]:
# Shell install of pytorch-tabnet, then import of its regressor class.
# NOTE(review): the install is unpinned; consider `%pip install` with an
# explicit version in a cell at the top of the notebook so re-runs are reproducible.
!pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetRegressor
# Training budget and batch size used by the fit cell.
max_epochs = 1000
Bs = 2048

# TabNet hyper-parameters: Adam at lr=2e-2, with a StepLR schedule that
# multiplies the learning rate by 0.9 every 10 scheduler steps.
tabnet_config = dict(
    verbose=10,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 10, "gamma": 0.9},
    mask_type='sparsemax',
)
clf = TabNetRegressor(**tabnet_config)

In [None]:
# Train the regressor, scoring both partitions with RMSLE and RMSE.
# NOTE(review): pytorch-tabnet is understood to early-stop on the last
# eval_set / last eval_metric (here: 'valid' RMSE) -- confirm against its docs.
fit_options = dict(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['rmsle', 'rmse'],
    max_epochs=max_epochs,
    patience=50,
    batch_size=Bs,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)
clf.fit(**fit_options)

In [None]:
# Report the best cost recorded by the fitted model.
best_valid_score = clf.best_cost
print(f"BEST VALID SCORE  : {best_valid_score}")

In [None]:
# Despite its name, `test_pred` holds the standardised test FEATURES as a
# NumPy array; the model outputs are stored in `predictions`.
test_pred = X_test1.values
predictions = clf.predict(test_pred)

In [None]:
# Per-feature importances from the fitted model, plus the feature positions
# ordered from least to most important.
feat_importances = clf.feature_importances_
indices = feat_importances.argsort()

In [None]:
# Build the submission file from the sample template and the model output.
sample = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')

# Flatten the model output to a 1-D array instead of rebinding `predictions`
# to a DataFrame: the cell stays re-runnable and the values are assigned by
# position rather than by index alignment (which can silently produce NaN).
loss_pred = np.asarray(predictions).reshape(-1)
if len(loss_pred) != len(sample):
    raise ValueError(
        f"got {len(loss_pred)} predictions for {len(sample)} submission rows"
    )

sample['loss'] = loss_pred
sample.to_csv('tabnet_submission.csv', index=False)

# Show a preview rather than the whole frame.
sample.head()