In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import everything
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import preprocessing

from xgboost import XGBRegressor

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.simplefilter('ignore')
# %matplotlib inline
# plt.style.use('bmh') # there are some other style as well like : bmh , ggplot

In [None]:
# Read the training split into memory and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/tabular-playground-series-aug-2021/train.csv'
)
data.head()

In [None]:
# Read the test split into memory and preview its first rows.
test = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/tabular-playground-series-aug-2021/test.csv'
)
test.head()

In [None]:
# Full per-column summary (dtype and non-null count) of the training frame.
data.info(show_counts = True , verbose = True)

In [None]:
# f1 , f16 , f27 , f55 , f86
# instead of taking column name manually we can do code
data_int = data.select_dtypes(include = ['int64']).columns
print(data_int)

# Report the number of distinct values of every integer-typed feature.
# The columns come from `data_int` rather than a hand-written index list, so the
# loop cannot drift from the data; `id` and `loss` are skipped because they are
# not features (they are excluded from the feature list used for training too).
for s in data_int:
    if s in ('id' , 'loss'):
        continue
    # Label each count with its column so the output is readable on its own.
    print(f"{s} unique : {len(data[s].unique())}")

In [None]:
# Number of distinct values taken by the target column.
print(data.loss.unique().size)

In [None]:
# Histogram of the target with a KDE overlay, drawn on an explicit 10x10 inch axes.
fig , ax = plt.subplots(figsize = (10,10))
sns.histplot(data.loss , kde = True , ax = ax)
ax.set_xlabel('loss' , fontsize = 20)
plt.show()

In [None]:
# How many training rows have a loss of at least 35 (counted via a boolean mask).
print(int((data.loss >= 35).sum()))

In [None]:
# From the graph above we can see that loss is skewed, so we'll use StratifiedKFold
# data.hist(figsize = (25,50) , xlabelsize = 8 , ylabelsize = 8)

In [None]:
def create_fold(data , n_splits = 5 , random_state = 42):
    """Return a shuffled copy of ``data`` with a ``kfold`` column of fold ids.

    Rows are shuffled, then assigned to validation folds with
    ``StratifiedKFold`` using the ``loss`` column as the stratification label.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain a ``loss`` column. It is not modified.
    n_splits : int, default 5
        Number of folds to create.
    random_state : int or None, default 42
        Seed for the row shuffle so the fold assignment is reproducible.
        Pass ``None`` for a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` (index reset) whose ``kfold`` column holds
        the validation-fold id (``0 .. n_splits - 1``) of each row.
    """
    # Shuffle into a new frame *before* adding the column, so the caller's
    # DataFrame never gains a stray `kfold` column as a side effect.
    shuffled = data.sample(frac = 1.0 , random_state = random_state).reset_index(drop = True)
    shuffled['kfold'] = -1

    KF = model_selection.StratifiedKFold(n_splits = n_splits)

    # Only the validation indices are needed: each row belongs to exactly one
    # validation fold, and that fold id is what gets recorded.
    for fold , (_ , valid_idx) in enumerate(KF.split(X = shuffled , y = shuffled.loss)):
        shuffled.loc[valid_idx , 'kfold'] = fold

    return shuffled

In [None]:
# Preview the first five rows of the training frame.
data.head(n = 5)

In [None]:
# Remove the identifier column from the test frame. `errors = 'ignore'` keeps this
# cell idempotent: re-running it after `id` is already gone no longer raises KeyError.
test = test.drop(columns = ['id'] , errors = 'ignore')

In [None]:
def run(data , fold , test ,Y_PRED):
    """Train on every fold except ``fold``, score on ``fold``, and add test predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with ``kfold`` and ``loss`` columns; every column other
        than ``id``, ``kfold`` and ``loss`` is used as a feature.
    fold : int
        Fold id held out for validation.
    test : pandas.DataFrame
        Test frame containing the feature columns. It is not modified.
    Y_PRED : numpy.ndarray
        Array of shape ``(len(test), 1)``; this fold's test predictions are
        added to it in place.

    Returns
    -------
    float
        Root-mean-squared error on the held-out fold (also printed).
    """
    data_train = data[data.kfold != fold].reset_index(drop = True)
    data_valid = data[data.kfold == fold].reset_index(drop = True)

    feature = [col for col in data.columns if col not in ('id' , 'kfold' , 'loss')]

    # Fit the scaler on the training rows only and reuse those statistics for the
    # validation and test rows. Re-fitting on each split would scale them
    # inconsistently with what the model was trained on.
    # The scaled values are kept in local arrays: writing them back into `test`
    # would rescale the caller's frame again on every fold.
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(data_train[feature])
    X_valid = scaler.transform(data_valid[feature])
    X_test = scaler.transform(test[feature])

    # NOTE(review): newer XGBoost releases deprecate tree_method='gpu_hist' in
    # favour of device='cuda' with tree_method='hist' - confirm against the
    # installed version before changing.
    model = XGBRegressor(n_estimators = 3000 , max_depth = 2 ,learning_rate = 0.05006731067627437, n_jobs = 4 , tree_method = 'gpu_hist' )
    model.fit(X_train , data_train.loss)

    y_pred = model.predict(X_valid)

    # Accumulate in place so the caller can average over folds afterwards.
    Y_PRED += model.predict(X_test).reshape((-1,1))

    # Take the square root explicitly: the `squared = False` argument is deprecated
    # in recent scikit-learn releases, while this form works on every version.
    error = np.sqrt(metrics.mean_squared_error(data_valid.loss , y_pred))

    print(f"Error : {error}")

    return error

# Cross-validation driver: assign folds, train one model per fold, and average the
# per-fold test predictions that `run` accumulates into Y_PRED.
n_folds = 5
Y_PRED = np.zeros(shape = (len(test) , 1) , dtype = float)
data = create_fold(data)
for fold_id in range(n_folds):
    run(data , fold_id , test , Y_PRED)
Y_PRED = Y_PRED / n_folds

In [None]:
# Load the submission template and fill in the averaged predictions.
sample = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
# Y_PRED is built as an (n_rows, 1) array, so flatten it to one value per row.
# Bracket assignment is used because attribute assignment (`sample.loss = ...`)
# silently creates a plain attribute instead of a column when the column is missing.
sample['loss'] = np.ravel(Y_PRED)
sample.head()

In [None]:
# Write the submission next to the notebook. The file is given a `.csv` extension
# so it is recognisable as a CSV file (the original name had no extension).
sample.to_csv('my_submission.csv' , index=False)