In [None]:
import os

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

In [None]:
# Random seed; passed to the XGBoost parameters further down (the 'seed' entry).
seed = 1234

In [None]:
# List every file available under the Kaggle input directory.
# (`os` is imported together with the other dependencies in the first cell.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the three competition files, each indexed by the `id` column.
input_dir = "/kaggle/input/tabular-playground-series-jan-2021"
train, test, submission = (
    pd.read_csv(f"{input_dir}/{name}.csv", index_col="id")
    for name in ("train", "test", "sample_submission")
)

In [None]:
# Take a look at the first rows of the training data
train.head()

In [None]:
# Column dtypes and non-null counts of the training set (shows whether values are missing)
train.info()

In [None]:
# Column dtypes and non-null counts of the test set
test.info()

**We can see there are no missing values in either set**

In [None]:
# Plot distributions of all features of both datasets.
# NOTE(review): the plotting code below is commented out, so this cell produces no
# figure on a fresh run; re-enable it to reproduce the plot behind the conclusion
# that follows. When restoring it, note that `sns.distplot` is deprecated in recent
# seaborn releases (`sns.histplot` / `sns.kdeplot` are the replacements), and
# `plt.tight_layout()` only needs to be called once, after the loop.

# fig, axes = plt.subplots(7, 2, figsize=(15,15))
# for ax, col in zip(axes.ravel(), train.columns[:-1]):
#     sns.distplot(train[col], label='train', hist_kws={"alpha": 0.2}, ax=ax)
#     sns.distplot(test[col], label='test', color='red', hist_kws={"alpha": 0.2}, ax=ax)
#     ax.legend()
#     plt.tight_layout()
# plt.show()

**Both sets have similar distributions of features**

In [None]:
# Plot boxplots for all features to find outliers.
# NOTE(review): the code below is commented out, so no boxplots are drawn on a fresh
# run; re-enable it to reproduce the observation that follows. When restoring it,
# `plt.legend()` can probably be dropped - the boxes are already named via `labels=`.

# for col in train.columns[:-1]:
#     plt.boxplot([train[col], test[col]], labels=['train', 'test'])
#     plt.title(col)
#     plt.legend()
#     plt.show()

**Both sets have outliers in the features `cont7` and `cont9`, and the `train` dataset also has outliers in `cont10`.<br>Let's count them.**

In [None]:
# Count IQR outliers per column of the training set: a value is flagged when it
# lies more than 1.5 * IQR below the first or above the third quartile.
Q1_train = train.quantile(0.25)
Q3_train = train.quantile(0.75)
IQR_train = Q3_train - Q1_train
outliers_train = (train < Q1_train - 1.5*IQR_train) | (train > Q3_train + 1.5*IQR_train)
# Aggregators are given by name: passing the builtin `sum` is deprecated in
# recent pandas. 'sum' is the outlier count, 'mean' the outlier share per column.
outliers_train.agg(['sum', 'mean'])

In [None]:
# Count IQR outliers per column of the test set: a value is flagged when it
# lies more than 1.5 * IQR below the first or above the third quartile.
Q1_test = test.quantile(0.25)
Q3_test = test.quantile(0.75)
IQR_test = Q3_test - Q1_test
outliers_test = (test < Q1_test - 1.5 * IQR_test) | (test > Q3_test + 1.5 * IQR_test)
# Aggregators are given by name: passing the builtin `sum` is deprecated in
# recent pandas. 'sum' is the outlier count, 'mean' the outlier share per column.
outliers_test.agg(['sum', 'mean'])

In [None]:
def replace_outliers(data, columns=None, factor=1.5):
    """Replace IQR outliers with the column median and return a new DataFrame.

    A value counts as an outlier when it lies below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR`` of its own column, where ``Q1``/``Q3`` are the
    25 % / 75 % quantiles and ``IQR = Q3 - Q1``. Quantiles and the median are
    computed on the original (unmodified) column values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified.
    columns : iterable of column labels, optional
        Columns to process. Defaults to every column of ``data``.
    factor : float, default 1.5
        Multiplier of the IQR that defines the outlier fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the outliers of the selected columns replaced.
    """
    # Work on a copy so the caller's frame is never mutated and re-running a
    # notebook cell that calls this function stays idempotent.
    result = data.copy()
    if columns is None:
        columns = result.columns

    for col in columns:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < q1 - factor * iqr) | (values > q3 + factor * iqr)
        # Skip the assignment when nothing is flagged (avoids a needless write).
        if is_outlier.any():
            result.loc[is_outlier, col] = values.median()
    return result

In [None]:
# Replace outliers in the feature columns only. The target is left untouched:
# overwriting labels with the median would distort both the training signal and
# the cross-validation error, which is measured against these labels.
# NOTE(review): `test` is not passed through `replace_outliers`, so train and
# test features are preprocessed differently - confirm this is intended.
feature_cols = train.columns.drop('target')
train[feature_cols] = replace_outliers(train[feature_cols].copy())

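**The quantity of outliers in the datasets is quite small - about 2 % or less - so we could simply drop them. Here they are replaced with the column median instead (previous cell); the row-dropping variant is kept below, commented out.**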

In [None]:
# Alternative treatment (disabled): drop every row that contains at least one outlier.
# NOTE(review): dropping rows from `test` would leave fewer predictions than rows in
# `submission`, so the second line should presumably stay disabled in any case.
# train = train[~((train < Q1_train - 1.5*IQR_train) | (train > Q3_train + 1.5*IQR_train)).any(axis=1)]
# test = test[~((test < Q1_test - 1.5*IQR_test) | (test > Q3_test + 1.5*IQR_test)).any(axis=1)]

In [None]:
# Separate the regression target from the feature matrix.
target_col = 'target'
y = train[target_col]
X = train.drop(columns=[target_col])

In [None]:
# Hyper-parameters of the gradient-boosted regressor, then the estimator itself.
params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=6,
    gamma=3.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=100,
    seed=seed,
)
xgbr = XGBRegressor(**params)

In [None]:
# 5-fold cross-validation: fit on four folds, report the RMSE on the held-out
# fold, and accumulate the predictions for `test` so they can be averaged.
fold_errors = []
predictions = 0
kf = KFold(n_splits=5)
for num, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The same estimator object is refitted for every fold.
    model = xgbr
    model.fit(X_train, y_train)
    # RMSE is computed explicitly: the `squared=False` argument of
    # `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6.
    error = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print(f"{num} fold error: {error}")
    fold_errors.append(error)
    predictions += model.predict(test)

# Average of the per-fold RMSE values and of the per-fold test predictions.
mean_error = sum(fold_errors) / kf.get_n_splits(X)
print(f"Mean error: {mean_error}")
result_prediction = predictions / kf.get_n_splits(X)

In [None]:
# Put the fold-averaged predictions into the submission frame and write it out.
submission = submission.assign(target=result_prediction)
submission.to_csv('submission.csv')