<center><h3> Importing libraries </h3></center>

In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

<center><h3> Exploratory data analysis (EDA) </h3></center>

In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
# Later cells expect this frame to contain a `target` column.
df_train = pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Read the test CSV; predictions for these rows are generated further down.
df_test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
def df_stats(df):
    """Summarise every column of ``df``: cardinality, missing values and zeros.

    The statistics are computed with vectorised column-wise operations rather
    than ``DataFrame.agg`` with a list of anonymous lambdas, so the result does
    not depend on how pandas dispatches or labels unnamed aggregation functions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns:

        * ``"Unique"`` / ``"Percentage of unique"`` - number of distinct
          non-missing values, and that number relative to the row count.
        * ``"NaNs"`` / ``"Percentage of NaNs"`` - number and share of missing
          cells.
        * ``"Null values"`` / ``"Percentage of nulls"`` - number and share of
          cells equal to ``0``.  Despite the label these are zeros, not
          missing values; the labels are kept so existing consumers of the
          table keep working.

        Percentages are relative to ``len(df)``.  For a frame without rows the
        percentages are NaN instead of raising ``ZeroDivisionError``.
    """
    n_rows = len(df)

    unique_counts = df.nunique()
    nan_counts = df.isna().sum()
    zero_counts = df.isin([0]).sum()

    def as_percentage(counts):
        # Series division yields NaN for 0 / 0, so an empty frame is harmless.
        return counts / n_rows * 100

    return pd.DataFrame(
        {
            "Unique": unique_counts,
            "Percentage of unique": as_percentage(unique_counts),
            "NaNs": nan_counts,
            "Percentage of NaNs": as_percentage(nan_counts),
            "Null values": zero_counts,
            "Percentage of nulls": as_percentage(zero_counts),
        }
    )

In [None]:
# Per-column summary table for the training frame.
df_stats(df_train)

In [None]:
# Same per-column summary for the test frame, for comparison with the training one.
df_stats(df_test)

<center><h3> Simple pipeline </h3></center>

In [None]:
# df_train = df_train.drop(columns="cat10")
# df_test = df_test.drop(columns="cat10")

In [None]:
# Split the column names by storage dtype, using a boolean mask over `dtypes`.
# This has to run before the smoothing cell: once the object columns have been
# replaced by mapped values they no longer match "object".
categorical = df_train.columns[(df_train.dtypes == "object").to_numpy()].to_numpy()
numeric = df_train.columns[(df_train.dtypes == "float64").to_numpy()].to_numpy()

In [None]:
def smooth(train, test, categorical, weight=10):
    """Target-encode categorical columns with additive smoothing.

    For every column in ``categorical`` each category is replaced by a blend
    of its own mean target and the overall mean target::

        (count * category_mean + weight * global_mean) / (count + weight)

    so categories with few rows are pulled towards the global mean.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain a ``"target"`` column and every column
        named in ``categorical``.
    test : pandas.DataFrame
        Frame encoded with the statistics learned from ``train``.
    categorical : iterable of str
        Names of the columns to encode.
    weight : float, default 10
        Strength of the prior: the number of pseudo-observations carrying the
        global mean that are added to every category.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``train`` and ``test``.  The inputs are left
        untouched, so callers must use the returned frames.

    Notes
    -----
    Values that have no encoding (categories absent from ``train``, or
    missing values) receive the global mean instead of NaN.
    """
    # Global prior; computed once and never rebound inside the loop.
    prior = train["target"].mean()

    # Work on copies so re-running the calling cell starts from the same inputs.
    train = train.copy()
    test = test.copy()

    for column in categorical:
        stats = train.groupby(column)["target"].agg(["count", "mean"])
        counts = stats["count"]
        encoding = (counts * stats["mean"] + weight * prior) / (counts + weight)

        train[column] = train[column].map(encoding).fillna(prior)
        test[column] = test[column].map(encoding).fillna(prior)

    return train, test

In [None]:
 # Rebind both frames to the versions whose categorical columns hold the numeric
 # per-category values that smooth() derives from the training target.
 df_train, df_test = smooth(df_train, df_test, categorical)

In [None]:
# Shuffle the rows with a fixed seed so the order is reproducible, then
# separate the label from the remaining columns.
df_train = shuffle(df_train, random_state=142)
y = df_train["target"]
X = df_train.drop(columns=["target"])

In [None]:
# Hyper-parameters for an XGBoost classifier evaluated with AUC.
# NOTE(review): `xgb_cool` is rebound to an XGBRegressor in the next cell, so
# when the notebook runs top to bottom the estimator built here is not the one
# placed in the pipeline.
args = {
    "max_depth": 5,
    "learning_rate": 0.09,
    "n_estimators": 1340,
    "min_child_weight": 103,
    "gamma": 0.000108,
    "alpha": 0.0165,
    "lambda": 0.0196,
    "colsample_bytree": 0.42,
    "subsample": 0.658,
    "eval_metric": "auc"
}

xgb_cool = xgb.XGBClassifier(**args)

In [None]:
# Second parameter set. Because `xgb_cool` is reassigned here, this is the
# estimator that ends up in the pipeline.
# NOTE(review): a logistic objective on XGBRegressor looks deliberate -
# presumably so that predict() returns scores rather than class labels, which
# is what an AUC-scored submission needs; confirm.
params = {
    "objective": "binary:logistic",
    "grow_policy": "lossguide",
    "eval_metric": "auc",
    "min_child_weight": 20,
    "colsample_bytree": 0.3,
    "subsample": 0.7,
    "n_estimators": 4500,
    "learning_rate": 0.1,
    "n_jobs": -1,
}
xgb_cool = xgb.XGBRegressor(**params)

In [None]:
# Preprocessing: one-hot encode the `categorical` columns and standardise the
# `numeric` ones.
# NOTE(review): by the time this runs, smooth() has already replaced the
# `categorical` columns with numeric encodings, so the one-hot step is applied
# to those encoded values rather than to the raw categories - confirm that
# this is intended.
column_transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("scaling", StandardScaler(), numeric),
    ]
)

# Chain preprocessing and the estimator so both are fitted with a single call.
# The step names are runtime keys and are kept verbatim.
pipeline = Pipeline(
    [
        ("onehot_scaling", column_transformer),
        ("xlassifier", xgb_cool),
    ]
)

In [None]:
#cross_val_score(pipeline, X, y, scoring="roc_auc", cv=5)

In [None]:
# Fit the preprocessing steps and the model together on the full training data.
pipeline.fit(X, y)

In [None]:
# Score the (already encoded) test frame with the fitted pipeline.
y_pred = pipeline.predict(df_test)

In [None]:
# The sample submission supplies the output layout; its `target` column is
# overwritten with the predictions below.
df_submission = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
# Positional assignment: assumes the sample submission rows are in the same
# order as the rows of test.csv - TODO confirm the ids line up.
df_submission["target"] = y_pred

In [None]:
# Write the submission without the row index. `index` is documented as a
# boolean, so pass False explicitly instead of relying on None being falsy.
df_submission.to_csv("submission_6.csv", index=False)