# Import


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os

# Read data

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Directory holding this competition's CSV files.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

# Load the training set, using the 'id' column as the index, and preview it.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head())

# Load the test set the same way and preview it.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head())

# Preprocessing

In [None]:
# Separate the label from the features; `train` keeps only feature columns.
target = train.pop('target')

# Feature selection: keep text (object) columns with fewer than 10 distinct
# values in train, plus int64/float64 columns. Everything else, including
# high-cardinality text columns, is dropped.
low_cardinality_cols = [
    cname for cname in train.columns
    if train[cname].dtype == "object" and train[cname].nunique() < 10
]
numeric_cols = [
    cname for cname in train.columns
    if train[cname].dtype in ['int64', 'float64']
]
my_cols = low_cardinality_cols + numeric_cols

# One-hot encode the selected text columns (via get_dummies); numeric columns
# pass through unchanged.
train = pd.get_dummies(train[my_cols])
test = pd.get_dummies(test[my_cols])

# get_dummies only creates columns for the categories present in each frame,
# so train and test can end up with different columns. Compute what each side
# lacks in *sorted* order: iterating a set of strings can yield a different
# order from one interpreter session to the next, which would reorder the
# feature columns and hurt reproducibility.
missingCol_test = sorted(set(train.columns).difference(test.columns))
missingCol_train = sorted(set(test.columns).difference(train.columns))

# Shared layout: train's own columns first, then the test-only columns.
# reindex adds any absent column filled with 0 and also handles the case
# where nothing is missing.
feature_cols = list(train.columns) + missingCol_train
train = train.reindex(columns=feature_cols, fill_value=0)
test = test.reindex(columns=feature_cols, fill_value=0)

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    train, target, train_size=0.80, test_size=0.2, random_state=0
)

# Models

In [None]:
# XGBoost
# Gradient-boosted regressor: at most 500 trees, learning rate 0.1, fixed seed.
# Early stopping is configured on the estimator rather than in fit():
# passing `early_stopping_rounds` to fit() was deprecated in XGBoost 1.6 and
# removed in 2.0, where it raises TypeError.
# NOTE(review): the constructor argument requires XGBoost >= 1.6 — confirm the
# version available in the runtime image.
my_model = XGBRegressor(
    random_state=0,
    n_estimators=500,
    learning_rate=0.1,
    early_stopping_rounds=5,
)

# Training stops once the metric on the validation set has not improved for
# 5 consecutive boosting rounds.
my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

# Evaluation

In [None]:
# Score the fitted model on the held-out validation rows.
predictions = my_model.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the value is the same either way, but following the documented order
# keeps this line safe to adapt to a non-symmetric metric later.
mae = mean_absolute_error(y_valid, predictions)

print("Mean Absolute Error:", mae)

# Submission

In [None]:
# Predict the target for every row of the aligned test set.
preds_test = my_model.predict(test)

# Build the submission table: one row per test id with its prediction.
# The identifier column is written as 'id' to match the column name used by
# the input CSVs (they are loaded with index_col='id').
# NOTE(review): confirm the header against the competition's sample submission.
output = pd.DataFrame({'id': test.index,
                       'target': preds_test})

# index=False: the ids are already a regular column, so skip the row index.
output.to_csv('submission.csv', index=False)