# Tabular Playground Series Prediction

## Import Packages

In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Load datasets

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)

## Preview Data before Processing

In [None]:
# Show the first five rows of the raw training table.
train.head(n=5)

In [None]:
# (rows, columns) of the raw training table.
train.shape

In [None]:
# Summary statistics of the training table, transposed so each column is a row.
train.describe().T

There isn't an obvious correlation between features and target values.

Since only numerical values are present and there are no categorical features, it makes sense to consider regression models for fitting and training.

## Inspect the test set that will be fed to the trained model

In [None]:
# Summary statistics of the test table, transposed so each column is a row.
test.describe().T

In [None]:
# (rows, columns) of the raw test table.
test.shape

In [None]:
# Show the first five rows of the raw test table.
test.head(n=5)

## Data Preprocessing

## Drop id column

In [None]:
# Remove the id column from both tables so it is not fed to the model as a
# feature; the test ids are kept because the submission file needs them.
test_ids = test.pop("id")
del train["id"]


In [None]:
# Confirm the id column is gone from the training table.
train.head(n=5)

In [None]:
# Confirm the id column is gone from the test table.
test.head(n=5)

### Train Validation Split

In [None]:
# Fraction of the training rows held out for validation
# (other sizes are planned to be tried later).
validation_split = 0.3

# Split features and target together in one call so the resulting frames do
# not have to be mutated afterwards. A fixed random_state makes the split,
# and every validation score derived from it, reproducible across runs.
train_features, validation_features, train_targets, validation_targets = train_test_split(
    train.drop(columns="loss"),
    train["loss"],
    test_size=validation_split,
    random_state=0,
)

In [None]:
# Preview the training features after the split.
train_features.head(n=5)

In [None]:
# Preview the validation features after the split.
validation_features.head(n=5)

## Model Development
### Using XGBoost

### Model Evaluation

In [None]:
from xgboost import XGBRegressor

# Baseline: an XGBoost regressor with library-default hyperparameters,
# trained on the training portion of the split.
my_model = XGBRegressor()
my_model.fit(X=train_features, y=train_targets)

In [None]:
from sklearn.metrics import mean_absolute_error

# Score the baseline model on the held-out validation rows.
predictions = my_model.predict(validation_features)

# scikit-learn metrics take (y_true, y_pred); the arguments are passed in that
# order here to match score_model, even though MAE itself is symmetric.
print(f"Mean Absolute Error: {mean_absolute_error(validation_targets, predictions)}")

## Tune Hyperparameters

In [None]:
# Candidate hyperparameter settings to compare; each entry holds the keyword
# arguments for one XGBRegressor.
candidate_params = (
    {"n_estimators": 100, "learning_rate": 0.05},
    {"n_estimators": 200, "learning_rate": 0.1},
    {"n_estimators": 300, "learning_rate": 0.5},
    {"n_estimators": 300, "learning_rate": 1, "random_state": 0},
)

models = [XGBRegressor(**params) for params in candidate_params]

# Individual names are kept so a specific candidate can be picked later.
model_1, model_2, model_3, model_4 = models


In [None]:
def score_model(model):
    """Fit ``model`` on the training split and return its validation MAE.

    The validation split is passed to ``fit`` as ``eval_set`` together with
    ``early_stopping_rounds=3``; the same split is then used to compute the
    returned mean absolute error.

    NOTE(review): newer XGBoost releases expect ``early_stopping_rounds`` to
    be set on the estimator rather than passed to ``fit`` - confirm against
    the installed version.
    """
    fit_options = {
        "early_stopping_rounds": 3,
        "eval_set": [(validation_features, validation_targets)],
        "verbose": False,
    }
    model.fit(train_features, train_targets, **fit_options)
    validation_predictions = model.predict(validation_features)
    return mean_absolute_error(validation_targets, validation_predictions)

# Report each candidate's validation MAE. The score is a float, so it is
# printed with decimals: an integer format would truncate it and make
# candidates with close scores look identical.
for model_number, model in enumerate(models, start=1):
    mae = score_model(model)
    print(f"Model {model_number} MAE: {mae:.4f}")

## Select Best Model To Run

In [None]:
ideal_model=model_4.fit(train_features, train_targets, early_stopping_rounds=3,eval_set=[(validation_features,validation_targets)],
             verbose=False)
loss_pred=ideal_model.predict(test)

%matplotlib inline
import seaborn as sns

sns.lineplot(data=loss_pred, label=test_ids)

### Submission

In [None]:
# Pair each test id with its predicted loss (flattened to one dimension) and
# write the result as the submission file, without the DataFrame index.
submission_columns = {"id": test_ids, "loss": np.ravel(loss_pred)}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)