In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
import sklearn.exceptions
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
# Install an "ignore" filter for each warning category below, in the same
# order as before, to keep the notebook output readable.
for _category in (
    DeprecationWarning,
    FutureWarning,
    RuntimeWarning,
    UserWarning,
    sklearn.exceptions.UndefinedMetricWarning,
):
    warnings.filterwarnings('ignore', category=_category)

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/tabular-playground-series-aug-2021/train.csv',
        '/kaggle/input/tabular-playground-series-aug-2021/test.csv',
    ),
)

In [None]:
# Report the (rows, columns) shape of both frames, one per line.
print(
    f'Train shape: {train.shape}',
    f'Test shape: {test.shape}',
    sep='\n',
)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

**95 columns have the float64 dtype, whereas 7 columns have the int64 dtype.**

In [None]:
# Preview the first few rows of the training frame.
train.head()

In [None]:
# Drop the row identifier, which is not a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'id' is already gone no longer
# raises a KeyError.
train = train.drop(columns=['id'], errors='ignore')

# Target Column

In [None]:
# Summary statistics of the target column.
train['loss'].describe()

In [None]:
# Number of training rows per distinct target value, ordered by the value.
target_cnt = train['loss'].value_counts().sort_index()

# Draw on an explicit Axes and label it ourselves: seaborn would otherwise
# take the count-axis label from the Series name, which does not describe
# what is plotted, and the figure would have no title.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=target_cnt.index, y=target_cnt.values, ax=ax)
ax.set(
    title='Distribution of the target (loss)',
    xlabel='loss',
    ylabel='Number of rows',
);

**The target variable takes 43 unique values, and the value 0 (treating each value as a category) accounts for a considerable share of the rows.**

In [None]:
# Summary statistics of every column except the target.
train.drop(columns=['loss']).describe()

**The value ranges differ hugely between almost all features, so we will have to scale the data.**

In [None]:
# Separate the feature matrix from the target column.
train_features = train.drop(columns=['loss'])
target = train['loss']

In [None]:
# (rows, columns) of the feature matrix once the target has been removed.
train_features.shape

## Splitting our data into train and test set

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    target,
    test_size=0.2,
    random_state=42,
)

## Scaling Data

In [None]:
# Fit the scaler on the training split only, then apply the same statistics
# to both the training and the validation split.
ss = StandardScaler().fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [None]:
# Scale the competition test set with the already-fitted scaler, leaving out
# the 'id' column.
test_ss = ss.transform(test.drop(columns=['id']))

# Correlation Matrix

In [None]:
# Pairwise correlation between the features.
train_corr = train_features.corr()
# Boolean mask covering the upper triangle (diagonal included), so that only
# the lower triangle of the symmetric matrix is drawn.
train_mask = np.triu(np.ones_like(train_corr, dtype=bool))

fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(
    train_corr,
    mask=train_mask,
    cmap=sns.diverging_palette(240, 10),
    center=0,
    square=True,
    linewidth=0.2,
    annot=False,
    ax=ax,
)

**The features show almost no correlation with one another: nearly every cell of the heatmap has the same pinkish shade.**

# Training: XGBoost

In [None]:
# Baseline model: a small XGBoost regressor with squared-error objective and
# a fixed seed, trained on the scaled training split.
param_dist = dict(
    objective='reg:squarederror',
    n_estimators=5,
    seed=123,
)

xgb_rr = XGBRegressor(**param_dist)

xgb_rr.fit(X_train_ss, y_train, verbose=True)

In [None]:
# Predict the target for the scaled validation split with the baseline model.
y_preds = xgb_rr.predict(X_test_ss)

In [None]:
# Validation RMSE of the baseline model.
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# gives the same value on every scikit-learn version.
np.sqrt(mean_squared_error(y_test, y_preds))

# Feature Importance


In [None]:
# Plot the importance of each feature according to the fitted model on a
# tall figure so that all feature labels stay readable.
fig, ax = plt.subplots(figsize=(24, 34))
xgb.plot_importance(xgb_rr, ax=ax)

# Hyperparameter Tuning

In [None]:
# Candidate values for each hyperparameter sampled by the randomized search.
params = dict(
    n_estimators=[2, 3, 5, 10, 25, 50, 100, 150, 200],
    max_depth=np.arange(3, 12),
    min_child_weight=np.arange(1, 12),
    eta=[0.3, 0.2, 0.1, 0.05, 0.01, 0.005],
    subsample=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

In [None]:
# GPU-backed XGBoost regressor used as the estimator for the search.
# NOTE(review): XGBoost 2.0 deprecates `gpu_hist`, `gpu_id` and `predictor`
# in favour of `tree_method='hist', device='cuda'` — confirm against the
# installed XGBoost version before upgrading.
xgb_rr = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor="gpu_predictor",
)

# Randomized search: 20 sampled configurations, 3-fold cross-validation,
# scored by negated RMSE, with a fixed random_state for the sampling.
random_rmse = RandomizedSearchCV(
    xgb_rr,
    param_distributions=params,
    n_iter=20,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=123,
    verbose=2,
)

In [None]:
# Run the randomized search on the scaled training split
# (n_iter=20 sampled candidates, each evaluated with cv=3 folds).
random_rmse.fit(X_train_ss, y_train)

In [None]:
# The search scores with 'neg_root_mean_squared_error', so flip the sign to
# read the best cross-validated score as a positive RMSE.
-random_rmse.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
random_rmse.best_params_

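The parameters above are the best ones found for our XGBoost model. After hyperparameter optimization, the RMSE dropped from 7.95 to 7.89 (note that 7.95 was measured on the hold-out split, whereas 7.89 is the mean 3-fold cross-validation score, so the two figures are not strictly comparable).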

# Feature Elimination

In [None]:
# Hard-coded hyperparameters for the regressor that recursive feature
# elimination will wrap.
# NOTE(review): presumably copied from the randomized search output — verify
# against random_rmse.best_params_.
param_dist = dict(
    objective='reg:squarederror',
    n_estimators=50,
    seed=123,
    min_child_weight=6,
    subsample=0.8,
    max_depth=3,
    eta=0.2,
    colsample_bytree=0.7,
)

xgb_rr = XGBRegressor(**param_dist)

In [None]:
# Recursive feature elimination: keep the 75 features ranked most important
# by the XGBoost regressor, refitting as features are removed.
# `n_features_to_select` must be passed by keyword: scikit-learn >= 1.0
# rejects it as a positional argument with a TypeError.
rfe = RFE(estimator=xgb_rr, n_features_to_select=75)
rfe = rfe.fit(X_train_ss, y_train)

# Testing on Test data

In [None]:
# Predict the target for the scaled competition test set with the fitted RFE
# wrapper.
predictions = rfe.predict(test_ss)

# Submission

In [None]:
# Build the submission file: one row per test id with its predicted loss.
# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero, which shifts every positive prediction downward by up to 1.
submission = pd.DataFrame({
    'id': test['id'].to_numpy(),
    'loss': np.rint(predictions).astype(int),
})
submission.to_csv('my_submission.csv', index=False)