In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,auc,classification_report,confusion_matrix,mean_squared_error, precision_score, recall_score,roc_curve

import time
from numpy import absolute
import joblib
from datetime import datetime, timedelta

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found.
input_root = '/kaggle/input'
for folder, _subdirs, files_here in os.walk(input_root):
    for file_name in files_here:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


# XGBoost + GridSearchCV

In [2]:
# Load the train and test data; the `id` column becomes the index, so it is not a feature column.
train_data = pd.read_csv("/kaggle/input/30-days-of-ml/train.csv", low_memory=False, index_col='id')
test_data = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv", low_memory=False, index_col='id')
#submission_example = pd.read_csv("/kaggle/input/30-days-of-ml/sample_submission.csv", low_memory=False)

# Print column names, non-null counts and dtypes of the training frame.
train_data.info()

# Number of distinct values for every string-typed (object) column.
# Columns are selected by dtype rather than by position: the previous slice
# `columns[1:10]` started at position 1 and therefore skipped `cat0`, which is
# column 0 once `id` has been moved into the index.
object_cols = train_data.select_dtypes(include='object').columns
object_nunique = [train_data[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
#sorted(d.items(), key=lambda x: x[1])

# Split the training frame into the prediction target and the feature matrix.
# (`id` is already the index, so only the `target` column has to be dropped.)
y = train_data['target']
X = train_data.drop(columns=['target'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Report the shape of each part of the split.
for shape_label, split_part in (
    ("X_train_full shape: ", X_train_full),
    ("X_valid_full shape: ", X_valid_full),
    ("y_train shape: ", y_train),
    ("y_valid shape: ", y_valid),
):
    print(shape_label, split_part.shape)



# Look up each feature's dtype once, then bucket the columns by type.
column_dtypes = X_train_full.dtypes

# Categorical features: object-typed columns with fewer than 20 distinct values.
categorical_cols = [
    cname
    for cname, col_dtype in column_dtypes.items()
    if col_dtype == "object" and X_train_full[cname].nunique() < 20
]
print("Categorical columns: ", categorical_cols)

# Numerical features: columns stored as int64 or float64.
numerical_cols = [
    cname
    for cname, col_dtype in column_dtypes.items()
    if col_dtype in ['int64', 'float64']
]
print("numerical columns: ",numerical_cols)

# Final feature list: categorical columns first, then numerical ones.
my_cols = categorical_cols + numerical_cols

# Restrict every data set to the selected features; copies keep the source frames untouched.
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()
X_test = test_data.loc[:, my_cols].copy()

print("X_test shape: ",X_test.shape)

# Numerical features: impute with the 'constant' strategy. No fill_value is passed,
# so the imputer falls back to its own default fill value.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer unchanged for the validation and test data.
X_train_transformed = preprocessor.fit_transform(X_train)
X_valid_transformed = preprocessor.transform(X_valid)
X_test_transformed = preprocessor.transform(X_test)

# Define the model
xgb_model = XGBRegressor()
#xgb_model.get_params()

# Every hyper-parameter lists exactly one candidate, so the "grid" contains a single
# configuration: GridSearchCV here performs a 10-fold cross-validation of that
# configuration followed by a refit on the full training split.
xgb_params = {'objective': ['reg:squarederror'],  # current name for the deprecated 'reg:linear' alias
              'n_estimators': [30000],  # upper bound; early stopping below ends training sooner
              'learning_rate': [0.01],
              'subsample': [0.8],
              'max_depth': [6],
              'min_child_weight': [11],
              #'reg_lambda': [68.1],
              #'reg_alpha': [15.7],
              'random_state': [42],
              'tree_method': ['gpu_hist'],
              'verbosity': [0],  # sole logging switch; the obsolete 'silent' flag was dropped
              }
gs = GridSearchCV(xgb_model,
                  cv=10,
                  scoring='neg_root_mean_squared_error',
                  verbose=0,
                  param_grid=xgb_params)

start = time.time()

# The extra keyword arguments are forwarded to XGBRegressor.fit for every fold and for the refit.
# NOTE(review): the hold-out split drives early stopping here, so any score computed
# on that same split afterwards is optimistic.
# NOTE(review): newer XGBoost releases appear to expect `early_stopping_rounds` on the
# estimator rather than in fit() - confirm against the installed version.
gs.fit(X_train_transformed, y_train,
       early_stopping_rounds=5,
       eval_set=[(X_valid_transformed, y_valid)],
       verbose=False)

stop = time.time()
# Wall-clock duration of the whole search, truncated to whole seconds.
print("Training time = ", timedelta(seconds=int(stop-start)))

# Score the refit best estimator on the (already preprocessed) hold-out split.
preds_valid = gs.best_estimator_.predict(X_valid_transformed)
# Despite its name, `mse_score` holds the ROOT mean squared error (name kept so that
# later references keep working). The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in recent scikit-learn.
mse_score = np.sqrt(mean_squared_error(y_valid, preds_valid))
print(mse_score)

# Use the model to generate predictions
predictions = gs.best_estimator_.predict(X_test_transformed)
print(gs.best_estimator_)
print(gs.best_params_)
print(gs.best_score_)  # mean cross-validated score, i.e. the NEGATIVE RMSE given the scoring used

# Build the submission file. The identifier column is written as lower-case 'id',
# the name the data files use for it (they are loaded with index_col='id');
# the previous 'Id' header did not match it.
# NOTE(review): assumes the competition expects exactly this header - confirm
# against sample_submission.csv.
output = pd.DataFrame({'id': X_test.index,
                       'target': predictions})
output.to_csv('xgboost_gs_tuned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 1 to 499999
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cat0    300000 non-null  object 
 1   cat1    300000 non-null  object 
 2   cat2    300000 non-null  object 
 3   cat3    300000 non-null  object 
 4   cat4    300000 non-null  object 
 5   cat5    300000 non-null  object 
 6   cat6    300000 non-null  object 
 7   cat7    300000 non-null  object 
 8   cat8    300000 non-null  object 
 9   cat9    300000 non-null  object 
 10  cont0   300000 non-null  float64
 11  cont1   300000 non-null  float64
 12  cont2   300000 non-null  float64
 13  cont3   300000 non-null  float64
 14  cont4   300000 non-null  float64
 15  cont5   300000 non-null  float64
 16  cont6   300000 non-null  float64
 17  cont7   300000 non-null  float64
 18  cont8   300000 non-null  float64
 19  cont9   300000 non-null  float64
 20  cont10  300000 non-null  float64
 21  cont11  30