In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Quick early exploration of the data and an XGBoost (XGBRegressor) benchmark

In [None]:
# Read the competition training data, using the 'id' column as the index.
train_csv_path = "../input/tabular-playground-series-sep-2021/train.csv"
training_set = pd.read_csv(train_csv_path, index_col='id')

In [None]:
# Quick look at the data: descriptive statistics of the training set.
summary_stats = training_set.describe()
summary_stats

In [None]:
# Dimensions of the training set, displayed as (rows, columns).
n_rows, n_columns = training_set.shape
(n_rows, n_columns)

In [None]:
# Print the distinct dtypes that occur among the columns
# (the dtypes themselves, not how many there are).
distinct_dtypes = training_set.dtypes.unique()
print(distinct_dtypes)

In [None]:
# Count missing values per column: isna().sum() aggregates down each column,
# so every printed number belongs to one column, not one row.
# Author's observation: each column has roughly 15000 missing values.
na_counts = training_set.isna().sum()

# Temporarily raise the display limits so the listing is not truncated.
with pd.option_context('display.max_rows', 119, 'display.max_columns', 10):
    print(na_counts)

In [None]:
# Draw a 50-bin histogram for every column on one large figure.
# The features follow many different kinds of distributions, which will have
# to be addressed later.
# Binding the result to a name keeps the returned Axes out of the cell output.
hist_axes = training_set.hist(figsize=(20, 15), bins=50)

In [None]:
# Pairwise correlation between all columns (Pearson, which is pandas' default,
# spelled out here). Author's observation: the features show little
# correlation with one another.
corr = training_set.corr(method='pearson')
display(corr)

In [None]:
# Rank every column by its correlation with the target 'claim', strongest
# positive correlation first.
# Reuses the `corr` matrix computed in the previous cell instead of
# recomputing the full pairwise correlation of the whole training set.
# Author's observation: no feature is strongly correlated with the target.
corr[['claim']].sort_values(by='claim', ascending=False)

In [None]:
# Separate the target from the features.
# `pop` removes 'claim' from training_set in place and returns it, so X and
# training_set refer to the same, now target-free, DataFrame.
# Gotcha: re-running this cell without reloading the data raises KeyError,
# because 'claim' has already been removed.
y = training_set.pop('claim')
X = training_set

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV


from xgboost import XGBRegressor


# Columns routed through the numerical pipeline: every column whose dtype is
# not `object`.
num_cols = training_set.select_dtypes(exclude="object").columns

# Numerical preprocessing, applied in order:
#   1. impute missing values with a constant (no fill_value is given, so
#      SimpleImputer's default for numeric data, 0, is used),
#   2. standardise each feature,
#   3. map each feature onto a uniform distribution via its quantiles.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
    ('quantile_transformer', QuantileTransformer(random_state=1,
                                                 output_distribution='uniform')),
])

# Wrap the numerical transformer so it is applied to the numerical columns.
preprocessor = ColumnTransformer([
    ('numerical', num_transformer, num_cols),
])

# Full estimator: preprocessing followed by the XGBoost regressor.
# - `max_depth` was previously misspelt as `max_dept`, so the intended tree
#   depth was never applied to the booster.
# - `early_stopping_rounds` is intentionally not set: early stopping needs an
#   evaluation set passed to `fit`, and nothing supplies one through this
#   Pipeline, so the setting was either ignored or raised an error depending
#   on the installed xgboost version.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=100, max_depth=6,
                           verbosity=2,
                           learning_rate=0.1, tree_method='gpu_hist',
                           random_state=1)),
])


In [None]:
# Hyper-parameter search space. The `model__` prefix addresses the 'model'
# step of the pipeline.
# `early_stopping_rounds` is deliberately not searched: early stopping needs
# an evaluation set passed to `fit`, and the search fits the pipeline with
# only (X, y), so that dimension could not influence training.
param_grid = {
    'model__learning_rate': [0.1, 0.25, 0.5],
    'model__gamma': [0, .5, 1.0],
    'model__max_depth': np.arange(2, 16, step=2),
    'model__n_estimators': np.arange(50, 400, step=50),
    'model__min_child_weight': np.arange(2, 10, step=2),
}

# Randomized search: 100 sampled candidates, 3-fold CV, scored by R^2.
# `random_state=1` makes the sampled candidates reproducible, matching the
# seed used everywhere else in this notebook.
# NOTE(review): n_jobs=-1 runs fits in parallel while the model uses
# tree_method='gpu_hist'; confirm the parallel fits can share the GPU.
random_cv = RandomizedSearchCV(
    model, param_grid, n_iter=100, cv=3, scoring="r2", verbose=2, n_jobs=-1,
    random_state=1
)

In [None]:
# Hold out 20% of the rows for validation; the split is seeded and stratified
# on the target.
split_options = dict(train_size=0.8, test_size=0.2, random_state=1, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Run the randomized hyper-parameter search on the training portion only.
random_cv.fit(X_train, y_train)

# Report the best parameter combination found by the search.
print("Best params:\n")
best_params = random_cv.best_params_
print(best_params)

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out validation split with the fitted search object.
predictions = random_cv.predict(X_valid)

# mean_squared_error takes (y_true, y_pred); the arguments were previously
# passed in the opposite order. The value is the same for MSE, but the
# conventional order avoids surprises if the metric is swapped or weighted.
rmse = np.sqrt(mean_squared_error(y_valid, predictions))
print('RMSE:', rmse)

In [None]:
# Read the competition test data, indexed by 'id' like the training data.
test_csv_path = "../input/tabular-playground-series-sep-2021/test.csv"
testing_set = pd.read_csv(test_csv_path, index_col='id')

In [None]:
# Predict on the test set with the fitted search object.
final_pred = random_cv.predict(testing_set)

# Assemble the two-column submission (id, claim) and write it to disk without
# the DataFrame's own index.
submission_columns = {'id': testing_set.index, 'claim': final_pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

