In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyforest

In [None]:
import pyforest  ###Use major Python libraries without importing
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from collections import OrderedDict
from sklearn.model_selection import GridSearchCV

from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not only the last one.
# Cells below that list several bare expressions (e.g. consecutive `.head()` calls) rely on this.
InteractiveShell.ast_node_interactivity = "all"
import warnings

# NOTE(review): this silences every warning for the rest of the session, which also hides
# deprecation warnings from the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Load the training split of the competition data and preview the first rows.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-feb-2021/train.csv",
    sep=",",
)
train.head()

In [None]:
# Load the test split of the competition data and preview the first rows.
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-feb-2021/test.csv",
    sep=",",
)
test.head()

In [None]:
###Count missing values per column in the training data
train.isna().sum()

In [None]:
# Remove the row identifier from both splits.
# Note: `train_data` still contains the "target" column at this point.
train_data = train.drop(columns=["id"])
y_train_data = train["target"]
test_data = test.drop(columns=["id"])

In [None]:
# Preview the first rows of the training frame, the target series and the test frame.
train_data.head()
y_train_data.head()
test_data.head()

In [None]:
# (rows, columns) of the training frame and of the test frame.
train_data.shape
test_data.shape

In [None]:
# Column names of the training frame after dropping "id".
train_data.columns

In [None]:
# Group feature names by the substring they contain: "cat" marks the columns that are
# label-encoded below, "con" marks the columns treated as continuous.
all_columns = train.columns
cat_cols = all_columns[all_columns.str.contains("cat", regex=False)].tolist()
cont_cols = all_columns[all_columns.str.contains("con", regex=False)].tolist()

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the training
# values only and then reused for the test frame, so both share the same mapping.
for column in cat_cols:
    encoder = LabelEncoder().fit(train_data[column])
    train_data[column] = encoder.transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])

In [None]:
# Preview both frames again after the categorical columns were label-encoded.
train_data.head()
test_data.head()

In [None]:
###Histogram of the target variable (raw training data)

plt.figure(figsize=(12, 8))
sns.histplot(data=train_data, x="target")
plt.xlabel("Target variable")
plt.ylabel("Count")
plt.show();

In [None]:
###Summarize the target variable using boxplot
plt.figure(figsize=(12, 8))
# Pass the series explicitly as `x=`: a bare positional argument is bound to a different
# parameter depending on the seaborn version, which changes the plot orientation.
sns.boxplot(x=train_data["target"])
plt.xlabel("Target variable")
# No y-axis label: a horizontal box plot has no count axis.
plt.show();

In [None]:
###Function to remove outliers from training data using IQR

def remove_outlier(df_in, col_name):
    """Return the rows of `df_in` whose `col_name` value lies within the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. A row is an outlier only when
    its value is strictly below the lower fence or strictly above the upper fence;
    values exactly on a fence are kept. The returned frame is the exact complement
    of the reported outliers, so rows removed always equals outliers counted. This
    also means a column with IQR == 0 keeps its majority value instead of losing
    every row.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the numeric column used to compute the fences.

    Returns
    -------
    pandas.DataFrame
        Rows of `df_in` that are not outliers, with the original index preserved.
        Rows where `col_name` is NaN are not flagged as outliers and are kept.

    Side effects
    ------------
    Prints the quartiles, the IQR, the outlier count and the resulting shape.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    print("Q1: " + str(q1), "Q3: " + str(q3), "IQR: " + str(iqr))
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # One mask defines the outliers; the kept rows are its complement, so the
    # printed count and the returned frame can never disagree.
    is_outlier = (values < fence_low) | (values > fence_high)
    df_out = df_in.loc[~is_outlier]
    print(
        "Total outliers in the target variable in training dataset is "
        + str(int(is_outlier.sum()))
    )
    print("Shape of data after cleaning is " + str(df_out.shape))
    return df_out

In [None]:
###Filter target outliers out of the training data
train_data_cleaned = remove_outlier(df_in=train_data, col_name="target")

In [None]:
###Plot the target variable to confirm outliers have been removed
plt.figure(figsize=(12, 8))
sns.histplot(data=train_data_cleaned, x="target")
plt.xlabel("Target variable")
plt.ylabel("Count")
plt.show();

In [None]:
###Summary of the target variable after outlier removal
plt.figure(figsize=(12, 8))
# Pass the series explicitly as `x=`: a bare positional argument is bound to a different
# parameter depending on the seaborn version, which changes the plot orientation.
sns.boxplot(x=train_data_cleaned["target"])
plt.xlabel("Target variable")
# No y-axis label: a horizontal box plot has no count axis.
plt.show();

In [None]:
###Visualize the distribution of continuous variables
train_data_cleaned.hist(column=cont_cols, layout=(9, 3), figsize=(15, 30));

In [None]:
###Continuous features plus the target, used for the correlation matrix
corr_columns = cont_cols + ["target"]
data_for_corr = train_data_cleaned[corr_columns].copy()

In [None]:
###https://www.kdnuggets.com/2019/07/annotated-heatmaps-correlation-matrix.html
corr_matrix = data_for_corr.corr()
# Boolean mask that hides the upper triangle (diagonal included), so each pair is shown once.
# Uses the builtin `bool`: the `np.bool` alias no longer exists in current NumPy releases.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm")
plt.show();

In [None]:
###Print top 10 variables correlated with target
# Drop the target's self-correlation by label instead of assuming it sorts first.
# Ordering is by signed correlation, so strongly negative features rank last.
corr_matrix["target"].drop("target").sort_values(ascending=False).head(10)

In [None]:
# Feature matrix: every column of the cleaned training frame except the target.
X_train_final = train_data_cleaned.drop(columns=["target"])
X_train_final.head()

In [None]:
# Target values taken from the outlier-filtered frame, so they align row-for-row with X_train_final.
y_train_cleaned = train_data_cleaned["target"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the cleaned rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train_final,
    y_train_cleaned,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Report the shape of every split.
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_validation", X_validation),
    ("y_validation", y_validation),
):
    print(split_name + " - ", split_data.shape)

In [None]:
###Run the very first basic model
# Baseline: XGBoost regressor constructed with no explicit hyperparameters,
# fitted on the training split only (the validation split is held out for scoring).
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the baseline model with RMSE (lower is better) on the training and validation splits.
y_predict = model.predict(X_validation)
y_train_predict = model.predict(X_train)
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and remove; sqrt(MSE) works on every version.
print("Train RMSE: " + str(np.sqrt(mean_squared_error(y_train, y_train_predict))))
print("Validation RMSE: " + str(np.sqrt(mean_squared_error(y_validation, y_predict))))

In [None]:
# Build the baseline submission: start from the sample file, overwrite its
# target column with the baseline model's test predictions, and save it.
sample_submission_path = '/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
display(submission.head())
baseline_predictions = model.predict(test_data)
submission['target'] = baseline_predictions
submission.to_csv('xgboost_base.csv', index=False, header=True)

In [None]:
# 'xgboost_base.csv' is already written by the previous cell, so the identical
# second write is dropped; only preview the submission here.
submission.head()

In [None]:
###Fine tune the hyperparameters using grid cv
#params_for_testing = {
#    "min_child_weight": [1, 5, 10],
#    "gamma": [0.5, 1, 1.5, 2, 5],
#    "subsample": [0.6, 0.8, 0.9],
#    "colsample_bytree": [0.6, 0.8],
#    "max_depth": [3, 4, 5],
#    "n_estimators": [1000],
#    "learning_rate": [0.01, 0.05, 0.07, 0.1, 0.3],
#}

In [None]:
###Time consuming
#gsearch1 = GridSearchCV(estimator = model, param_grid = params_for_testing,scoring='neg_mean_squared_error')
#gsearch1.fit(X_train,y_train)
#print (gsearch1.cv_results_)
#print('best params')
#print (gsearch1.best_params_)
#print('best score')
#print (gsearch1.best_score_)

In [None]:
# Hand-picked hyperparameters for the tuned model: shallow trees, many boosting
# rounds at a low learning rate, with row and column subsampling.
tuned_params = dict(
    max_depth=2,
    min_child_weight=4,
    n_estimators=1000,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.4,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
)
model_tuned = xgb.XGBRegressor(**tuned_params)

model_tuned.fit(X_train, y_train)

In [None]:
# Score the tuned model with RMSE (lower is better) on the training and validation splits.
y_predict = model_tuned.predict(X_validation)
y_train_predict = model_tuned.predict(X_train)
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and remove; sqrt(MSE) works on every version.
print("Train RMSE: " + str(np.sqrt(mean_squared_error(y_train, y_train_predict))))
print("Validation RMSE: " + str(np.sqrt(mean_squared_error(y_validation, y_predict))))

In [None]:
# Overwrite the submission's target column with the tuned model's test predictions and save it.
tuned_predictions = model_tuned.predict(test_data)
submission['target'] = tuned_predictions
submission.to_csv('xgboost_tuned.csv', index=False, header=True)
submission.head()

In [None]:
# Same configuration as the previous tuned model except for a higher learning rate (0.07).
final_params = dict(
    max_depth=2,
    min_child_weight=4,
    n_estimators=1000,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.4,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
)
model_tuned_final = xgb.XGBRegressor(**final_params)

model_tuned_final.fit(X_train, y_train)

In [None]:
# Score the final tuned model with RMSE (lower is better) on the training and validation splits.
y_predict = model_tuned_final.predict(X_validation)
y_train_predict = model_tuned_final.predict(X_train)
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and remove; sqrt(MSE) works on every version.
print("Train RMSE: " + str(np.sqrt(mean_squared_error(y_train, y_train_predict))))
print("Validation RMSE: " + str(np.sqrt(mean_squared_error(y_validation, y_predict))))

In [None]:
# Overwrite the submission's target column with the final model's test predictions and save it.
final_predictions = model_tuned_final.predict(test_data)
submission['target'] = final_predictions
submission.to_csv('xgboost_tuned_final.csv', index=False, header=True)
submission.head()