In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training set (indexed by "id") and discard any row without a target value

data = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv", index_col="id")
data = data.dropna(subset=['target'], axis=0)

# Split into the target vector (y) and the feature matrix (X); `data` keeps both

y = data["target"].copy()
X = data.drop(columns="target")

In [None]:
# Count the missing (NA) values in each feature column of the training set

X.isna().sum()


No missing values were found in the data set, so we proceed to the data analysis.

# Data Analysis

In [None]:
# Separate the independent-variable columns by dtype into categorical (cat_cols)
# and continuous (num_cols) variables

cat_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]

num_cols = [name for name, dtype in X.dtypes.items() if dtype in ("int64", "float64")]


In [None]:
# Summary statistics for X (pandas' describe() reports only numeric columns by default)
X.describe()

As all continuous variables have means and standard deviations that fall within a similar range, scaling the continuous variables does not seem necessary.

**Distribution Plot of Categorical Variables**

In [None]:
# Show the distinct levels present in each categorical column
for col in cat_cols:
    print(set(X[col]))

In [None]:
# Frequency of every level of cat0, most common first
counting = X['cat0'].value_counts()

# value_counts() is indexed by the category labels, so the second-most frequent
# level must be selected by position with .iloc; a bare integer key relies on
# pandas' deprecated positional fallback for label-indexed Series.
print(counting.iloc[1])

In [None]:
# Collect, for every categorical column, its levels and how often each occurs.
# value_counts() already returns labels and counts aligned (most frequent
# first), so the bar order is deterministic and no per-label lookup is needed.
categories = []
values = []
for c_name in cat_cols:
    counting = X[c_name].value_counts()
    categories.append(counting.index.tolist())
    values.append(counting.tolist())

# Size the grid from the number of categorical columns rather than assuming
# that at most 12 exist (10 columns still yields the original 3x4, 12x12 figure).
n_plot_cols = 4
n_plot_rows = max(1, -(-len(cat_cols) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(n_plot_rows, n_plot_cols,
                        figsize=(12, 4 * n_plot_rows), squeeze=False)
ax = axs.flatten()
for i, c_name in enumerate(cat_cols):
    ax[i].bar(categories[i], values[i])
    ax[i].set_title(c_name)

# Hide any trailing panels that have no column to display
for unused_ax in ax[len(cat_cols):]:
    unused_ax.set_visible(False)


We see that certain categorical features appear to be unbalanced. Specifically, cat4, cat5, cat6, cat7 and cat8 are significantly unbalanced. Feature engineering may help in dealing with this imbalance.

In [None]:
def modify_df(df):
    """Collapse rare levels of the unbalanced categorical columns into 'Z'.

    For each of cat4..cat8 only the listed levels are kept; every other
    value (including missing values) is replaced by the catch-all level 'Z'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns cat4, cat5, cat6, cat7 and cat8.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``df``. The input frame is left untouched, so
        re-running a cell that calls this function cannot corrupt data that
        an earlier cell already displayed or still refers to.

    Raises
    ------
    KeyError
        If any of the five expected columns is missing from ``df``.
    """
    # Levels to keep per column; anything else is folded into 'Z'
    levels_to_keep = {
        'cat4': ['B'],
        'cat5': ['B', 'D'],
        'cat6': ['A'],
        'cat7': ['E', 'D'],
        'cat8': ['E', 'C', 'G', 'A'],
    }

    out = df.copy()
    for col, keep in levels_to_keep.items():
        # Vectorised replacement: keep the value where it is a retained level
        out[col] = out[col].where(out[col].isin(keep), 'Z')

    return out

# Feature Engineering

In [None]:
# Feature-engineer X by passing it through modify_df (defined in an earlier cell)
X = modify_df(X)

In [None]:
# XGBoost cannot consume non-numerical categorical values here, so each
# category is mapped to an integer code with an ordinal encoder.
# The fitted encoder is kept in `encoder`.
encoder = ce.OrdinalEncoder()
X = encoder.fit(X).transform(X)

# Model Development

In [None]:
# Split the data into a training set (80%) and a validation set (20%).
# The fixed random_state keeps the split, and therefore every validation MSE
# computed from it, reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [None]:
# Base model: XGBoost regressor fitted on the training split and scored on the validation split
modelA = XGBRegressor(max_depth=4, learning_rate=0.01, n_estimators=1000)
modelA.fit(X_train, y_train)

predA = modelA.predict(X_valid)
print(mean_squared_error(y_valid, predA))

# MSE recorded for the base model in an earlier run: 0.7228872613844344

# Manual Parameter Tuning

In [None]:
# Sweep the learning rate from 0.01 to 0.20 in steps of 0.01 (other
# hyper-parameters fixed) and record the validation MSE for each setting
learnrates = [step / 100 for step in range(1, 21)]
mse0 = []
for lr in learnrates:
    XG_model = XGBRegressor(max_depth=4, learning_rate=lr, n_estimators=1000)
    XG_model.fit(X_train, y_train)
    prediction = XG_model.predict(X_valid)
    mse0.append(mean_squared_error(y_valid, prediction))
    print(lr)  # progress marker


In [None]:
# One row per learning rate (index) with its validation MSE in column 0, lowest MSE first
views = pd.DataFrame.from_dict(dict(zip(learnrates, mse0)), orient='index')

views.sort_values(by=0)

In [None]:
# Sweep the number of boosting rounds from 100 to 1000 in steps of 100 (other
# hyper-parameters fixed) and record the validation MSE for each setting
num_esti = list(range(100, 1001, 100))
mse1 = []

for n_rounds in num_esti:
    XG_model = XGBRegressor(max_depth=4, learning_rate=0.01, n_estimators=n_rounds)
    XG_model.fit(X_train, y_train)
    prediction = XG_model.predict(X_valid)
    mse1.append(mean_squared_error(y_valid, prediction))
    print(n_rounds)  # progress marker

In [None]:
# One row per n_estimators value (index) with its validation MSE in column 0, lowest MSE first
views = pd.DataFrame.from_dict(dict(zip(num_esti, mse1)), orient='index')

views.sort_values(by=0)

In [None]:
# Sweep the maximum tree depth from 2 to 8 (other hyper-parameters fixed)
# and record the validation MSE for each setting
max_dep = list(range(2, 9))
mse2 = []

for tree_depth in max_dep:
    XG_model = XGBRegressor(max_depth=tree_depth, learning_rate=0.06, n_estimators=1000)
    XG_model.fit(X_train, y_train)
    prediction = XG_model.predict(X_valid)
    mse2.append(mean_squared_error(y_valid, prediction))
    print(tree_depth)  # progress marker

In [None]:
# One row per max_depth value (index) with its validation MSE in column 0, lowest MSE first
views = pd.DataFrame.from_dict(dict(zip(max_dep, mse2)), orient='index')

views.sort_values(by=0)

# Make Submission

In [None]:
X_train = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv",index_col='id')



c_col = [ccol for ccol in X_train.columns if X_train[ccol].dtype=='object']

num_col =[ncol for ncol in X_train.columns if X_train[ncol].dtype in ['int64', 'float64']]

columns = c_col + num_col
X_touse = X_train[columns].copy()

X_touse = modify_df(X_touse)



In [None]:
#Final Model:
model = XGBRegressor(n_estimators=1000, learning_rate=0.06, max_depth=3)

In [None]:
model.fit(X, y)
prediction = model.predict(X_touse)
print('done!')

In [None]:
output = pd.DataFrame({'id': X_train.index,
                       'target': predictionA})

output.head()
output.to_csv('submission.csv', index=False)