In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Tabular playground - May 2022 using XGBoost

I did the EDA in R in a separate notebook here: [https://www.kaggle.com/code/charlottetu/playground-may-eda-in-r](https://www.kaggle.com/code/charlottetu/playground-may-eda-in-r).  The 'f_27' feature was challenging - I took inspiration from this notebook: [https://www.kaggle.com/code/cv13j0/tps-may22-eda-gbdt](https://www.kaggle.com/code/cv13j0/tps-may22-eda-gbdt)

# Import libraries and read in the data

In [None]:
#Import libraries
import plotly as plt
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import matplotlib.pyplot as py
import seaborn as sns
# Switch plotly into offline notebook mode (connected=True presumably loads plotly.js from the CDN instead of embedding it - confirm against the plotly docs)
init_notebook_mode(connected = True)
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from statistics import mean
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
#Load the competition's train and test CSV files into DataFrames
train_csv = '../input/tabular-playground-series-may-2022/train.csv'
test_csv = '../input/tabular-playground-series-may-2022/test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
#Report the (rows, columns) shape of the train frame, then of the test frame
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
#Work on independent copies so the frames read from disk stay untouched
X_train, X_test = (raw_frame.copy() for raw_frame in (df_train, df_test))
X_train.columns

# Feature engineering on f_27

In [None]:
#Add one '<letter>_count' column per letter of the alphabet, holding how many times that letter occurs in f_27.
alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def add_letter_counts(frame, letters, source_col='f_27'):
    """Add a ``<letter>_count`` column to ``frame`` for every entry of ``letters``.

    Each new column holds the number of occurrences of that letter in the
    string column ``source_col``.

    Parameters:
        frame: DataFrame containing ``source_col``; modified in place.
        letters: iterable of single characters to count.
        source_col: name of the string column to scan.

    Returns:
        The same ``frame`` object, for convenience.

    Note: ``Series.str.count`` treats its argument as a regular expression,
    so ``letters`` should contain plain characters only. Re-running the
    function overwrites the same columns rather than adding new ones.
    """
    for letter in letters:
        frame[letter + '_count'] = frame[source_col].str.count(letter)
    return frame


#Train and test must receive exactly the same engineered columns, so both go through the same helper.
for frame in (X_train, X_test):
    add_letter_counts(frame, alphabet)

In [None]:
#Preview the first five rows of the training frame after the new columns were added
X_train.head(n=5)

In [None]:
#Record the number of distinct characters in each f_27 string, for train and test alike
for frame in (X_train, X_test):
    frame["unique_characters"] = frame['f_27'].map(lambda code: len(set(code)))

In [None]:
# Keep the label column as y_train, then drop it (train only) and the raw f_27 string (train and test) from the feature frames
y_train = X_train.loc[:, 'target']
X_train = X_train.drop(['target', 'f_27'], axis='columns')
X_test = X_test.drop(['f_27'], axis='columns')

# Complete pre-processing and modelling

In [None]:
# Instantiate the XGBoost classifier and the min-max scaler, both with default settings
model, scale = XGBClassifier(), MinMaxScaler()

In [None]:
#Select the columns to pre-process and model: every feature column except the row identifier 'id'
colnames = X_train.columns
colnamesforproc = colnames.drop('id')
#Display the name and dtype of every selected column so they can be checked before scaling
X_train.dtypes[colnamesforproc]

In [None]:
#Column transformer that applies the scaler to the selected feature columns
#(columns not listed, such as 'id', are presumably dropped by ColumnTransformer's default remainder - confirm against the sklearn docs)
scaling_step = ('scaler', scale, colnamesforproc)
pre_processor = ColumnTransformer(transformers=[scaling_step])

In [None]:
#Chain the pre-processing step and the classifier into a single estimator
pipeline_steps = [
    ('preprocessing', pre_processor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
#Cross-validated ROC AUC of the whole pipeline (one score per fold)
cv_auc_scores = cross_val_score(my_pipeline, X_train, y_train, scoring='roc_auc')
cv_auc_scores

In [None]:
#Train the pipeline (scaler + classifier) on the complete training set
my_pipeline.fit(X=X_train, y=y_train)

In [None]:
#Predict the probability of the positive class for every test row.
#The pipeline is scored with ROC AUC, which ranks continuous scores; hard class labels from predict() throw that ranking away.
#Column 1 of predict_proba is the second class in the classifier's classes_, i.e. target == 1 assuming a binary 0/1 target.
predictions = my_pipeline.predict_proba(X_test)[:, 1]

In [None]:
#Assemble the submission (test ids plus their predicted target) and write it to submission.csv without the index
output = X_test[['id']].assign(target=predictions)
output.to_csv('submission.csv', index=False)

In [None]:
#Preview the first rows of the submission to confirm the id/target layout (bare expression so the notebook renders it as a table)
output.head()