# Tabular Playground Series - Mar 2021
The goal of these competitions is to provide a fun tabular dataset that is approachable for anyone. These competitions are a good fit for people looking for something in between the Titanic Getting Started competition and a Featured competition. If you're an established competitions master or grandmaster, these probably won't be much of a challenge for you. We encourage you to avoid saturating the leaderboard.

<img src="https://www.kaggle.com/static/images/competitions/landing_header.png" width="150" />

# Preliminaries

In [None]:
import numpy as np 
import pandas as pd
import os
import warnings
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import scipy
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

# Walk the Kaggle input tree. The inner loop exits on its first iteration
# (the path print is disabled), so the only lasting effect is that `dirname`
# and `filenames` keep the values of the last directory visited; the data
# loading cell reads them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        break#print(os.path.join(dirname, filename))
# Set Matplotlib defaults
# NOTE(review): the "seaborn-whitegrid" style name may not exist in newer
# Matplotlib releases — confirm against the installed version.
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Switch to the hand-drawn xkcd look; applied after the rc settings above.
plt.xkcd()
# Mute warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

<img src="https://www.kaggle.com/static/images/datasets/task_static.png" width="170" />

In [None]:
# Load the competition files from the directory left in `dirname` by the
# os.walk loop of the setup cell.
# NOTE(review): the positional indices depend on the order in which os.walk
# listed the files (presumably sample submission, train, test) — confirm, or
# select the files by name.
train = pd.read_csv(os.path.join(dirname, filenames[1]))
test = pd.read_csv(os.path.join(dirname, filenames[2]))
submit = pd.read_csv(os.path.join(dirname, filenames[0]))
# Test rows get a sentinel target of -1 so train and test can share one frame.
test["target"] = -1
cols_base = [c for c in test.columns if c.find('id')==-1]
data = pd.concat([train, test], ignore_index=True, sort=False)

# Pie chart of the target distribution, including the -1 test sentinel.
target_counts = data.target.value_counts()
values = list(target_counts)
ax = target_counts.plot(kind='pie')
# value_counts() orders by frequency, so the legend text is derived from each
# wedge's own target value instead of a fixed position; "T" marks test rows.
legend_names = {0: "0", 1: "1", -1: "T"}
ax.legend(
    [f"{legend_names.get(value, value)}:{count}" for value, count in target_counts.items()],
    loc=2,
    fontsize=11,
)


In [None]:
# Total number of missing cells over all columns, and the distinct column dtypes.
sum(data.isnull().sum()),data.dtypes.unique()

In [None]:
# Summary statistics with one row per described column (first 20 rows shown).
data.describe().transpose().head(20)

In [None]:
# Same summary, but reporting every decile (0.0, 0.1, ..., 1.0).
percentiles = [p*.10 for p in range(11)]
data.describe(percentiles =percentiles).T.head(len(data.columns))

# Establish Baseline
* *Label encoding for categoricals*
* *Label encoding is good for XGBoost and RandomForest, but one-hot would be better for models like Lasso or Ridge.*
* *The `cat.codes` attribute holds the category levels.*


<img src="https://www.kaggle.com/static/images/kernel/landing_header.png" width="250" />

In [None]:
def score_baseline(_xbase,_ybase,model=None):
    """Cross-validate a model on the given features and report the mean ROC AUC.

    Categorical/object columns of ``_xbase`` are replaced by their integer
    category codes **in place**, so the caller's frame is modified.

    Parameters
    ----------
    _xbase : pandas.DataFrame
        Feature matrix; modified in place as described above.
    _ybase : pandas.Series
        Binary target aligned with ``_xbase``.
    model : estimator, optional
        Estimator to evaluate. Defaults to a small ``XGBRegressor`` with a
        ``binary:logistic`` objective (depth 4, 5 trees).

    Returns
    -------
    float
        Mean ROC AUC over 5 cross-validation folds (also printed).
    """
    if model is None:
        model = XGBRegressor(
            booster='gbtree',
            use_label_encoder=False,
            objective='binary:logistic',
            max_depth=4,
            n_estimators=5,
        )

    # Label-encode categoricals directly on the caller's frame.
    for c in _xbase.select_dtypes(["category", "object","O"]):
        _xbase[c] = _xbase[c].astype("category").cat.codes

    fold_scores = cross_val_score(model,_xbase,_ybase, cv=5, scoring="roc_auc", n_jobs = -1, verbose=0)
    baseline_score = fold_scores.mean()
    print(f"Baseline score: {baseline_score:.8} AUC")
    return baseline_score

In [None]:
# Baseline on the raw features: keep labelled rows only (target > -1),
# drop the id column and split off the target.
xbase = data.loc[(data.target > -1)].copy()
xbase.pop('id')
ybase = xbase.pop('target')
print(xbase.shape)
# score_baseline also converts xbase's categorical columns to integer codes in place.
score_baseline(xbase,ybase)

# Feature Utility Scores
* *All discrete features should now have integer dtypes*

<img src="https://www.kaggle.com/static/images/community/communities.png" width="200" />

In [None]:
def mutual_information_scores(x, y):
    """Rank features by estimated mutual information with the target.

    Works on a copy of ``x``; categorical/object columns are replaced by
    integer codes before scoring.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target aligned with ``x``.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name and sorted from
        highest to lowest.
    """
    x = x.copy()
    for c in x.select_dtypes(["category", "object","O"]):
        # factorize() returns (codes, uniques); only the codes become the column.
        x[c], _ = x[c].factorize()

    # Called with the estimator defaults: no discrete_features mask and no
    # random_state are passed (the original had them commented out).
    mi_scores = mutual_info_regression(x, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mutual_information_scores(scores):
    """Draw a horizontal bar chart of the scores, sorted in ascending order.

    ``scores`` is a pandas Series whose index supplies the bar labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")
    

In [None]:
# Score every baseline feature against the target and list the ranking.
mi_scores = mutual_information_scores(xbase,ybase)
print(mi_scores)

 **Uninformative features**
 * _features with 0.0 scores we'll drop entirely:_

In [None]:
def drop_uninformative(d, mi_scores,n=0.0):
    """Return the columns of ``d`` whose score in ``mi_scores`` exceeds ``n``.

    ``mi_scores`` is a Series indexed by column name; the comparison is strict.
    """
    keep = mi_scores.gt(n)
    return d.loc[:, keep]

_Removing..._

In [None]:
# Keep only features with a strictly positive MI score and re-score the baseline.
new_xbase = drop_uninformative(xbase, mi_scores,0.0)
print(new_xbase.shape)
score_baseline(new_xbase,ybase)

# Create Features

In [None]:
def create_new_features(d):
    """Add per-category median features for the pairs (cat0, cont0) .. (cat10, cont10).

    For each index ``i`` a column ``cont{i}_cat{i}_mean`` is added holding the
    median of ``cont{i}`` within each ``cat{i}`` group (the suffix says "mean"
    but the statistic is the median). ``d`` is modified in place and returned.
    """
    for idx in range(11):
        cat_col = f"cat{idx}"
        cont_col = f"cont{idx}"
        group_median = d.groupby(cat_col)[cont_col].transform("median")
        d[f"{cont_col}_{cat_col}_mean"] = group_median

    # A disabled experiment split cat10 into first-/last-character features.
    return d

def create_cats_category(d):
    """Return a copy of ``d`` with categorical/object columns replaced by integer category codes."""
    encoded = d.copy()
    categorical = encoded.select_dtypes(["category", "object","O"]).columns
    for name in categorical:
        encoded[name] = encoded[name].astype("category").cat.codes
    return encoded

# Principal Component Analysis
_The PCA algorithm gave us loadings which described each component of variation, and also the components which were the transformed datapoints._

In [None]:
def create_principal_components(x, standardize=True):
    """Fit a PCA on ``x`` and return the model, the component scores and the loadings.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric feature columns.
    standardize : bool, default True
        If true, subtract each column's mean and divide by its standard
        deviation before fitting.

    Returns
    -------
    tuple
        ``(pca, xpca, loadings)``: the fitted ``PCA`` object, a DataFrame of
        transformed rows with columns ``pca_1``, ``pca_2``, ... (fresh default
        index), and a DataFrame of loadings with one row per original feature
        and one column per component.
    """
    if standardize:
        centered = x - x.mean(axis=0)
        x = centered / x.std(axis=0)

    pca = PCA()
    scores = pca.fit_transform(x)
    component_names = [f"pca_{k}" for k in range(1, scores.shape[1] + 1)]
    xpca = pd.DataFrame(scores, columns=component_names)
    # components_ has one row per component; transpose so features are the rows.
    loadings = pd.DataFrame(pca.components_.T, columns=component_names, index=x.columns)
    return pca, xpca, loadings

def pca_components(d, features):
    """Return the principal-component scores of the ``features`` columns of ``d`` (standardized first)."""
    selected = d.loc[:, features]
    # Only the score frame (second element) is needed here.
    return create_principal_components(selected)[1]

# Create Final Feature Set

In [None]:
class CrossFoldEncoder:
    """Out-of-fold wrapper around a supervised categorical encoder.

    ``encoder`` is an encoder class that is instantiated as
    ``encoder(cols=..., **kwargs)`` and offers ``fit(X, y)`` and
    ``transform(X)``. A 5-fold ``KFold`` splitter decides the folds.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # forwarded to every encoder instance
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode ``cols`` of ``X`` so that each row is encoded by a model that did not see it.

        For every split, a new encoder is fitted on the first index set the
        splitter yields and used to transform the second; the transformed
        pieces are concatenated. One fitted encoder per fold is kept in
        ``fitted_encoders_``. Returns a DataFrame with columns named
        ``<col>_encoded``.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        pieces = []
        for fit_idx, holdout_idx in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_idx, :], y.iloc[fit_idx])
            holdout = fold_encoder.transform(X.iloc[holdout_idx, :])
            pieces.append(holdout[cols])
            self.fitted_encoders_.append(fold_encoder)
        encoded = pd.concat(pieces)
        encoded.columns = [name + "_encoded" for name in encoded.columns]
        return encoded

    def transform(self, X):
        """Encode new data by averaging the encodings of all fold encoders.

        Returns a DataFrame with columns named ``<col>_encoded``.
        """
        total = None
        for fold_encoder in self.fitted_encoders_:
            part = fold_encoder.transform(X)[self.cols_]
            # fill_value=0 keeps a value that is missing in only one operand.
            total = part if total is None else total.add(part, fill_value=0)
        averaged = total / len(self.fitted_encoders_)
        averaged.columns = [name + "_encoded" for name in averaged.columns]
        return averaged

In [None]:
def create_cat_encoder(x,y,c):
    """Return the out-of-fold M-estimate (m=1) encoding of columns ``c`` of ``x`` against ``y``.

    The fitted ``CrossFoldEncoder`` itself is not returned.
    """
    return CrossFoldEncoder(MEstimateEncoder, m=1).fit_transform(x, y, cols=c)

In [None]:
# Add the engineered features, then collect the names of all
# categorical/object columns into cat_cols.
cat_cols=[]
data = create_new_features(data)
cats =  data.select_dtypes(["category", "object","O"])
for c in cats:
    # `count` is only used by the disabled cardinality filter below.
    count = data[c].value_counts()
    #if(len(count)>=4):
    cat_cols.append(c)

In [None]:

# Peek at the first row after feature creation.
data.head(1)

In [None]:
# Target-encode the categorical columns.
x = data.copy()
x.pop('id')
y = x.pop('target')
print(x.shape,y.shape)
# Fit the encoders on labelled rows only: test rows carry the placeholder
# target -1, which must not enter the per-category target statistics.
is_train = y > -1
fold_encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
encoded_parts = [fold_encoder.fit_transform(x.loc[is_train], y.loc[is_train], cols=cat_cols)]
if (~is_train).any():
    # Test rows get the average of the fold encoders' encodings.
    encoded_parts.append(fold_encoder.transform(x.loc[~is_train]))
# Restore the original row order so the later join lines up with `data`.
cat_encodes = pd.concat(encoded_parts).reindex(data.index)
# The raw categorical columns are replaced by their encoded versions later on.
data.drop(columns=cat_cols,inplace=True)
data.shape,cat_encodes.shape

In [None]:
# Integer-encode any categorical/object columns still present (works on a copy).
data = create_cats_category(data)
data.head(1)

In [None]:
# Names of the columns that were target-encoded.
cat_cols

In [None]:
# Repeats the integer encoding from the earlier cell.
# NOTE(review): presumably a no-op by now, since no categorical columns should remain — confirm.
data = create_cats_category(data)
data.head(1)

In [None]:
# Column names remaining after the categorical columns were dropped.
data.columns

In [None]:
# Put the encoded categorical columns in front of the remaining columns;
# join matches rows on the index.
data = cat_encodes.join(data)
data.head(1)

In [None]:
# Not the last expression of the cell, so this tuple is computed but not shown.
data.shape,sum(data.isnull().sum())
# Re-score on the labelled rows with the encoded features.
x = data.loc[(data.target > -1)].copy()
x.pop('id')
y = x.pop('target')
print(x.shape,y.shape)
score_baseline(x,y)

In [None]:
# Downcast 64-bit columns to 32-bit (float64 -> float32, int64 -> int32).
for c in data.select_dtypes(["float64"]):
    data[c]=data[c].astype("float32")
    
for c in data.select_dtypes(["int64"]):
    data[c]=data[c].astype("int32")

# Show the dtypes that remain after downcasting.
data.dtypes.unique()

In [None]:
# PCA inputs: columns whose names contain no underscore and neither 'id' nor
# 'targ', which excludes the "_encoded" and "_mean" columns added earlier.
features = [c for c in data.columns if c.find("_")==-1 and c.find('id')==-1 and c.find("targ")==-1]
pcas = pca_components(data, features)
# Downcast the component scores to float32.
for c in pcas.select_dtypes(["float64"]):
    pcas[c]=pcas[c].astype("float32")    

pcas.head()

In [None]:
#data= data.join(pcas)

In [None]:
# ncats: base column names containing "cat"; cols: the xbase columns not in that list.
# NOTE(review): `cols` is reassigned in the scaling cell before it is read — confirm it is needed here.
ncats = [c for c in cols_base if c.find("cat")>=0]
cols = [c for c in  xbase.columns if c not in ncats ]

In [None]:
# Rebuild the labelled feature matrix from the current `data` and score it again.
xbase = data.loc[(data.target > -1)].copy()
xbase.pop('id')
ybase = xbase.pop('target')
print(xbase.shape)
score_baseline(xbase,ybase)

# Hyperparameter Tuning

In [None]:
# Drop rows with any missing value (train and test alike) and renumber the index.
# NOTE(review): if a test row were dropped here, the predictions would no
# longer line up with `submit` — confirm no rows are actually removed.
data.dropna(inplace = True) 
data.reset_index(drop=True,inplace=True)
sum(data.isnull().sum())

In [None]:
# Standardize every feature column (everything except id/target). The scaler
# is fitted on all rows of `data`, i.e. labelled and test rows together.
cols = [c for c in data.columns if c.find('id')==-1 and c.find('targ')==-1]
scaler = StandardScaler()
s =scaler.fit_transform(data[cols])
data[cols] = s

In [None]:
# Not the last expression of the cell, so this result is not displayed.
data.dtypes.unique()
# Score the labelled rows once more, now with standardized features.
xbase = data.loc[(data.target > -1)].copy()
xbase.pop('id')
ybase = xbase.pop('target')
print(xbase.shape)
score_baseline(xbase,ybase)

In [None]:
# Peek at the first row after scaling.
data.head(1)

In [None]:
# Final training matrix: labelled rows only, without id; y is the target.
x = data.loc[data.target>-1].copy()
x.pop('id')
y = x.pop('target')
print('x:\t',x.shape)
print('y:\t',y.shape)

In [None]:
# 70/30 hold-out split, stratified on the target, with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split( x, y,stratify=y, test_size=0.30, random_state=0)

In [None]:
# Convert both feature frames to SciPy CSR matrices before fitting.
xtrain = scipy.sparse.csr_matrix(xtrain)
xtest = scipy.sparse.csr_matrix(xtest)

In [None]:
# Sanity check: feature-matrix and target shapes for the training part.
xtrain.shape, ytrain.shape

In [None]:
# Hyperparameters for the final XGBoost model. The "try ..." notes are
# suggested search ranges, not the values in use.
param_dist = {
    # "n_jobs": -1,
    "booster": 'gbtree',
    "use_label_encoder": False,
    "objective": 'binary:logistic',
    "verbosity": 0,
    "random_state": 0,
    "max_depth": 50,  # maximum depth of each tree - try 2 to 10
    "learning_rate": 0.1,  # effect of each tree - try 0.0001 to 0.1
    "n_estimators": 50,  # number of trees (boosting rounds) - try 1000 to 8000
    # "min_child_weight": 1,  # try 1 to 10
    "colsample_bytree": 0.2,  # fraction of features (columns) per tree - try 0.2 to 1.0
    "subsample": 0.5,  # fraction of instances (rows) per tree - try 0.2 to 1.0
    "reg_alpha": 12,  # L1 regularization (like LASSO) - try 0.0 to 10.0
    "reg_lambda": 4,  # L2 regularization (like Ridge) - try 0.0 to 10.0
    "num_parallel_tree": 1,  # set > 1 for boosted random forests
}

# Train Model and Create Submissions

In [None]:
# Fit the final model, reporting AUC and MAE on both the training and the
# hold-out part after each boosting round.
# NOTE(review): newer xgboost releases may expect eval_metric on the
# constructor rather than on fit() — confirm against the installed version.
model = xgb.XGBRegressor(**param_dist)
model.fit(xtrain, ytrain, eval_set=[(xtrain, ytrain), (xtest, ytest)], eval_metric=['auc','mae'], verbose=True)

In [None]:
# Predictions for the hold-out rows.
predics = model.predict(xtest)

In [None]:
# ROC AUC on the hold-out part.
# NOTE(review): `average` presumably has no effect for a binary target — confirm.
test_score =roc_auc_score(ytest,predics, average='micro')
print(f'Score: {test_score:.6f} AUC')    

In [None]:
# Rows to predict for the submission: those carrying the -1 sentinel target,
# with the id and target columns removed.
test =data.loc[data.target==-1]
test.pop("id")
test.pop("target")

In [None]:
# Compare shapes: submission template, test features and training matrix.
submit.shape,test.shape,xtrain.shape

In [None]:
# Same CSR conversion as applied to the training matrices.
matrix_test = scipy.sparse.csr_matrix(test)

In [None]:
# Predictions for the test rows (reuses the name `predics`).
predics = model.predict(matrix_test)

In [None]:
# Write the predictions into the submission template and save it.
# NOTE(review): assumes the test rows are in the same order as `submit` — confirm.
submit["target"] = predics
submit.to_csv("predics.csv", index = False)
print(submit.head(10))
print(submit.tail(10))

Reference: https://www.kaggle.com/learn/overview 

<img src="https://www.kaggle.com/static/images/education/homepage-illustration2.png" width="200" />