# Reducing the number of categories
This notebook measures how reducing the number of categories affects fit time and ROC AUC for different types of models.

## Load data

In [None]:
import pandas as pd

In [None]:
# Load the training CSV; the path is relative to the notebook's working directory.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
# Display the first rows as the cell output.
train.head()

In [None]:
# Names of every column in `train` whose label starts with 'cat'.
cat_cols = [name for name in train.columns if name.startswith('cat')]
cat_cols

In [None]:
# Target vector, and the feature frame without the 'id' and 'target' columns.
y_train = train['target']
x_train = train.drop(columns=['id', 'target'])

## Reducing the number of one-hot columns by raising the minimum category size

In [None]:
from categorical_transform import CategoricalTransform, OneHotTransform
from sklearn.pipeline import Pipeline
import pandas as pd

In [None]:
# Candidate thresholds passed as `min_data_portion` to CategoricalTransform.
min_cat_size = [0, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]

# Width (shape[1]) of the transformed training frame, one entry per threshold.
data_size = [
    Pipeline([
        ('categorical_transform', CategoricalTransform(cat_cols, min_data_portion=portion)),
        ('one hot', OneHotTransform()),
    ]).fit_transform(x_train).shape[1]
    for portion in min_cat_size
]

# Summary table: threshold vs. resulting number of columns.
pd.DataFrame({'min_cat_size': min_cat_size, 'num_columns': data_size})

## Analysis Tools

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Shared 5-fold splitter for every grid search below. The fixed seed makes each
# call to split() return the same shuffled folds, so all models are scored on
# identical folds and results are reproducible after a kernel restart.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def analyse_cat_size(pipe, x_train=x_train, y_train = y_train):
    """Grid-search the 'trans' step's ``min_data_portion`` and summarise the results.

    Every value in the global ``min_cat_size`` list is evaluated with the shared
    ``kfold`` splitter and ROC AUC scoring.

    Parameters
    ----------
    pipe : sklearn Pipeline
        Must contain a step named ``'trans'`` that accepts ``min_data_portion``.
    x_train, y_train :
        Features and target to fit on. The defaults are the objects bound to the
        globals ``x_train`` / ``y_train`` at the time this cell was executed.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated candidate with columns ``cat_size`` (the
        ``min_data_portion`` value), ``fit_time`` (mean fit time across folds)
        and ``auc`` (mean test ROC AUC across folds).
    """
    param_name = 'trans__min_data_portion'
    gscv = GridSearchCV(pipe, {param_name: min_cat_size}, cv=kfold, scoring='roc_auc', verbose=1)
    gscv.fit(x_train, y_train)
    results = gscv.cv_results_
    # Take the labels from what the search actually evaluated, not from the global
    # list, so each row is guaranteed to line up with its timing and score.
    evaluated_sizes = [candidate[param_name] for candidate in results['params']]
    return pd.DataFrame({
        'cat_size': evaluated_sizes,
        'fit_time': results['mean_fit_time'],
        'auc': results['mean_test_score'],
    })

## LightGBM (default settings) performance for different minimum category sizes

In [None]:
from lightgbm.sklearn import LGBMClassifier

In [None]:
# LightGBM classifier behind the categorical transform. The step name "trans"
# is the one the grid key 'trans__min_data_portion' in analyse_cat_size targets.
lightgbm_pipe = Pipeline(steps=[
    ("trans", CategoricalTransform(cat_cols)),
    ("lgbm", LGBMClassifier(n_jobs=-2)),
])

In [None]:
# Grid-search min_data_portion over min_cat_size for the LightGBM pipeline;
# the returned table (cat_size, fit_time, auc) is the cell output.
analyse_cat_size(lightgbm_pipe)

## Default xgboost, integer encoding

In [None]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from categorical_transform import IntegerCategoricalTransform

# XGBoost classifier fed by the integer categorical transform.
xgb = XGBClassifier(n_jobs=-2, eval_metric='auc', use_label_encoder=False)
xgb_pipe = Pipeline(steps=[
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('xgboost', xgb),
])

In [None]:
# Grid-search min_data_portion for the integer-encoded XGBoost pipeline defined above.
analyse_cat_size(xgb_pipe)

## Default xgboost, one-hot encoding

In [None]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from categorical_transform import CategoricalTransform, OneHotTransform

# Rebinds `xgb` and `xgb_pipe`: same classifier settings as the previous
# section, but with a OneHotTransform step after the categorical transform.
xgb = XGBClassifier(n_jobs=-2, eval_metric='auc', use_label_encoder=False)
xgb_pipe = Pipeline(steps=[
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('xgboost', xgb),
])

In [None]:
# Grid-search min_data_portion for the one-hot XGBoost pipeline defined above.
analyse_cat_size(xgb_pipe)

One-hot encoding performs slightly better than ordinal (integer) encoding. Reducing the number of categories speeds up training significantly.

## Default random forest, integer encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from categorical_transform import IntegerCategoricalTransform

# Random forest on integer-encoded categories. The forest is seeded so that
# AUC differences between settings reflect the encoding / category threshold
# rather than run-to-run randomness in tree construction.
rf = RandomForestClassifier(n_jobs=-2, random_state=42)
rf_pipe = Pipeline([
    ('trans', IntegerCategoricalTransform(cat_cols=cat_cols)),
    ('rf', rf),
])

In [None]:
# Grid-search min_data_portion for the integer-encoded random forest pipeline defined above.
analyse_cat_size(rf_pipe)

## Default random forest, one-hot encoding

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from categorical_transform import CategoricalTransform, OneHotTransform

# Rebinds `rf` and `rf_pipe` with a OneHotTransform step after the categorical
# transform. The forest is seeded so that AUC differences between settings are
# not blurred by run-to-run randomness in tree construction.
rf = RandomForestClassifier(n_jobs=-2, random_state=42)
rf_pipe = Pipeline([
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('rf', rf),
])

In [None]:
# Grid-search min_data_portion for the one-hot random forest pipeline defined above.
analyse_cat_size(rf_pipe)

One-hot encoding performs slightly better than ordinal (integer) encoding. Reducing the number of categories speeds up training slightly.

## Categorical naive bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline

from categorical_transform import NonNegativeIntegerCategoricalTransform

# Categorical naive Bayes with default settings, fed by the
# non-negative-integer categorical transform.
catnb = CategoricalNB()
catnb_pipe = Pipeline(steps=[
    ('trans', NonNegativeIntegerCategoricalTransform(cat_cols=cat_cols)),
    ('catnb', catnb),
])

In [None]:
# Unlike the other calls, only the cat_cols columns are passed as features here.
# NOTE(review): presumably because CategoricalNB models categorical features only — confirm.
analyse_cat_size(catnb_pipe, x_train=x_train[cat_cols])

## Logistic regression, one-hot encoding

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from categorical_transform import CategoricalTransform, OneHotTransform

# Logistic regression on the output of the categorical transform followed by
# OneHotTransform.
lr_pipe = Pipeline(steps=[
    ('trans', CategoricalTransform(cat_cols=cat_cols)),
    ('oht', OneHotTransform()),
    ('lr', LogisticRegression(n_jobs=-2)),
])

In [None]:
# Grid-search min_data_portion for the logistic regression pipeline; the global
# x_train is passed explicitly here rather than relying on the function's default.
analyse_cat_size(lr_pipe, x_train=x_train)