In [2]:
# !pip install torch
# !pip install deepctr_torch

In [3]:
import os

# Working directory that the relative paths used below ('input/...', 'models/...')
# are resolved against. It can be overridden with the SLI_PROJECT_ROOT environment
# variable so the notebook is not tied to one machine's /root checkout; the default
# is the original location, so existing runs behave exactly as before.
PROJECT_ROOT = os.environ.get("SLI_PROJECT_ROOT", "/root/sli-product-recommendation")
os.chdir(PROJECT_ROOT)

import random

import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and convergence warnings from the ML libraries used further down.
warnings.filterwarnings('ignore')

In [4]:
# Load the development snapshot and a second snapshot from a later reference date
# (`oot_data`; presumably "out-of-time" validation data — confirm).
dev_reference_date = '202305'
oot_reference_date = '202308'
data_condition = f'delvariable_{dev_reference_date}'
raw_dataset_name = f'data_{data_condition}'
raw_dataset_path = f'input/{raw_dataset_name}.csv'

# Derive the second path from the variables instead of the literal '202305':
# with the literal, changing `dev_reference_date` would turn the replace into a
# no-op and `oot_data` would silently be a copy of `input_data`.
oot_dataset_path = raw_dataset_path.replace(dev_reference_date, oot_reference_date)
assert oot_dataset_path != raw_dataset_path, "OOT snapshot must differ from the development snapshot"

input_data = pd.read_csv(raw_dataset_path)
oot_data = pd.read_csv(oot_dataset_path)

In [5]:
# Base name shared by every artifact this notebook writes for the model
# (model file, dataset pickle, config JSON, classifier pickles).
model_name = "model_{}_small".format(data_condition)
model_name

'model_delvariable_202305_small'

In [6]:
import numpy as np
import torch

# Single seed shared by every stochastic step in this notebook.
seed = 42
random.seed(seed)
# Seeding only the stdlib `random` module leaves numpy and torch unseeded; the
# DeepFM model trained below is a torch model, so seed those generators as well
# to make a Restart & Run All reproducible.
np.random.seed(seed)
torch.manual_seed(seed)

In [7]:
# Load the project's dataset configuration for the raw dataset selected above.
from utils.read_config import readDataConfig
data_config = readDataConfig(input_name=raw_dataset_name)
# NOTE(review): bare attribute access whose value is discarded (it is not the last
# expression of the cell). Presumably kept for a side effect such as printing a
# summary — confirm against utils.read_config.
data_config.info

# Short handles for the target column and the feature-group mapping used by the cells below.
target = data_config.target
features = data_config.features

Dataset name:  data_delvariable_202305


In [8]:
# Display the names of the feature groups defined in the dataset configuration.
data_config.features.keys()

dict_keys(['user_numeric', 'f_fire_contract', 'product_describe', 'product_yn', 'product_month', 'product_month_category', 'user_category', 'product_latest', 'has_item', 'target_category'])

In [43]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names


def _vocab_size(column):
    """Return the embedding-table size for an integer-encoded categorical column.

    ``max + 1`` guarantees that the largest id present is a valid embedding index,
    which ``number of distinct values + 1`` does not when the ids have gaps.
    The result is cast to a built-in ``int`` because pandas' ``max()`` returns a
    numpy integer, and the feature columns are later written out with ``json.dump``,
    which rejects numpy scalars.
    """
    return int(column.max()) + 1


# Categorical feature groups fed to the model. Each group name is used both as the
# key into `features` and as the embedding `group_name`.
# The remaining groups in `features` (has_item, product_describe, product_month,
# f_fire_contract, product_yn, product_month_category) are not used by this model.
sparse_groups = ['user_category', 'product_latest', 'target_category']

# Numeric user features enter as one-dimensional dense inputs.
fixlen_feature_columns = [DenseFeat(feat, 1) for feat in features['user_numeric']]
for sparse_group in sparse_groups:
    fixlen_feature_columns += [
        SparseFeat(feat,
                   vocabulary_size=_vocab_size(input_data[feat]),
                   embedding_dim=8,
                   group_name=sparse_group)
        for feat in features[sparse_group]
    ]
feature_names = get_feature_names(fixlen_feature_columns)

In [44]:
from sklearn.model_selection import GroupShuffleSplit

# Split by ID rather than by row, so that all rows of one ID land on the same side.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
# n_splits=1 yields exactly one (train, test) pair, so take it directly.
train_index, test_index = next(
    group_splitter.split(input_data[feature_names], groups=input_data['ID'])
)

# Shuffle the rows inside each partition; reuse `seed` instead of a second hard-coded 42
# so the notebook has a single source of randomness configuration.
train = input_data.iloc[train_index].sample(frac=1, random_state=seed)
test = input_data.iloc[test_index].sample(frac=1, random_state=seed)

# No ID may appear in both partitions.
assert set(train.ID).isdisjoint(test.ID), "train and test partitions share IDs"

In [45]:
def _as_model_input(frame):
    """Map every model feature name to the matching column of ``frame``."""
    return {name: frame[name] for name in feature_names}


# One name -> column mapping per data partition.
train_model_input = _as_model_input(train)
test_model_input = _as_model_input(test)
oot_model_input = _as_model_input(oot_data)

In [46]:
# Bundle both partitions so they can be pickled together in the SAVE section.
input_dataset = dict(train=train, test=test)

# Modeling

In [47]:
import torch
from sklearn.metrics import log_loss, roc_auc_score
from deepctr_torch.models import DeepFM

In [48]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Each hyper-parameter group lives in its own dict because the same dicts are
# written into the model config JSON in the SAVE section.
model_params = dict(task='binary', l2_reg_embedding=1e-5, device=device)
compile_params = dict(
    optimizer="adagrad",
    loss="binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)
# NOTE(review): `validation_split` carves the validation rows out of `train` itself;
# unlike the train/test split above it is presumably not grouped by ID — confirm
# that this is acceptable for the validation metrics.
fit_params = dict(batch_size=128, epochs=45, verbose=2, validation_split=0.2)

# The same feature columns feed both the linear part and the DNN part of DeepFM.
model = DeepFM(
    linear_feature_columns=fixlen_feature_columns,
    dnn_feature_columns=fixlen_feature_columns,
    **model_params,
)
model.compile(**compile_params)
history = model.fit(train_model_input, train[target].values, **fit_params)

# Evaluate on the ID-disjoint test partition.
pred_ans = model.predict(test_model_input)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cpu
Train on 67442 samples, validate on 16861 samples, 527 steps per epoch
Epoch 1/45
7s - loss:  0.6308 - binary_crossentropy:  0.6308 - auc:  0.6881 - val_binary_crossentropy:  0.6273 - val_auc:  0.6931
Epoch 2/45
7s - loss:  0.6210 - binary_crossentropy:  0.6210 - auc:  0.7052 - val_binary_crossentropy:  0.6263 - val_auc:  0.6950
Epoch 3/45
7s - loss:  0.6186 - binary_crossentropy:  0.6187 - auc:  0.7084 - val_binary_crossentropy:  0.6256 - val_auc:  0.6963
Epoch 4/45
7s - loss:  0.6170 - binary_crossentropy:  0.6170 - auc:  0.7109 - val_binary_crossentropy:  0.6250 - val_auc:  0.6976
Epoch 5/45
7s - loss:  0.6158 - binary_crossentropy:  0.6158 - auc:  0.7127 - val_binary_crossentropy:  0.6247 - val_auc:  0.6987
Epoch 6/45
7s - loss:  0.6148 - binary_crossentropy:  0.6148 - auc:  0.7147 - val_binary_crossentropy:  0.6252 - val_auc:  0.6988
Epoch 7/45
7s - loss:  0.6137 - binary_crossentropy:  0.6137 - auc:  0.7161 - val_binary_crossentropy:  0.6236 - val_auc:  0.7007
Epoch 8/45
7s -

# SAVE

In [15]:
# Persist the trained model. `torch.save(model, ...)` stores the whole model object
# (not just a state dict), so loading it requires the model class to be importable.
model_path = f"models/{model_name}.pt"
# Create the output directory on a fresh checkout instead of failing in torch.save.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model, model_path)

In [16]:
import pickle

# Store the exact train/test partitions used for this model so later steps can
# reload them instead of re-splitting.
dataset_path = 'input/{}_dataset.pkl'.format(model_name)
with open(dataset_path, mode='wb') as dataset_file:
    pickle.dump(input_dataset, dataset_file)

In [17]:
import json

import numpy as np


def _builtin(value):
    """Return numpy scalars as plain Python scalars; pass every other value through."""
    return value.item() if isinstance(value, np.generic) else value


# Collect everything needed to reload and audit this run into one JSON document.
model_config_contants = {}
model_config_contants['model_name'] = model_name
model_config_contants['model_path'] = model_path
model_config_contants['random_seed'] = seed

# parameters
model_config_contants["model_params"] = model_params
model_config_contants['compile_params'] = compile_params
model_config_contants["fit_params"] = fit_params

# raw dataset
model_config_contants['rawdata'] = {'raw_dataset_path':raw_dataset_path,
                        'raw_dataset_name':raw_dataset_name}
# dataset
model_config_contants['data'] = {'data_path':dataset_path,
                        'target_label':target,
                        'num_of_features':len(feature_names),
                        'num_of_trainset':len(train),
                        'num_of_testset':len(test),

                        'train_ids':list(set(train.ID)),
                        'test_ids':list(set(test.ID))}

# The feature columns are tuple-like records that json writes as arrays. A
# vocabulary size computed with pandas' `max()` is a numpy integer, which json.dump
# rejects with a TypeError, so each record is stored as a list of built-in values.
# The JSON output is the same array-of-arrays as before, and because the dict now
# holds only built-ins it can also be re-dumped later in the notebook.
model_config_contants['features'] = {
    'feature_names':feature_names,
    'features':[[_builtin(field) for field in column] for column in fixlen_feature_columns],
}

with open(f'models/{model_name}_config.json', 'w') as f:
    json.dump(model_config_contants, f)

# ADD CLF

In [18]:
# NOTE(review): wildcard import. Several names used below are never imported
# explicitly in this notebook (modelResult, readModelConfig, make_chain_input,
# make_base_input, get_clf_pred, get_clf_chain_result, joblib); presumably they
# all arrive through this line — confirm, and prefer explicit imports.
from utils import *
# Project helpers keyed by the model name.
model_result = modelResult(model_name)
model_config = readModelConfig(model_name=model_name)
# Rebinds `data_config` (first created in an earlier cell) from the raw dataset
# name exposed by the model config.
data_config = readDataConfig(input_name=model_config.get_raw_dataset_name)

Model name:  model_delvariable_202305_small
Dataset name:  data_delvariable_202305


In [19]:
# Read back the stored training partition and score the model on it.
trainset = model_result.read_trainset()
df_accuracy, accuracy, accuracy_old = model_result.get_accuracy(data=trainset)

# Keep the index entries whose predicted item is neither of these two products.
keep_for_clf = df_accuracy.pred_item.ne('38.건강') & df_accuracy.pred_item.ne('11.일반종신')
clf_ids = list(df_accuracy[keep_for_clf].index)
len(clf_ids)

Num of Testset ids: 31942
Accuracy: 58.3%

data/OLD_202305.csv
Accuracy OLD: 48.0%


7755

In [20]:
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score, precision_score

In [21]:
# Rows for the add-on classifier: IDs selected above, excluding target categories 1 and 2.
in_clf_ids = trainset.ID.isin(clf_ids)
in_excluded_target = trainset.target_category.isin([1, 2])
df_for_clf_raw = trainset.loc[in_clf_ids & ~in_excluded_target]

# The selection must not contain fully duplicated rows.
assert not df_for_clf_raw.duplicated().any()

In [22]:
def _is_clf_column(col):
    """Column filter for the classifier frame.

    A column is kept when its name does not start with 'S', 'C' or 'F', does not
    contain 'yn', does not end in '_cat', and is not 'has_item'.
    """
    return (
        col[0] not in ('S', 'C', 'F')
        and 'yn' not in col
        and not col.endswith('_cat')
        and col != 'has_item'
    )


# Build the classifier frame from the columns that pass the filter.
clf_cols = [col for col in df_for_clf_raw.columns if _is_clf_column(col)]
df_for_clf = df_for_clf_raw[clf_cols]

In [23]:
# Inputs for the classifier chains, with 20% held out for evaluation.
X_chain, y_chain = make_chain_input(df_for_clf)
chain_split = train_test_split(X_chain, y_chain, test_size=0.2, random_state=seed)
chain_x_train, chain_x_test, chain_y_train, chain_y_test = chain_split

# Inputs for the "base" (non-chain) classifiers.
X_base, y_base = make_base_input(df_for_clf)

In [24]:
# `joblib` is imported explicitly: this cell calls joblib.dump, and the notebook
# otherwise only gets the name implicitly (the sole `import joblib` is commented out).
import joblib
from lightgbm import LGBMClassifier

# clf1. LightGBM classifier chain, tuned with a small grid search.
lgb_classifier_chain = ClassifierChain(LGBMClassifier(random_state=seed, verbose=0))

# The `base_estimator__` prefix routes each parameter to the LGBMClassifier inside the chain.
lgb_params = {'base_estimator__learning_rate':[0.1],
              'base_estimator__n_estimators':[64, 32],
              'base_estimator__num_leaves':[4, 5],
              'base_estimator__colsample_bytree':[0.65, 0.75, 0.9],
             }
scoring = {'accuracy':make_scorer(accuracy_score),
           'f1_score':make_scorer(f1_score, average='weighted')}
# Both metrics are recorded; the final estimator is refit on the best weighted F1.
grid_search = GridSearchCV(estimator=lgb_classifier_chain, param_grid=lgb_params, scoring=scoring, refit='f1_score', cv=3)
grid_result = grid_search.fit(chain_x_train, chain_y_train)
print(grid_result.best_params_)
lgb_bst = grid_result.best_estimator_
y_proba = lgb_bst.predict_proba(chain_x_test)

# Accuracy of the tuned chain on the held-out chain split.
lgb_chain_accuracy = get_clf_pred(chain_y_test, y_proba)

# Persist the tuned chain next to the main model.
clf_model_path = f"models/{model_name}_lgb_chain.pkl"
joblib.dump(lgb_bst, clf_model_path)

{'base_estimator__colsample_bytree': 0.65, 'base_estimator__learning_rate': 0.1, 'base_estimator__n_estimators': 64, 'base_estimator__num_leaves': 5}

Accuracy only CLF: 0.3302


['models/model_delvariable_202305_small_lgb_chain.pkl']

In [25]:
# binary = get_clf_pred(chain_y_test, y_proba, output='binary')
# accuracy_score(chain_y_test, binary, sample_weight='weighted')
# rep = classification_report(chain_y_test, binary)
# print(rep)

In [26]:
# Per-label breakdown for the LightGBM chain. The probabilities are recomputed from
# `lgb_bst` instead of reading the shared `y_proba` variable, which the XGBoost and
# CatBoost cells overwrite; re-running this cell after them would otherwise report
# a different model's results.
lgb_y_proba = lgb_bst.predict_proba(chain_x_test)
clf_chain_result = get_clf_chain_result(chain_y_test, lgb_y_proba)
clf_chain_result


Accuracy only CLF: 0.3302


Unnamed: 0,true_count,pred_count,pred_true,accuracy_pred,accuracy_true
3,263,550,244,44.4,92.8
4,113,90,27,30.0,23.9
5,81,108,20,18.5,24.7
6,40,7,0,0.0,0.0
7,80,178,45,25.3,56.2
8,37,28,9,32.1,24.3
9,3,0,0,,0.0
10,8,2,0,0.0,0.0
11,7,9,2,22.2,28.6
12,4,0,0,,0.0


In [27]:
# clf2. XGBoost classifier chain, tuned with a grid search.
# `joblib` is imported explicitly: this cell calls joblib.dump, and the notebook
# otherwise only gets the name implicitly (the sole `import joblib` is commented out).
import joblib
from xgboost import XGBClassifier

# XGBoost's logging switch is `verbosity`; `verbose` is not an XGBClassifier
# parameter and is only forwarded to the booster as an unknown parameter.
xgb_classifier_chain = ClassifierChain(XGBClassifier(random_state=seed, verbosity=0))
# The `base_estimator__` prefix routes each parameter to the XGBClassifier inside the chain.
xgb_params = {
    'base_estimator__learning_rate':[0.01, 0.1, 0.2],
    'base_estimator__max_depth' : [3, 4, 5],
    'base_estimator__n_estimators' : [50, 100, 200],
    'base_estimator__subsample' : [0.8, 0.9, 1.0],
    'base_estimator__colsample_bytree' : [0.8, 0.9, 1.0],
             }
scoring = {'accuracy':make_scorer(accuracy_score),
           'f1_score':make_scorer(f1_score, average='weighted')}

# Both metrics are recorded; the final estimator is refit on the best accuracy.
grid_search = GridSearchCV(estimator=xgb_classifier_chain, param_grid=xgb_params, scoring=scoring, refit='accuracy', cv=3)
grid_result = grid_search.fit(chain_x_train, chain_y_train)
print(grid_result.best_params_)
xgb_bst = grid_result.best_estimator_
y_proba = xgb_bst.predict_proba(chain_x_test)

# Accuracy of the tuned chain on the held-out chain split.
xgb_chain_accuracy = get_clf_pred(chain_y_test, y_proba)

# save
clf_xgb_model_path = f"models/{model_name}_xgb_chain.pkl"
joblib.dump(xgb_bst, clf_xgb_model_path)

{'base_estimator__colsample_bytree': 0.9, 'base_estimator__learning_rate': 0.1, 'base_estimator__max_depth': 5, 'base_estimator__n_estimators': 100, 'base_estimator__subsample': 0.8}

Accuracy only CLF: 0.3169


['models/model_delvariable_202305_small_xgb_chain.pkl']

In [28]:
# y_binary = get_clf_pred(chain_y_test, y_proba, output='binary')
# accuracy_score(chain_y_test, y_binary, normalize=True, sample_weight=)

In [29]:
# clf3. CatBoost classifier chain, tuned with a small grid search.
# `joblib` is imported explicitly: this cell calls joblib.dump, and the notebook
# otherwise only gets the name implicitly (the sole `import joblib` is commented out).
import joblib
from catboost import CatBoostClassifier

cat_classifier_chain = ClassifierChain(CatBoostClassifier(random_state=seed, verbose=0))
# The `base_estimator__` prefix routes each parameter to the CatBoostClassifier inside the chain.
cat_params = {
    'base_estimator__learning_rate':[0.1, 0.01],
    'base_estimator__depth':[3, 5, 6],
}
scoring = {
    'accuracy':make_scorer(accuracy_score),
    'f1_score':make_scorer(f1_score, average='micro')
}

# Both metrics are recorded; the final estimator is refit on the best accuracy.
grid_search = GridSearchCV(estimator=cat_classifier_chain, param_grid=cat_params, scoring=scoring, refit='accuracy', cv=3)
grid_result = grid_search.fit(chain_x_train, chain_y_train)
print(grid_result.best_params_)
cat_bst = grid_result.best_estimator_
y_proba = cat_bst.predict_proba(chain_x_test)

# Accuracy of the tuned chain on the held-out chain split.
cat_chain_accuracy = get_clf_pred(chain_y_test, y_proba)

# save
clf_cat_model_path = f"models/{model_name}_cat_chain.pkl"
joblib.dump(cat_bst, clf_cat_model_path)

{'base_estimator__depth': 6, 'base_estimator__learning_rate': 0.1}

Accuracy only CLF: 0.3004


['models/model_delvariable_202305_small_cat_chain.pkl']

In [30]:
# # clf2. XGBoost
# # !pip install xgboost
# import xgboost as xgb
# xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(data_config.get_target_categories), random_state=seed)
# xgb_params = {
    
#         'learning_rate':[0.01, 0.1, 0.2],
#         'max_depth' : [3, 4, 5],
#         'n_estimators' : [50, 100, 200],
#         'subsample' : [0.8, 0.9, 1.0],
#         'colsample_bytree' : [0.8, 0.9, 1.0],
#         }
# xgb_params = {'colsample_bytree': [0.8],
#               'learning_rate': [0.01],
#               'max_depth': [2],
#          'n_estimators': [40, 50],
#          'subsample': [1.0]}

# grid_search = GridSearchCV(xgb_model, xgb_params, scoring='accuracy', cv=3)
# grid_result = grid_search.fit(x_train, y_train)
# print(grid_result.best_params_)
# xgb_bst = grid_result.best_estimator_
# y_pred_class = xgb_bst.predict(x_test)

# xgb_base_accuracy = accuracy_score(y_test, y_pred_class)
# print(f'XGB Accuracy: {xgb_base_accuracy:.4f}')
# joblib.dump(xgb_bst, f"models/{model_name}_xgb_base.pkl")

In [31]:
# # clf1. LightGBM
# import joblib
# from lightgbm import LGBMClassifier
# x_train, x_test, y_train, y_test = train_test_split(X_base, y_base, test_size=0.2, random_state=seed)
# lgb_model = LGBMClassifier(objective='multiclass',
#                            num_class=len(data_config.get_target_categories),
#                            random_state=seed,
#                           verbose=-1)

# lgb_params = {'learning_rate':[ 0.2],
#               'n_estimators':[16, 32],
#               'num_leaves':[4, 5],
#               'colsample_bytree':[ 0.75],
#               'subsample':[0.5, 0.8],
#              }
# grid_search = GridSearchCV(estimator=lgb_model, param_grid=lgb_params, scoring='accuracy', cv=3)
# grid_result = grid_search.fit(x_train, y_train)
# print(grid_result.best_params_)
# lgb_bst = grid_result.best_estimator_
# y_pred_class = lgb_bst.predict(x_test)

# lgb_base_accuracy = accuracy_score(y_test+3, y_pred_class+3)
# print(f'LGBM Accuracy: {lgb_base_accuracy:.4f}')
# joblib.dump(lgb_bst, f"models/{model_name}_lgb_base.pkl")

In [32]:
# # clf3. catBoost
# from catboost import CatBoostClassifier
# cat_model = CatBoostClassifier(loss_function='MultiClass',
#                                eval_metric='Accuracy',
#                                verbose=0)
# cat_params = {'learning_rate':[0.01],
#               'depth':[3,5,6],
#              }
# grid_search = GridSearchCV(estimator=cat_model, param_grid=cat_params, scoring='accuracy', cv=3)
# grid_result = grid_search.fit(x_train, y_train)
# print(grid_result.best_params_)
# cat_bst = grid_result.best_estimator_
# y_pred_class = cat_bst.predict(x_test)

# cat_accuracy = accuracy_score(y_test, y_pred_class)
# print(f'LGBM Accuracy: {cat_accuracy:.4f}')
# joblib.dump(cat_bst, f"models/{model_name}_lgb.pkl")

In [33]:
# Accuracy of each tuned classifier chain on the held-out chain split
# (the 'lgb_base' candidate is currently disabled).
clf_model_accuracy = dict(
    lgb_chain=lgb_chain_accuracy,
    xgb_chain=xgb_chain_accuracy,
    cat_chain=cat_chain_accuracy,
)

print(clf_model_accuracy)
# Name of the chain with the highest accuracy; on a tie the first entry wins.
max_key = max(clf_model_accuracy.items(), key=lambda item: item[1])[0]
max_key

{'lgb_chain': 0.33024691358024694, 'xgb_chain': 0.3168724279835391, 'cat_chain': 0.3004115226337449}


'lgb_chain'

In [34]:
# Record the winning classifier chain and the runners-up in the model config,
# then rewrite the config JSON.
other_models = set(clf_model_accuracy) - set([max_key])
model_config_contants['add_clf'] = {'clf_model_path':f"models/{model_name}_{max_key}.pkl",
                                    'clf_features':clf_cols,
                                    # sorted(): iterating a set of strings has no stable order
                                    # across kernel sessions, which made this list (and thus the
                                    # saved config) differ between otherwise identical runs.
                                    'other_model_path':[f"models/{model_name}_{clf_name}.pkl" for clf_name in sorted(other_models)],
                                    'multioutput_columns':list(clf_chain_result.index),}
with open(f'models/{model_name}_config.json', 'w') as f:
    json.dump(model_config_contants, f)