In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_curve, auc, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Boosting Algorithms :
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [3]:
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Values are cast to ``str`` before both fitting and transforming, so the
    labels handed to ``transform`` always have the same type as the classes
    learned in ``fit``.

    Parameters
    ----------
    columns : sequence of str, optional
        Names of the columns to encode. ``None`` means every column of the
        frame passed to ``fit`` / ``transform``.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode
        self.dict_encoder = {} # column name -> fitted LabelEncoder

    def _target_columns(self, X):
        """Return the configured columns, or all columns of ``X`` if none were given."""
        return list(X.columns) if self.columns is None else list(self.columns)

    def fit(self,X,y=None):
        """Fit one ``LabelEncoder`` per target column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self`` so the call can be chained.
        """
        # Start from a clean mapping so a refit never keeps encoders for
        # columns that are no longer being encoded.
        self.dict_encoder = {}
        for i in self._target_columns(X):
            encoder = preprocessing.LabelEncoder()
            encoder.fit(X[i].astype(str))
            self.dict_encoder[i] = encoder
        return self

    def transform(self,X):
        """Return a copy of the target columns of ``X`` with labels replaced by integer codes.

        Raises whatever ``LabelEncoder.transform`` raises for labels that
        were not present during ``fit``.
        """
        cols = self._target_columns(X)
        output = X[cols].copy()
        for i in cols:
            # Same str cast as in fit(); without it non-string values (NaN,
            # ints) would never match the string classes learned there.
            output[i] = self.dict_encoder[i].transform(output[i].astype(str))
        return output

def evaluate_model(model, X, y):
    """Print the AUC and Gini of ``model`` on ``(X, y)`` and plot its ROC curve.

    The score passed to ``roc_curve`` is column 1 of ``model.predict_proba(X)``.
    If the model does not offer ``predict_proba``, the output of
    ``model.predict(X)`` is used instead.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba`` and/or ``predict``.
    X : features accepted by ``model``.
    y : true labels, passed to ``roc_curve`` as-is.
    """
    try:
        y_pred = model.predict_proba(X)[:, 1]
    except (AttributeError, NotImplementedError):
        # Fall back only when the probability API is missing. Any other
        # failure (bad input, interrupt, ...) must surface instead of being
        # silently replaced by hard-label predictions.
        y_pred = model.predict(X)

    fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y, y_pred)
    auc_s = auc(fpr_rt_lm, tpr_rt_lm)
    print('AUC: ', auc_s)
    print('Gini: ', 2*auc_s - 1)

    # Dashed diagonal is drawn as the reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [10]:
# Directory holding the dataset, and the train/test CSV files inside it.
folder_path = 'E:/cached_data/HR_pred'
train_path = f'{folder_path}/train_LZdllcl.csv'
test_path = f'{folder_path}/test_2umaH9m.csv'

In [11]:
# Data loading
# Read the train CSV first, then the test CSV, each into its own DataFrame.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [12]:
# cols classifying
cols_id = ['employee_id']
col_label = 'is_promoted'
# Features: every training column that is neither an id nor the label.
cols_ft = [name for name in df_train.columns if name not in (*cols_id, col_label)]
# Categorical features are the object-dtype ones; the rest count as numeric.
cols_ft_cat = [name for name in cols_ft if df_train[name].dtype == 'object']
cols_ft_num = [name for name in cols_ft if name not in cols_ft_cat]

In [13]:
# for i in cols_ft_cat:
#     print( df_train.groupby(i)['employee_id'].count().sort_values(ascending = False) )
# # => the data is fairly clean

In [14]:
# Replace missing categorical values with the literal 'missing' in both
# frames; each right-hand side is computed from its own frame.
df_train[cols_ft_cat], df_test[cols_ft_cat] = (
    frame[cols_ft_cat].fillna('missing') for frame in (df_train, df_test)
)

In [15]:
# Hold out 33% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train[cols_ft],
    df_train[col_label],
    test_size=0.33,
    random_state=42,
)

In [20]:
# Model compile
# One seed shared by every stochastic learner so reruns build the same model.
seed = 42
estimators = [
    ('rf', ensemble.RandomForestClassifier(n_estimators=10, random_state=seed)),
    ('xtree', ensemble.ExtraTreesClassifier(random_state=seed)),
    ('gbt', ensemble.GradientBoostingClassifier(random_state=seed)),
    ('ada', ensemble.AdaBoostClassifier(random_state=seed)),
    ('xgb', XGBClassifier(random_state=seed)),
    ('cat', CatBoostClassifier(random_state=seed)),
    ('lightgbm', LGBMClassifier(random_state=seed)),
]
# Stack the base learners; a logistic regression combines their
# cross-validated predictions.
clf = ensemble.StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(),
    cv = 10, n_jobs = -1
)

# Categorical features: integer-encode each column.
p_cat = Pipeline([
    ('encode', MultiColumnLabelEncoder(columns = cols_ft_cat))
])
# Numeric features: median-impute, then standardise.
p_num = Pipeline([
    ('fillna', SimpleImputer(strategy='median')),
    ('norm', preprocessing.StandardScaler())
])
p1 = ColumnTransformer([
    ('cat', p_cat, cols_ft_cat),
    ('num', p_num, cols_ft_num)
])

pipe = Pipeline([
    ('transformer', p1),
    ('predictor', clf)
])

#### Fitting
# Fit on the training split only. X_test is a subset of df_train, so fitting
# on all of df_train before scoring X_test would leak the hold-out rows into
# training and inflate the reported "Test AUC".
# NOTE(review): a category present only in the hold-out rows would make the
# label encoder raise during evaluation - confirm this cannot happen here.
pipe.fit(X_train, y_train)

# Evaluating
print('Test AUC')
evaluate_model(pipe, X_test[cols_ft], y_test)

print('Train AUC')
evaluate_model(pipe, X_train[cols_ft], y_train)

# Refit on every labelled row so that code using `pipe` after this cell gets
# a model trained on the full data set. This doubles the fitting time of the cell.
pipe.fit(df_train[cols_ft], df_train[col_label]);

KeyboardInterrupt: 

In [17]:
# Store the predicted label for every test row under the label column name.
# col_label is used instead of a repeated string literal so the column name
# is defined in a single place.
df_test[col_label] = pipe.predict(df_test[cols_ft])

In [19]:
# Write the id and predicted-label columns to the submission CSV, leaving
# out the DataFrame index.
df_test[['employee_id', 'is_promoted']].to_csv(
    path_or_buf=folder_path + '/submission.csv',
    index=False,
)