In [1]:
import os
import pickle

import category_encoders as ce
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Notebook-wide configuration: display options, data location and the shared seed.
pd.set_option('display.max_columns', 100)

# Root folder of the competition data. The default is the original author's
# local checkout; set the CAT_IN_THE_DAT_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
# Kept as a plain str because later cells build file names with `+`.
DATA_FOLDER_PATH = os.environ.get(
    "CAT_IN_THE_DAT_DATA_DIR",
    "/Users/ritumutaka/J/Categorical_Feature_Encoding_Challenge_II/datas/cat-in-the-dat-ii",
)

# Seed passed to every estimator / splitter that accepts one.
RANDOM_STATE = 2020

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
"""
関数定義
"""
def count_plot(data_series):
    """Show a bar chart of the class counts of a target column.

    The bar for class ``1`` is drawn in dark red and the share of class ``1``
    among all counted values is written underneath the axes.

    Parameters
    ----------
    data_series : pandas.Series
        Target values; counted with ``value_counts()``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    values = data_series.value_counts()
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    bars = ax.bar(values.index, values, color='lightgreen', alpha=0.8)
    # value_counts() orders classes by frequency, so the bar for class 1 is
    # located by its label rather than by position.
    for label, bar in zip(values.index, bars):
        if label == 1:
            bar.set_color('darkred')

    # Share of class 1 among all counted rows, guarded against empty input.
    total = values.sum()
    positives = values.loc[1] if 1 in values.index else 0
    positive_pct = 100.0 * positives / total if total else 0.0

    ax.set_title('Target Distribution')
    ax.annotate("percentage of target 1 : {:.2f}%".format(positive_pct),
                xy=(0, 0), xycoords='axes fraction',
                xytext=(0, -50), textcoords='offset points',
                va="top", ha="left", color='grey',
                bbox=dict(boxstyle='round', fc="w", ec='w'))

    ax.set_xlabel('Target', fontsize=12, weight='bold')
    plt.show()

def plot_roc(y_true, y_pred, model_name):
    """Print the ROC AUC of a model and display its ROC curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed to ``roc_curve``.
    y_pred : array-like
        Predicted scores, passed to ``roc_curve``.
    model_name : str
        Used as the plot title and as the prefix of the printed AUC line.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred)
    area_under_curve = auc(false_positive_rate, true_positive_rate)
    print("%s: %s" % (model_name, area_under_curve))

    # Curve first, decorations afterwards; all calls act on the current axes.
    plt.plot(false_positive_rate, true_positive_rate, marker='o')
    plt.title(model_name)
    plt.xlabel('FPR: False positive rate')
    plt.ylabel('TPR: True positive rate')
    plt.grid()
    plt.show()

In [3]:
"""
データの読み込み & ターゲットエンコーディング
train_X, train_y
test_X
"""

# Load the raw competition files and obtain target-encoded feature frames.
# Defines: train, test, sample_submission, train_X, train_y, test_X.

# False -> read the cached encoded CSVs; True -> rebuild them from the raw data.
MAKE_TARGET_ENCODED_DATA = False
ENCODE_FEATURES = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
                   'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
                   'day', 'month']

train = pd.read_csv(DATA_FOLDER_PATH + "/input/train.csv", index_col=0).sort_index()
test = pd.read_csv(DATA_FOLDER_PATH + "/input/test.csv", index_col=0).sort_index()
sample_submission = pd.read_csv(DATA_FOLDER_PATH + "/input/sample_submission.csv", index_col=0)

train_y = train['target']

if MAKE_TARGET_ENCODED_DATA:
    # Build the target-encoded data from the raw frames.
    raw_train_X = train.drop(columns=['target'])

    skf = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
    smoothing = 0.20

    # Train rows are encoded out-of-fold: each fold's encoder is fitted on the
    # other folds only, so a row's own target never enters its encoding.
    # Parts are collected in a list and concatenated once (DataFrame.append was
    # removed in pandas 2.0 and grows quadratically when used in a loop).
    oof_parts = []
    for tr_index, oof_index in skf.split(raw_train_X, train_y):
        fold_encoder = ce.TargetEncoder(cols=ENCODE_FEATURES, smoothing=smoothing)
        fold_encoder.fit(raw_train_X.iloc[tr_index, :], train_y.iloc[tr_index])
        oof_parts.append(fold_encoder.transform(raw_train_X.iloc[oof_index, :]))
    train_X = pd.concat(oof_parts).sort_index()

    # Test rows are encoded by an encoder fitted on the complete training set.
    # transform() returns a new frame; it must be assigned, otherwise the raw
    # (un-encoded) test data would be saved below.
    ce_target_encoder = ce.TargetEncoder(cols=ENCODE_FEATURES, smoothing=smoothing)
    ce_target_encoder.fit(raw_train_X, train_y)
    test_X = ce_target_encoder.transform(test)

    # Cache the encoded frames for later runs.
    train_X.to_csv(DATA_FOLDER_PATH + "/train_X_TargetEncoded.csv")
    test_X.to_csv(DATA_FOLDER_PATH + "/test_X_TargetEncoded.csv")
else:
    # Reuse the encoded frames written by an earlier run of the branch above.
    train_X = pd.read_csv(DATA_FOLDER_PATH + "/train_X_TargetEncoded.csv", index_col=0)
    test_X = pd.read_csv(DATA_FOLDER_PATH + "/test_X_TargetEncoded.csv", index_col=0)

load data
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [10]:
"""
序数変数のエンコード
(単純に階級を数字に置き換えているだけ)
"""
# Replace ord_0..ord_4 with the pre-computed numeric versions stored in
# train_ord.csv / test_ord.csv.
# The id column must be the index of the loaded frames: the assignment below
# aligns rows on the index, and a default 0..n-1 index on the test side would
# leave the new columns filled with NaN.
ord_columns = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4']
train_temp = pd.read_csv(DATA_FOLDER_PATH + "/train_ord.csv", usecols=['id'] + ord_columns, index_col='id')
test_temp = pd.read_csv(DATA_FOLDER_PATH + "/test_ord.csv", usecols=['id'] + ord_columns, index_col='id')

# Fail loudly instead of silently writing NaN when ids do not line up.
assert train_X.index.isin(train_temp.index).all(), "train_ord.csv does not cover every train id"
assert test_X.index.isin(test_temp.index).all(), "test_ord.csv does not cover every test id"

# Select the columns by name so the result does not depend on the column order
# inside the CSV files.
train_X[ord_columns] = train_temp[ord_columns]
test_X[ord_columns] = test_temp[ord_columns]

# Release the temporary frames.
del train_temp, test_temp

In [12]:
"""
特徴生成 & 特徴選択
"""
# ord_5 is removed from the feature set. errors='ignore' plus re-assignment
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the column is already gone.
train_X = train_X.drop(columns=['ord_5'], errors='ignore')
test_X = test_X.drop(columns=['ord_5'], errors='ignore')

# Per-row count of missing values, taken from the raw frames and aligned to
# the feature frames on the index.
train_X['isnull_count'] = train.isnull().sum(axis=1)
test_X['isnull_count'] = test.isnull().sum(axis=1)

In [13]:
"""
モデル定義
"""
# Linear model: logistic regression with no regularisation penalty.
# NOTE(review): recent scikit-learn releases spell this option `penalty=None`
# instead of the string 'none' - confirm against the installed version.
LinearModel_LogisticRegression = linear_model.LogisticRegression(
    penalty='none',
    solver='lbfgs',
    fit_intercept=True,
    max_iter=2020,
    random_state=RANDOM_STATE,
    verbose=0,
)

# Tree model: LightGBM classifier with its default parameters.
TreeModel_LightGbm = lgb.LGBMClassifier()

In [14]:
%%time
"""
KFoldでvalidation
"""
kf = KFold(n_splits=5, shuffle=False, random_state=RANDOM_STATE)

EnsembleModels = []
RocScores = {}

for i, [tr_index, val_index] in tqdm(enumerate(kf.split(train_X, train_y))):
    TempModel_LogisticRegression = LinearModel_LogisticRegression.fit(train_X.iloc[tr_index, :], train_y.iloc[tr_index])
    TempModel_LightGbm = TreeModel_LightGbm.fit(train_X.iloc[tr_index, :], train_y.iloc[tr_index])
    EnsembleModels.extend([
        (f'LogisticRegression_{i}', TempModel_LogisticRegression),
        (f'LightGbm_{i}', TempModel_LightGbm),
    ])
    
    # validation
    RocScores[f'{i}'] = [
        roc_auc_score(train_y.iloc[val_index], TempModel_LogisticRegression.predict_proba(train_X.iloc[val_index, :])[:, 1]),
        roc_auc_score(train_y.iloc[val_index], TempModel_LightGbm.predict_proba(train_X.iloc[val_index, :])[:, 1]),
    ]
    print(val_index)
   



[     0      1      2 ... 119997 119998 119999]
[120000 120001 120002 ... 239997 239998 239999]
[240000 240001 240002 ... 359997 359998 359999]
[360000 360001 360002 ... 479997 479998 479999]
[480000 480001 480002 ... 599997 599998 599999]
CPU times: user 6min 25s, sys: 17.5 s, total: 6min 42s
Wall time: 3min 12s


In [15]:
"""
モデルを保存
"""
# Persist the fitted fold models and the local CV scores as pickle files.
# NOTE(review): these paths are relative to the working directory, unlike
# DATA_FOLDER_PATH - presumably they point at the same data folder; confirm.
for payload, pickle_path in (
    (EnsembleModels, "../datas/cat-in-the-dat-ii/models/EnsembleModels_20200301_00.pkl"),
    (RocScores, "../datas/cat-in-the-dat-ii/models/RocScores_20200301_00.pkl"),
):
    with open(pickle_path, mode="wb") as f:
        pickle.dump(payload, f)

In [39]:
"""
アンサンブル予測
"""
# Ensemble prediction: one column of positive-class probabilities per stored
# model, built in a single DataFrame construction instead of repeated concat.
# The rows are labelled with test_X's own index because the predictions are
# produced from test_X row by row.
EnsembleDf = pd.DataFrame(
    {model_name: model.predict_proba(test_X)[:, 1] for model_name, model in EnsembleModels},
    index=test_X.index,
)

# Put the rows in submission order; ids missing from test_X become NaN.
EnsembleDf = EnsembleDf.reindex(sample_submission.index)

# Order the columns by model name.
EnsembleDf = EnsembleDf.sort_index(axis=1)

In [41]:
# Average the per-model probabilities row-wise, store the result as the
# submission target (assignment aligns on the index) and write the CSV.
mean_prediction = EnsembleDf.mean(axis=1)
sample_submission['target'] = mean_prediction

submission_path = "../datas/cat-in-the-dat-ii/submission_20200301.csv"
sample_submission.to_csv(submission_path)