In [1]:
""" やること
1. エンコードされたtrain.csv, test.csvの作成(ツリーモデル用)
"""
import pandas as pd
from tqdm import tqdm
import category_encoders as ce
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

"""
各種設定
"""
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Project root and the competition data folder that lives inside it.
PROJECT_FOLDER_PATH = "/Users/ritumutaka/J/Categorical_Feature_Encoding_Challenge_II"
DATA_FOLDER_PATH = PROJECT_FOLDER_PATH + "/cat-in-the-dat-ii"

# Seed handed to the cross-validation splitter.
RANDOM_STATE = 2020

# NOTE(review): this flag is not read by any cell visible in this notebook - confirm it is still needed.
MAKE_TARGET_ENCODED_DATA = True
# When True the ordinal-encoding cell does nothing here (that work is implemented in OrdEncode.ipynb).
MAKE_ORD_ENCODED_DATA = True
# Maximum number of rows kept from train.csv / test.csv when loading.
LOAD_NUM = 600000

In [2]:
"""
関数定義
"""
def count_plot(data_series):
    """Draw a bar chart of the value counts of ``data_series``.

    The second bar (second most frequent value) is highlighted in dark red and
    the share of rows whose value equals 1 is written below the axes.

    Args:
        data_series: pandas Series of labels (e.g. the ``target`` column).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    values = data_series.value_counts()

    # Share of rows whose label is 1, as a percentage of all counted rows.
    # Guard the division so an empty series reports 0.0 instead of NaN.
    total = int(values.sum())
    positives = int(values[values.index == 1].sum())
    positive_pct = round(100.0 * positives / total, 2) if total else 0.0

    fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    bars = ax.bar(values.index, values, color='lightgreen', alpha=0.8)
    # Only highlight a second bar when there is one (single-class input would raise IndexError).
    if len(values) > 1:
        bars[1].set_color('darkred')

    ax.set_title('Target Distribution')
    ax.annotate("percentage of target 1 : {}%".format(positive_pct),
                xy=(0, 0), xycoords='axes fraction',
                xytext=(0, -50), textcoords='offset points',
                va="top", ha="left", color='grey',
                bbox=dict(boxstyle='round', fc="w", ec='w'))

    ax.set_xlabel('Target', fontsize=12, weight='bold')
    plt.show()

def plot_roc(y_true, y_pred, model_name):
    """Print the ROC AUC of a model and draw its ROC curve.

    Args:
        y_true: true labels, passed to ``roc_curve``.
        y_pred: predicted scores, passed to ``roc_curve``.
        model_name: label used in the printed line and as the plot title.

    Returns:
        None. Prints ``"<model_name>: <auc>"`` and shows the figure.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    print("%s: %s" % (model_name, area_under_curve))

    # Curve first, then decorations; everything goes to the current pyplot figure.
    plt.plot(false_pos_rate, true_pos_rate, marker='o')
    plt.title(model_name)
    plt.xlabel('FPR: False positive rate')
    plt.ylabel('TPR: True positive rate')
    plt.grid()
    plt.show()

In [3]:
"""
データの読み込み
train, test, sample_submission 
"""

# Columns handed to the target encoder.
ENCODE_FEATURES = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
                   'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
                   'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5',
                   'day', 'month']

# Read at most LOAD_NUM rows (nrows stops parsing early instead of loading the
# whole file and slicing afterwards), using the first column as the index.
train = pd.read_csv(DATA_FOLDER_PATH + "/input/train.csv", index_col=0, nrows=LOAD_NUM).sort_index()
test = pd.read_csv(DATA_FOLDER_PATH + "/input/test.csv", index_col=0, nrows=LOAD_NUM).sort_index()
sample_submission = pd.read_csv(DATA_FOLDER_PATH + "/input/sample_submission.csv", index_col=0)

# Working copies for editing. test_X must be a real copy: a plain `test_X = test`
# is only an alias, so later in-place edits would silently change `test` as well.
train_X, train_y = train.drop(columns=['target']), train['target']
test_X = test.copy()

5it [00:32,  6.50s/it]


In [None]:
"""
特徴生成 & 特徴選択
"""
# Drop columns that are not used as features.
# The drop is not done in place so that a frame sharing memory with test_X/train_X
# is left untouched, and errors='ignore' keeps the cell safe to run more than once.
train_X = train_X.drop(columns=['ord_3'], errors='ignore')
test_X = test_X.drop(columns=['ord_3'], errors='ignore')

# Null-indicator columns were tried and judged not useful; kept for reference only.
# for col in ['ord_5', 'nom_7', 'nom_8', 'ord_3']:
#     train_X[f'{col}_isnull_count'] = train[col].isnull().astype('int')
#     test_X[f'{col}_isnull_count'] = test[col].isnull().astype('int')

In [None]:
"""
ターゲットエンコーディング
"""

# Target-encode the feature frames produced by the previous cells.
# train_X / test_X are deliberately NOT rebuilt from train / test here: doing so
# would throw away the feature selection done above and can leave the two frames
# with different columns.
# NOTE: this cell overwrites train_X / test_X with their encoded versions, so
# re-run the loading and feature-selection cells before running it again.
assert list(train_X.columns) == list(test_X.columns), \
    "train_X and test_X must have identical columns before target encoding"

skf = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
# Only encode the configured columns that survived feature selection.
encode_features = [col for col in ENCODE_FEATURES if col in train_X.columns]
smoothing = 0.20

# Encode the training data out-of-fold: every row is transformed by an encoder
# that was fitted on the other folds only, so a row never sees its own target.
oof_parts = []
for tr_index, oof_index in tqdm(skf.split(train_X, train_y)):
    fold_encoder = ce.TargetEncoder(cols=encode_features, smoothing=smoothing)
    fold_encoder.fit(train_X.iloc[tr_index, :], train_y.iloc[tr_index])
    oof_parts.append(fold_encoder.transform(train_X.iloc[oof_index, :]))
# Concatenate once (DataFrame.append in a loop is quadratic and no longer exists
# in pandas 2.x); the original row index is kept so the rows can be re-sorted.
oof = pd.concat(oof_parts)

# Encode the test data with an encoder fitted on the full (still raw) training data.
ce_target_encoder = ce.TargetEncoder(cols=encode_features, smoothing=smoothing)
ce_target_encoder.fit(train_X, train_y)
test_X = ce_target_encoder.transform(test_X, override_return_df=True)

# Restore the original row order of the out-of-fold training frame.
train_X = oof.sort_index()
assert len(train_X) == len(train_y), "out-of-fold encoding must cover every training row exactly once"


In [4]:
"""
序数変数のエンコード
(単純に階級を数字に置き換えているだけ)
"""

# When MAKE_ORD_ENCODED_DATA is True there is nothing to do in this notebook
# (that path is implemented in OrdEncode.ipynb). Otherwise the ordinal columns
# are taken from the previously written *_ord.csv files.
if not MAKE_ORD_ENCODED_DATA:
    ord_columns = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4']
    read_columns = ['id'] + ord_columns

    # The id column must be the index: otherwise the test frame gets a 0-based
    # running index, nothing aligns with test_X and NaN is assigned (see the wiki).
    train_temp = pd.read_csv(DATA_FOLDER_PATH + "/output/train_ord.csv", usecols=read_columns, index_col='id')
    test_temp = pd.read_csv(DATA_FOLDER_PATH + "/output/test_ord.csv", usecols=read_columns, index_col='id')

    # read_csv returns usecols in file order and frame-to-frame assignment pairs
    # columns by position, so select the columns by name in the target order.
    train_X[ord_columns] = train_temp[ord_columns]
    test_X[ord_columns] = test_temp[ord_columns]

    # Free the temporary frames.
    del train_temp
    del test_temp

In [6]:
"""
データの保存
"""
# Row-count thresholds below which a frame is treated as a partial load.
FULL_TRAIN_ROWS = 600000
FULL_TEST_ROWS = 400000

# Skip saving when EITHER frame is partial; requiring both to be small would let
# a run with only one truncated frame overwrite the full output files.
if len(train_X) < FULL_TRAIN_ROWS or len(test_X) < FULL_TEST_ROWS:
    print('少数データのみ読み込んでいるため、保存はパスします')
else:
    # The training file carries the target as an extra column next to the features.
    pd.concat([train_X, train_y], axis=1).to_csv(f"{PROJECT_FOLDER_PATH}/cat-in-the-dat-ii/output/train_tree.csv")
    test_X.to_csv(f"{PROJECT_FOLDER_PATH}/cat-in-the-dat-ii/output/test_tree.csv")
    print('保存しました')

保存しました
