In [9]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV files read later live under MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

import matplotlib.pyplot as plt

from scipy.stats import randint, uniform

In [11]:
# Folder on the mounted Drive that holds the competition CSV files.
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'

# Labelled rows come from train.csv; the frame named test_df is read from submission.csv.
train_df = pd.read_csv(filepath_or_buffer=f'{base_path}/train.csv')
test_df = pd.read_csv(filepath_or_buffer=f'{base_path}/submission.csv')

In [12]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]


def _strip_and_upper(value):
    """Return `value` with spaces and hyphens removed and upper-cased.

    Missing values (anything `pd.isna` reports as missing) are returned unchanged.
    """
    if pd.isna(value):
        return value
    return value.replace(" ", "").replace('-', '').upper()


# Normalise every categorical column in both frames so that spellings differing
# only by spaces, hyphens or letter case collapse into a single value.
for frame in (train_df, test_df):
    for col in label_columns:
        frame[col] = frame[col].apply(_strip_and_upper)

In [13]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical Series into integer codes.

    Every element is first cast to `str`; the distinct strings are sorted and
    numbered 0, 1, 2, ... in that (lexicographic) order, and each element is
    replaced by the number of its string.

    Args:
        series: Values to encode.

    Returns:
        A Series of integer codes, one per input element.
    """
    # Cast everything to text so values of mixed types can be sorted together.
    as_text = series.astype(str)

    # Sorted distinct strings -> consecutive integer codes.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}

    return as_text.map(code_of)

In [14]:
# Columns to label-encode
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack train on top of test so each category receives one code shared by both frames.
df_all = pd.concat([train_df[label_columns], test_df[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

# Split the stacked frame back by position: the first len(train_df) rows are the
# train rows and everything after them is the test rows. Values are assigned as
# plain arrays so the result does not depend on index alignment (df_all carries
# the original, overlapping row labels of both frames).
n_train = len(train_df)
for col in label_columns:
    train_df[col] = df_all[col].iloc[:n_train].to_numpy()
    test_df[col] = df_all[col].iloc[n_train:].to_numpy()

In [17]:
# List the column names of the training frame.
train_df.columns

Index(['bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [57]:
# Columns of train_df that are fed to the model; commented-out entries are excluded.
used_features = [
    'bant_submit',
    # 'customer_country',  # replaced by response_corporate
    'business_unit',
    'com_reg_ver_win_rate',  # missing ratio of 70% or more
    'customer_idx',
    'customer_type',  # missing ratio of 70% or more
    'enterprise',
    'historical_existing_cnt',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    # 'customer_job',  # dropped for now: more than 256 categories (61 after merging those with 10 or fewer)
    'lead_desc_length',
    'inquiry_type',
    # 'product_category',  # dropped for now: more than 256 categories
    # 'product_subcategory',  # dropped for now: more than 256 categories
    # 'product_modelname',  # dropped for now: more than 256 categories
    # 'customer_country.1',  # replaced by response_corporate
    'customer_position',
    'response_corporate',
    # 'expected_timeline',  # dropped for now: more than 256 categories
    'ver_cus',
    'ver_pro',
    'ver_win_rate_x',
    # 'ver_win_ratio_per_bu',
    'business_area',
    'business_subarea',  # missing ratio of 70% or more
    'lead_owner',
    # 'is_converted'
]

In [37]:
# Number of feature columns currently selected.
len(used_features)

21

In [34]:
# Positions of four label-encoded columns inside used_features. These are positions
# in the raw feature list; the pipeline's column transformer reorders columns, so
# the classifier sees them at different positions.
tuple(
    used_features.index(name)
    for name in ('customer_type', 'inquiry_type', 'customer_position', 'response_corporate')
)

(4, 11, 12, 13)

In [58]:
# Hold out 20% of the labelled rows for validation. The split is stratified on the
# target so both parts keep the same share of converted rows, and seeded so it is
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df[used_features], train_df["is_converted"],
    stratify=train_df['is_converted'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [59]:
# Label-encoded columns the booster should treat as unordered categories.
categorical_columns = [
    'customer_type',
    'inquiry_type',
    'customer_position',
    'response_corporate',
]

# The classifier is fed the output of the pipeline's column transformer, which
# emits the zero-imputed columns first and then the remaining used_features in
# their original order. This list must mirror the columns handed to the
# SimpleImputer in the pipeline definition.
zero_imputed_columns = [
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'historical_existing_cnt',
    'ver_win_rate_x',
]
model_input_columns = zero_imputed_columns + [
    col for col in used_features if col not in zero_imputed_columns
]

# Resolve positions by name so that editing used_features cannot silently shift
# them; a categorical column missing from used_features raises ValueError here.
# With the current feature list this evaluates to [9, 12, 13, 14].
categorical_indices = [model_input_columns.index(col) for col in categorical_columns]

hgb_clf = HistGradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0,
    max_bins=255,
    categorical_features=categorical_indices,
)

In [60]:
# Two-step pipeline: zero-fill five numeric columns, then fit the gradient booster.
# Missing values in the listed columns become 0; every other column is passed
# through unchanged and placed after the imputed ones.
hgb_pipeline = Pipeline(
    steps=[
        (
            "impute_zero",
            make_column_transformer(
                (
                    SimpleImputer(fill_value=0, strategy='constant'),
                    [
                        'id_strategic_ver',
                        'it_strategic_ver',
                        'idit_strategic_ver',
                        'historical_existing_cnt',
                        'ver_win_rate_x',
                    ],
                ),
                remainder='passthrough',
            ),
        ),
        # ("impute_most_frequent", SimpleImputer(strategy='most_frequent')),
        ("hgb_clf", hgb_clf),
    ]
)

In [48]:
# The column transformer only knows its output names once it has been fitted, so
# fit the preprocessing steps on the training features before asking for them.
# The slice shares its step objects with hgb_pipeline; the later full fit simply
# refits them.
hgb_pipeline[:-1].fit(X_train).get_feature_names_out()

array(['simpleimputer__id_strategic_ver',
       'simpleimputer__it_strategic_ver',
       'simpleimputer__idit_strategic_ver',
       'simpleimputer__historical_existing_cnt',
       'simpleimputer__ver_win_rate_x', 'remainder__bant_submit',
       'remainder__business_unit', 'remainder__com_reg_ver_win_rate',
       'remainder__customer_idx', 'remainder__customer_type',
       'remainder__enterprise', 'remainder__lead_desc_length',
       'remainder__inquiry_type', 'remainder__customer_position',
       'remainder__response_corporate', 'remainder__ver_cus',
       'remainder__ver_pro', 'remainder__ver_win_ratio_per_bu',
       'remainder__business_area', 'remainder__business_subarea',
       'remainder__lead_owner'], dtype=object)

In [61]:
# Fit the imputer and the classifier on the training split.
hgb_pipeline.fit(X=X_train, y=y_train)

In [52]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same rows. The default of None is kept
            for signature compatibility but is not a usable value.

    Raises:
        ValueError: If `y_pred` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred is required: pass the predicted labels to evaluate.")

    # labels=[True, False] puts the positive class first, so the printed matrix
    # reads [[TP, FN], [FP, TN]].
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # These three use the default average='binary', which scores the positive
    # class (pos_label=1); a `labels` argument only applies to other averaging
    # modes, so it is not passed.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [62]:
# Predict on the held-out validation rows and print the metric summary.
pred = hgb_pipeline.predict(X_val)
get_clf_eval(y_test=y_val, y_pred=pred)

오차행렬:
 [[  649   321]
 [   68 10822]]

정확도: 0.9672
정밀도: 0.9052
재현율: 0.6691
F1: 0.7694
