In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [155]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import tree

import matplotlib.pyplot as plt

from scipy.stats import randint, uniform

In [156]:
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'
train_df = pd.read_csv(f'{base_path}/train.csv')
test_df = pd.read_csv(f'{base_path}/submission.csv')

In [157]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# 범주형 특성은 모두 공백 제거 후 대문자
for col in label_columns:
    train_df[col] = train_df[col].apply(
        lambda x: x if pd.isna(x) else x.replace(" ", "").replace('-', '').upper()
    )
    test_df[col] = test_df[col].apply(
        lambda x: x if pd.isna(x) else x.replace(" ", "").replace('-', '').upper()
    )
# country 특성의 경우 모두 도시 제거
# 이메일만 남아있는 entry 모두 NaN으로 변경
train_df['customer_country'] = train_df['customer_country'].apply(
    lambda x: x if pd.isna(x) else x.split('/')[-1]).apply(
        lambda x: x if pd.isna(x) else (pd.NA if '@' in x else x)
    )
test_df['customer_country'] = test_df['customer_country'].apply(
    lambda x: x if pd.isna(x) else x.split('/')[-1]).apply(
        lambda x: x if pd.isna(x) else (pd.NA if '@' in x else x)
    )

train_df['customer_country.1'] = train_df['customer_country.1'].apply(
    lambda x: x if pd.isna(x) else x.split('/')[-1]).apply(
        lambda x: x if pd.isna(x) else (pd.NA if '@' in x else x)
    )
test_df['customer_country.1'] = test_df['customer_country.1'].apply(
    lambda x: x if pd.isna(x) else x.split('/')[-1]).apply(
        lambda x: x if pd.isna(x) else (pd.NA if '@' in x else x)
    )

In [158]:
# train set, test set 모두 label encoding

def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [159]:
df_all = pd.concat([train_df[label_columns], test_df[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    train_df[col] = df_all.iloc[: len(train_df)][col]
    test_df[col] = df_all.iloc[len(test_df) :][col]

In [160]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop('is_converted', axis=1),
    train_df["is_converted"],
    test_size=0.2,
    shuffle=True,
    stratify=train_df['is_converted'],
    random_state=42,
)

In [161]:
zero_impute_features = [
    'id_strategic_ver',
    'it_strategic_ver'
]

others_features = [
    'bant_submit',  # 결측치 처리 필요 없음
    'com_reg_ver_win_rate',  # KNNImputer or 0으로 대체
    'customer_idx',  # 나중에 rbf
    'historical_existing_cnt',  # KNNImputer or 0으로 대체
    # 'id_strategic_ver',  # 0으로 채우기
    # 'it_strategic_ver',  # 0으로 채우기
    # 'idit_strategic_ver',
    'lead_desc_length',  # KNNImputer or 'Other'
    'ver_cus',
    'ver_pro',
    'ver_win_rate_x',  # KNNImputer or 0으로 대체
    'ver_win_ratio_per_bu',  # KNNImputer or 0으로 대체
    'lead_owner',  # 나중에 rbf
    "customer_country",  # KNNImputer
    "business_subarea",  # KNNImputer
    "business_area",  # KNNImputer
    "business_unit",
    "customer_type",  # KNNImputer & End customer 합치기
    "enterprise",
    "customer_job",  # KNNImputer or 'other'로 그냥 채우기?
    "inquiry_type",  # KNNImputer & Quotation 합치기
    "product_category",  # KNNImputer or 'etc'
    "product_subcategory",  # KNNImputer or '0'이라는 새로운 카테고리를 만드는 방식
    "product_modelname",  # KNNImputer or '0'이라는 새로운 없음 카테고리
    "customer_country.1",  # KNNImputer or '//'
    "customer_position",
    "response_corporate",
    "expected_timeline",  # KNNImputer or 'etc', discussed... 합치기
]

class DoNothingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.X = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X

zero_impute_pipeline = Pipeline([
    ("simple_imputer", SimpleImputer(strategy='constant', fill_value=0)),
])

preprocessing = ColumnTransformer([
    ("zero_impute_pipeline", zero_impute_pipeline, zero_impute_features),
    ("do_nothing_pipeline", DoNothingTransformer(), others_features)
], remainder='drop')

decision_tree_clf = DecisionTreeClassifier(
    max_depth=38,
    min_samples_split=2,
    min_samples_leaf=1,
    max_leaf_nodes=2040,
)

decision_tree_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    # ("simple_imputer", SimpleImputer(strategy='constant', fill_value=-1)),
    ("knn_imputer", KNNImputer(n_neighbors=3, weights='distance')),
    ("decision_tree", decision_tree_clf)
])

In [162]:
decision_tree_pipeline.fit(X_train, y_train)

In [163]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [164]:
pred = decision_tree_pipeline.predict(X_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  760   210]
 [  202 10688]]

정확도: 0.9653
정밀도: 0.7900
재현율: 0.7835
F1: 0.7867


In [165]:
cv_scores = cross_val_score(decision_tree_pipeline, train_df, train_df['is_converted'],
                cv=5, scoring=make_scorer(f1_score))
print(cv_scores)
print(cv_scores.mean().round(3))

[0.78100775 0.55053599 0.37877095 0.83350254 0.32597793]
0.574


In [166]:
test_pred = decision_tree_pipeline.predict(test_df)
sum(test_pred)

868

In [167]:
decision_tree_clf.get_depth()

26

In [168]:
decision_tree_clf.get_n_leaves()

1434

In [169]:
param_distrib = {
    'knn_imputer__n_neighbors': randint(low=3, high=10),
    'knn_imputer__weights': ['uniform', 'distance'],
    'decision_tree__max_depth': randint(low=20, high=50),
    'decision_tree__max_leaf_nodes': randint(low=1000, high=5000),
}

rnd_search = RandomizedSearchCV(
    decision_tree_pipeline, param_distributions=param_distrib,
    n_iter=10, cv=3, scoring=make_scorer(f1_score),
    verbose=True
)

In [170]:
rnd_search.fit(train_df, train_df['is_converted'])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
rnd_search.best_params_

In [None]:
rnd_search.best_score_

In [None]:
decision_tree_best = rnd_search.best_estimator_

In [None]:
decision_tree_best.fit(train_df, train_df['is_converted'])

In [None]:
cv_scores = cross_val_score(decision_tree_best,
                            train_df, train_df['is_converted'],
                            cv=3, scoring=make_scorer(f1_score))
print(cv_scores)
print(cv_scores.mean().round(3))

In [None]:
X_test = test_df.drop(['is_converted', "id"], axis=1)
test_pred = decision_tree_best.predict(X_test)
sum(test_pred) # True로 예측된 개수