In [85]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

from scipy.stats import randint, uniform

In [87]:
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'
train_df = pd.read_csv(f'{base_path}/train.csv')
test_df = pd.read_csv(f'{base_path}/submission.csv')

In [88]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# 범주형 특성은 모두 공백 제거 후 대문자
for col in label_columns:
    train_df[col] = train_df[col].apply(
        lambda x: x if pd.isna(x) else x.replace(" ", "").replace('-', '').upper()
    )
    test_df[col] = test_df[col].apply(
        lambda x: x if pd.isna(x) else x.replace(" ", "").replace('-', '').upper()
    )

In [89]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [90]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([train_df[label_columns], test_df[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    train_df[col] = df_all.iloc[: len(train_df)][col]
    test_df[col] = df_all.iloc[len(test_df) :][col]

In [91]:
smote = SMOTE(random_state=42)

In [92]:
X_res, y_res = smote.fit_resample(
    train_df.drop('is_converted', axis=1).fillna(0),
    train_df['is_converted']
)
train_df_over_sampled = pd.concat([X_res, y_res], axis=1)

In [94]:
train_df_over_sampled

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.000000,6787,0,0.066667,32160,9,0,0.000000,0.0,0.0,...,33,246,1,0,0.003079,0.026846,0,28,0,True
1,1.000000,6421,0,0.066667,23122,9,0,12.000000,0.0,0.0,...,33,246,1,0,0.003079,0.026846,0,0,1,True
2,1.000000,4421,0,0.088889,1755,9,0,144.000000,0.0,0.0,...,21,246,1,0,0.003079,0.026846,0,16,2,True
3,1.000000,1431,0,0.088889,4919,9,0,0.000000,0.0,0.0,...,21,246,1,0,0.003079,0.026846,0,47,3,True
4,1.000000,3741,0,0.088889,17126,25,0,0.000000,0.0,0.0,...,21,246,0,0,0.003079,0.026846,0,86,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108893,0.837176,3954,0,0.026584,20504,9,0,0.000000,0.0,0.0,...,34,377,0,0,0.002006,0.017484,4,48,5,True
108894,0.420318,124,0,0.000000,25096,29,0,0.000000,0.0,0.0,...,21,448,0,0,0.000000,0.000000,12,86,631,True
108895,0.536280,6598,2,0.000000,25096,29,0,0.000000,0.0,0.0,...,34,246,0,0,0.000000,0.000000,12,86,117,True
108896,0.859582,8570,2,0.022661,39945,23,0,0.280836,0.0,0.0,...,27,302,0,0,0.000153,0.017993,11,78,197,True


In [96]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df_over_sampled.drop('is_converted', axis=1),
    train_df_over_sampled["is_converted"],
    test_size=0.2,
    shuffle=True,
    stratify=train_df_over_sampled['is_converted'],
    random_state=42,
)

In [97]:
ada_boost_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=24),
    n_estimators=332,
    learning_rate=6.421300001290953,
)

In [98]:
ada_boost_clf.fit(X_train.fillna(0), y_train)

In [42]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [99]:
pred = ada_boost_clf.predict(X_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[10683   207]
 [  536 10354]]

정확도: 0.9659
정밀도: 0.9522
재현율: 0.9810
F1: 0.9664


In [50]:
param_distrib = {
    'estimator__max_depth': randint(low=1, high=50),
    'n_estimators': randint(low=50, high=500),
    'learning_rate': uniform(0.01, 10)
}

rnd_search = RandomizedSearchCV(
    ada_boost_clf,
    param_distributions=param_distrib,
    cv=StratifiedKFold(n_splits=3, shuffle=True),
    scoring=make_scorer(f1_score),
    verbose=True
)

아래 랜덤서치는 최소 1시간 생각해야함

In [52]:
rnd_search.fit(
    train_df.drop('is_converted', axis=1).fillna(0),
    train_df["is_converted"]
)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [53]:
cv_res = pd.DataFrame(rnd_search.cv_results_ )
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)

In [64]:
cv_res[['params', 'mean_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,split0_test_score,split1_test_score,split2_test_score
7,"{'estimator__max_depth': 31, 'learning_rate': ...",0.811141,0.814037,0.810411,0.808974
4,"{'estimator__max_depth': 39, 'learning_rate': ...",0.809722,0.80923,0.808483,0.811454
3,"{'estimator__max_depth': 34, 'learning_rate': ...",0.808579,0.810423,0.808757,0.806555
9,"{'estimator__max_depth': 24, 'learning_rate': ...",0.807901,0.80968,0.806744,0.80728
1,"{'estimator__max_depth': 35, 'learning_rate': ...",0.807486,0.80424,0.812661,0.805556
6,"{'estimator__max_depth': 46, 'learning_rate': ...",0.807418,0.80372,0.811003,0.80753
2,"{'estimator__max_depth': 20, 'learning_rate': ...",0.496136,0.53008,0.20393,0.754398
5,"{'estimator__max_depth': 6, 'learning_rate': 7...",0.159878,0.113208,0.155201,0.211224
0,"{'estimator__max_depth': 9, 'learning_rate': 3...",0.141335,0.167217,0.026748,0.230042
8,"{'estimator__max_depth': 1, 'learning_rate': 9...",0.002873,0.004914,0.0,0.003706


In [65]:
cv_res.iloc[3]['params']

{'estimator__max_depth': 24,
 'learning_rate': 6.421300001290953,
 'n_estimators': 332}

In [55]:
rnd_search.best_params_

{'estimator__max_depth': 31,
 'learning_rate': 4.7030031488541795,
 'n_estimators': 301}

In [60]:
best_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=31),
    learning_rate=4.7030031488541795,
    n_estimators=301
)

In [61]:
best_model.fit(X_train.fillna(0), y_train)

In [62]:
get_clf_eval(y_val, best_model.predict(X_val.fillna(0)))

오차행렬:
 [[  760   210]
 [  123 10767]]

정확도: 0.9719
정밀도: 0.8607
재현율: 0.7835
F1: 0.8203


In [101]:
# 예측에 필요한 데이터 분리
X_test = test_df.drop(["is_converted", "id"], axis=1)

In [102]:
test_pred = ada_boost_clf.predict(X_test.fillna(0))
sum(test_pred) # True로 예측된 개수

1270