In [243]:
# Mount Google Drive into the Colab filesystem so the files under
# /content/drive can be read by the cells below.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [244]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import tree

import matplotlib.pyplot as plt

from scipy.stats import randint, uniform

In [245]:
# Load the data from Drive: the labelled training set, and submission.csv,
# which is used as the test set throughout this notebook.
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'
train_df = pd.read_csv(f'{base_path}/train.csv')
test_df = pd.read_csv(f'{base_path}/submission.csv')

In [246]:
# Columns that hold categorical (text) values.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]


def _normalize_label(value):
    """Return ``value`` upper-cased with spaces and hyphens removed.

    Missing values are handed back unchanged.
    """
    if pd.isna(value):
        return value
    return value.replace(" ", "").replace('-', '').upper()


# Normalise every categorical column of both frames: strip spaces and
# hyphens, then upper-case, so spelling variants collapse into one category.
for frame in (train_df, test_df):
    for col in label_columns:
        frame[col] = frame[col].apply(_normalize_label)

In [247]:
# Known country names, upper-cased with spaces removed (the same normalisation
# that is applied to the data). Every entry must be followed by a comma:
# adjacent string literals are silently concatenated by Python.
countries = [
    "AFGHANISTAN", "ALBANIA", "ALGERIA", "ANDORRA", "ANGOLA",
    "ANTIGUAANDBARBUDA", "ARGENTINA", "ARMENIA", "AUSTRALIA",
    "AUSTRIA", "AZERBAIJAN", "BAHAMAS", "BAHRAIN", "BANGLADESH",
    "BARBADOS", "BELARUS", "BELGIUM", "BELIZE", "BENIN", "BHUTAN",
    "BOLIVIA", "BOSNIAANDHERZEGOVINA", "BOTSWANA", "BRAZIL",
    "BRUNEI", "BULGARIA", "BURKINAFASO", "BURUNDI", "CABOVERDE",
    "CAMBODIA", "CAMEROON", "CANADA", "CENTRALAFRICANREPUBLIC",
    "CHAD", "CHILE", "CHINA", "COLOMBIA", "COMOROS", "CONGO",
    "COSTARICA", "COTEDIVOIRE", "CROATIA", "CUBA", "CYPRUS",
    "CZECHREPUBLIC", "DEMOCRATICREPUBLICOFTHECONGO", "DENMARK",
    "DJIBOUTI", "DOMINICA", "DOMINICANREPUBLIC", "EASTTIMOR",
    "ECUADOR", "EGYPT", "ELSALVADOR", "EQUATORIALGUINEA",
    "ERITREA", "ESTONIA", "ESWATINI", "ETHIOPIA", "FIJI",
    "FINLAND", "FRANCE", "GABON", "GAMBIA", "GEORGIA",
    "GERMANY", "GHANA", "GREECE", "GRENADA", "GUATEMALA",
    "GUINEA", "GUINEABISSAU", "GUYANA", "HAITI", "HONDURAS",
    "HUNGARY", "ICELAND", "INDIA", "INDONESIA", "IRAN",
    "IRAQ", "IRELAND", "ISRAEL", "ITALY", "JAMAICA",
    "JAPAN", "JORDAN", "KAZAKHSTAN", "KENYA", "KIRIBATI",
    "KOSOVO", "KUWAIT", "KYRGYZSTAN", "LAOS", "LATVIA",
    "LEBANON", "LESOTHO", "LIBERIA", "LIBYA", "LIECHTENSTEIN",
    "LITHUANIA", "LUXEMBOURG", "MADAGASCAR", "MALAWI",
    "MALAYSIA", "MALDIVES", "MALI", "MALTA", "MARSHALLISLANDS",
    "MAURITANIA", "MAURITIUS", "MEXICO", "MICRONESIA",
    "MOLDOVA", "MONACO", "MONGOLIA", "MONTENEGRO", "MOROCCO",
    "MOZAMBIQUE", "MYANMAR", "NAMIBIA", "NAURU", "NEPAL",
    "NETHERLANDS", "NEWZEALAND", "NICARAGUA", "NIGER",
    "NIGERIA", "NORTHKOREA", "NORTHMACEDONIA", "NORWAY",
    "OMAN", "PAKISTAN", "PALAU", "PALESTINE", "PANAMA",
    "PAPUANEWGUINEA", "PARAGUAY", "PERU", "PHILIPPINES",
    "POLAND", "PORTUGAL", "QATAR", "ROMANIA", "RUSSIA",
    "RWANDA", "SAINTKITTSANDNEVIS", "SAINTLUCIA",
    "SAINTVINCENTANDTHEGRENADINES", "SAMOA", "SANMARINO",
    "SAOTOMEANDPRINCIPE", "SAUDIARABIA", "SENEGAL", "SERBIA",
    "SEYCHELLES", "SIERRALEONE", "SINGAPORE", "SLOVAKIA",
    "SLOVENIA", "SOLOMONISLANDS", "SOMALIA", "SOUTHAFRICA",
    "SOUTHKOREA", "SOUTHSUDAN", "SPAIN", "SRILANKA", "SUDAN",
    "SURINAME", "SWEDEN", "SWITZERLAND", "SYRIA", "TAIWAN",
    "TAJIKISTAN", "TANZANIA", "THAILAND", "TOGO", "TONGA",
    "TRINIDADANDTOBAGO", "TUNISIA", "TURKEY", "TURKMENISTAN",
    "TUVALU", "UGANDA", "UKRAINE",
    "UNITEDARABEMIRATES", "UAE",  # abbreviated country name
    "UNITEDKINGDOM",
    "UNITEDSTATES", "USA",  # abbreviated country name
    "URUGUAY", "UZBEKISTAN",
    "VANUATU", "VATICANCITY", "VENEZUELA", "VIETNAM",
    "YEMEN", "ZAMBIA", "ZIMBABWE",
]

# Abbreviated country names and the full name they are reported as, so that
# one country never ends up as two different categories.
_COUNTRY_ALIASES = {
    "USA": "UNITEDSTATES",
    "UAE": "UNITEDARABEMIRATES",
}

# The 50 US state names, normalised like the data.
us_states = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA",
    "COLORADO", "CONNECTICUT", "DELAWARE", "FLORIDA", "GEORGIA",
    "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS",
    "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS",
    "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA",
    "NEBRASKA", "NEVADA", "NEWHAMPSHIRE", "NEWJERSEY", "NEWMEXICO",
    "NEWYORK", "NORTHCAROLINA", "NORTHDAKOTA", "OHIO",
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODEISLAND",
    "SOUTHCAROLINA", "SOUTHDAKOTA", "TENNESSEE", "TEXAS", "UTAH",
    "VERMONT", "VIRGINIA", "WASHINGTON", "WESTVIRGINIA", "WISCONSIN",
    "WYOMING",
]

# Two-letter US state abbreviations.
us_state_abbreviations = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE",
    "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS",
    "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
    "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY",
    "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV",
    "WI", "WY"
]


def filter_country(name):
    """Map a free-text location string to a country name.

    Parameters
    ----------
    name : str
        Location text, expected to be upper-cased with spaces removed (the
        tables it is compared against are written that way).

    Returns
    -------
    str or pandas.NA
        * ``pd.NA`` when the text contains ``'@'`` (an e-mail address).
        * The longest entry of ``countries`` found inside the text, with
          abbreviations expanded (``USA`` -> ``UNITEDSTATES``,
          ``UAE`` -> ``UNITEDARABEMIRATES``).
        * ``'UNITEDSTATES'`` when a US state name is the longest match, when
          a state abbreviation occurs in the text, or when the text contains
          a digit.
        * ``pd.NA`` otherwise.
    """
    # An e-mail address carries no location information.
    if '@' in name:
        return pd.NA

    # Take the LONGEST matching name rather than the first one in list order;
    # otherwise short names shadow longer ones that contain them
    # ("OMAN" in "ROMANIA", "NIGER" in "NIGERIA", "INDIA" in "INDIANA").
    # max() keeps the earliest entry among equally long candidates.
    country_hit = max((c for c in countries if c in name), key=len, default=None)
    state_hit = max((s for s in us_states if s in name), key=len, default=None)

    # On a tie (e.g. "GEORGIA") the country wins, as in the original ordering.
    if country_hit is not None and (
        state_hit is None or len(country_hit) >= len(state_hit)
    ):
        return _COUNTRY_ALIASES.get(country_hit, country_hit)

    # A US state name inside the text means the United States.
    if state_hit is not None:
        return 'UNITEDSTATES'

    # TODO: compare against city names for the top-10 countries

    # A US state abbreviation inside the text means the United States.
    # NOTE(review): this is a plain substring test, so any text that happens
    # to contain one of these letter pairs (e.g. "IN" in "BERLIN") is also
    # classified as the United States.
    for us_state_abbrev in us_state_abbreviations:
        if us_state_abbrev in name:
            return 'UNITEDSTATES'

    # Text containing a digit is assumed to be a US street address.
    if any(char.isdigit() for char in name):
        return 'UNITEDSTATES'

    # Anything else is presumably a city in some other country. Mark it as
    # missing for now (impute it later, or bucket it as 'Other').
    return pd.NA

# Missing values pass through untouched. For everything else keep only the
# text after the last '/', drop '.' characters and resolve the remainder with
# filter_country. The same treatment is applied to both country columns of
# both frames.
for frame in (train_df, test_df):
    for country_col in ('customer_country', 'customer_country.1'):
        frame[country_col] = frame[country_col].apply(
            lambda x: x if pd.isna(x) else filter_country(
                x.split('/')[-1].replace('.', '')
            )
        )


In [248]:
# Label-encode both the train set and the test set

def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical series into integer codes.

    Every element is first cast to ``str``; each distinct string is then
    replaced by its position in the sorted list of distinct strings. Because
    the sort is on strings, numeric input is ordered lexicographically
    (``"10"`` sorts before ``"9"``). The index of the input is preserved.
    """
    as_text = series.astype(str)
    code_of = {
        category: code
        for code, category in enumerate(sorted(as_text.unique()))
    }
    return as_text.map(code_of)

In [249]:
# Encode train and test together so that a given category receives the same
# integer code in both frames. ignore_index gives the stacked frame a unique
# positional index (train rows first, then test rows).
n_train = len(train_df)
df_all = pd.concat(
    [train_df[label_columns], test_df[label_columns]], ignore_index=True
)

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

# Split the encoded values back by position. The boundary is the number of
# TRAIN rows: slicing at len(test_df) only gave the right answer through
# index alignment while the train set was the larger of the two frames.
# .to_numpy() makes the assignment positional, independent of either
# frame's index.
for col in label_columns:
    train_df[col] = df_all[col].iloc[:n_train].to_numpy()
    test_df[col] = df_all[col].iloc[n_train:].to_numpy()

In [250]:
# Hold out 20 % of the labelled rows for validation. The split is shuffled
# with a fixed seed; stratification on the target is currently switched off.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=['is_converted']),
    train_df['is_converted'],
    test_size=0.2,
    shuffle=True,
    # stratify=train_df['is_converted'],
    random_state=42,
)

In [251]:
# Columns whose missing values are filled with 0.
zero_impute_features = [
    'id_strategic_ver',
    'it_strategic_ver'
]

# Columns passed to the model without a dedicated transformation. The order
# of this list is the order in which the columns reach the classifier.
others_features = [
    'bant_submit',  # no missing-value handling needed
    'com_reg_ver_win_rate',  # KNNImputer or replace with -1
    'customer_idx',  # rbf later
    'historical_existing_cnt',  # KNNImputer or replace with -1
    # 'id_strategic_ver',  # fill with 0
    # 'it_strategic_ver',  # fill with 0
    # 'idit_strategic_ver',
    'lead_desc_length',  # KNNImputer or 'Other'
    # 'ver_cus',  # far more 0s than 1s
    # 'ver_pro',  # far more 0s than 1s
    'ver_win_rate_x',  # KNNImputer or replace with -1
    'ver_win_ratio_per_bu',  # KNNImputer or replace with -1
    'lead_owner',  # rbf later
    # "customer_country",  # KNNImputer
    "business_subarea",  # KNNImputer
    "business_area",  # KNNImputer
    "business_unit",
    "customer_type",  # KNNImputer
    "enterprise",
    "customer_job",  # KNNImputer
    "inquiry_type",  # KNNImputer
    "product_category",  # KNNImputer or 'etc'
    "product_subcategory",  # KNNImputer or create a new '-1' category
    "product_modelname",  # KNNImputer or a new '-1' "none" category
    # "customer_country.1",  # KNNImputer or '//'
    "customer_position",
    "response_corporate",
    "expected_timeline",  # KNNImputer or 'etc'; merge the 'discussed...' values; numeric embedding by time span
]

class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer: ``fit`` learns nothing and ``transform`` returns its input as-is."""

    def __init__(self):
        # Placeholder attribute; no method of this class reads it.
        self.X = None

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` exactly as received."""
        return X

# Replace missing values with the constant 0.
zero_impute_pipeline = Pipeline([
    ("simple_imputer", SimpleImputer(strategy='constant', fill_value=0)),
])

# Column selection: zero-impute the first group, pass the second group
# through unchanged, and drop every column that is not listed (this includes
# the target column when a full frame is passed in).
preprocessing = ColumnTransformer([
    ("zero_impute_pipeline", zero_impute_pipeline, zero_impute_features),
    ("do_nothing_pipeline", DoNothingTransformer(), others_features)
], remainder='drop')

# random_state is fixed so that repeated runs of the notebook build the same
# forest and report the same metrics; without it every fit draws different
# bootstrap samples and feature subsets.
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_leaf_nodes=None,
    bootstrap=True,
    max_samples=None,
    class_weight='balanced',
    random_state=42,
)

# Full model: column selection, then a catch-all imputer that fills whatever
# is still missing with 0, then the forest.
random_forest_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("simple_imputer", SimpleImputer(strategy='constant', fill_value=0)),
    # ("knn_imputer", KNNImputer(n_neighbors=5)),
    ("random_forest", random_forest_clf)
])

In [252]:
# Fit the preprocessing steps and the forest on the training split only.
random_forest_pipeline.fit(X_train, y_train)

In [253]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    The confusion matrix is laid out with the ``True`` class first, so its
    first row holds the actual positives. Nothing is returned.
    """
    positive_first = [True, False]
    confusion = confusion_matrix(y_test, y_pred, labels=positive_first)

    # (output template, value) pairs in the order they are printed.
    report = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=positive_first)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=positive_first)),
    )

    print("오차행렬:\n", confusion)
    for template, value in report:
        print(template.format(value))

In [254]:
# Score the fitted pipeline on the held-out validation split.
pred = random_forest_pipeline.predict(X_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  691   294]
 [   27 10848]]

정확도: 0.9729
정밀도: 0.9624
재현율: 0.7015
F1: 0.8115


In [255]:
# Predict the test set with the fitted pipeline.
test_pred = random_forest_pipeline.predict(test_df)
sum(test_pred) # number of rows predicted True

144

In [137]:
# Importance of each input column of the fitted forest.
# NOTE(review): the stored output below holds 25 values, while the current
# feature lists select 23 columns (2 zero-imputed + 21 others), and this
# cell's execution count is lower than the cells above. The output looks
# stale; re-run before relying on it.
random_forest_clf.feature_importances_

array([1.81766463e-03, 3.69205840e-04, 1.95577524e-02, 3.06446650e-02,
       3.81382405e-01, 2.42978603e-02, 8.07002986e-02, 1.74126078e-02,
       1.49601359e-02, 1.16057386e-01, 2.26936842e-02, 1.94382483e-02,
       1.57686339e-02, 1.46386982e-02, 3.69459409e-02, 1.07594716e-02,
       2.83434953e-02, 9.47590738e-03, 3.15016146e-02, 1.64178605e-02,
       1.37342598e-02, 2.36190642e-02, 2.21371130e-02, 3.36350707e-02,
       1.36909561e-02])

In [91]:
# Per-row class probabilities on the validation split, one column per class.
# NOTE(review): column order presumably follows the classifier's classes_
# attribute; confirm before reading a column as "probability of True".
random_forest_pipeline.predict_proba(X_val)

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       ...,
       [0.98, 0.02],
       [0.94, 0.06],
       [1.  , 0.  ]])

In [70]:
# Position of the least important input column.
# NOTE(review): this cell's execution count is lower than the cells above,
# so the stored result may not match the current model; re-run to confirm.
np.argmin(random_forest_clf.feature_importances_)

1

In [71]:
# 3-fold cross-validated F1 of the whole pipeline on all labelled rows.
# train_df still contains the 'is_converted' column, but the pipeline's
# ColumnTransformer is built with remainder='drop', so only the listed
# feature columns reach the model.
cv_scores = cross_val_score(random_forest_pipeline,
               train_df, train_df['is_converted'],
               cv=3,
               scoring=make_scorer(f1_score))
# Per-fold scores, then their mean rounded to three decimals.
print(cv_scores)
print(cv_scores.mean().round(3))

[0.80776141 0.63931223 0.59547202]
0.681


In [72]:
# Search space for the forest's hyper-parameters. Keys use the
# "<pipeline step>__<parameter>" form; scipy's randint draws integers from
# low (inclusive) to high (exclusive).
param_distrib = {
    'random_forest__n_estimators': randint(low=80, high=1000),
    'random_forest__max_depth': randint(low=10, high=50),
    'random_forest__min_samples_split': randint(low=2, high=10),
    'random_forest__min_samples_leaf': randint(low=1, high=10),
}

# Randomised search scored on recall with 3-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so the search
# can be repeated and its results compared between runs. n_iter is left at
# its default.
rnd_search = RandomizedSearchCV(
    random_forest_pipeline,
    param_distributions=param_distrib,
    cv=3, scoring=make_scorer(recall_score),
    random_state=42,
)

In [73]:
# Run the search on all labelled rows. The full frame is passed as X; the
# pipeline's column selection decides which columns are actually used.
rnd_search.fit(train_df, train_df['is_converted'])

In [74]:
# Parameter combination with the best mean cross-validated score.
rnd_search.best_params_

{'random_forest__max_depth': 37,
 'random_forest__min_samples_leaf': 1,
 'random_forest__min_samples_split': 4,
 'random_forest__n_estimators': 685}

In [76]:
# Tabulate the search results, best mean test score first, and show the
# ten best candidates.
cv_res = pd.DataFrame(rnd_search.cv_results_).sort_values(
    by='mean_test_score', ascending=False
)
cv_res.head(n=10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_depth,param_random_forest__min_samples_leaf,param_random_forest__min_samples_split,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,34.773673,1.717311,1.579155,0.040066,37,1,4,685,"{'random_forest__max_depth': 37, 'random_fores...",0.695114,0.51577,0.439356,0.55008,0.107194,1
6,23.115147,0.828032,1.078265,0.030831,26,2,2,457,"{'random_forest__max_depth': 26, 'random_fores...",0.692641,0.496599,0.423886,0.537708,0.113504,2
3,5.640709,0.368017,0.277413,0.005138,17,2,4,116,"{'random_forest__max_depth': 17, 'random_fores...",0.689549,0.499691,0.41646,0.535233,0.114285,3
1,37.520509,1.799028,1.673981,0.045318,40,3,6,751,"{'random_forest__max_depth': 40, 'random_fores...",0.690167,0.480519,0.413985,0.528224,0.117689,4
7,26.941035,1.003504,1.231549,0.040717,29,3,6,549,"{'random_forest__max_depth': 29, 'random_fores...",0.690167,0.478664,0.41151,0.52678,0.11874,5
4,6.85246,0.792564,0.314274,0.007843,41,4,5,137,"{'random_forest__max_depth': 41, 'random_fores...",0.68893,0.466914,0.39604,0.517295,0.124766,6
2,24.881797,0.868468,1.123485,0.036203,13,4,5,551,"{'random_forest__max_depth': 13, 'random_fores...",0.682746,0.46073,0.393564,0.512347,0.123571,7
8,29.123891,0.94132,1.517219,0.189145,20,7,2,624,"{'random_forest__max_depth': 20, 'random_fores...",0.681509,0.456401,0.391089,0.509666,0.124402,8
0,34.352292,1.866115,1.498499,0.045884,43,6,2,689,"{'random_forest__max_depth': 43, 'random_fores...",0.683364,0.456401,0.387995,0.509253,0.126243,9
5,40.378981,1.615317,1.877681,0.0482,37,8,9,868,"{'random_forest__max_depth': 37, 'random_fores...",0.680891,0.453927,0.388614,0.50781,0.125257,10
