### 필수 라이브러리

In [109]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [110]:
# train.csv      -> training data
# submission.csv -> test data (the rows of the submission file)
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "submission.csv")
)

In [111]:
def _group_mode(frame, group_col, target_col):
    """Return, for every row, the most frequent `target_col` value of its `group_col` group.

    Args:
        frame: DataFrame holding both columns.
        group_col: Name of the column that defines the groups.
        target_col: Name of the column whose per-group mode is computed.

    Returns:
        The result of ``groupby(group_col)[target_col].transform(...)``: one value
        per row. Groups whose mode is empty yield ``None``.
    """

    def _first_mode(values):
        # Compute the mode once; take the first entry when several values tie.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    return frame.groupby(group_col)[target_col].transform(_first_mode)


# (grouping column, column to fill) pairs, applied in this order. A later pair
# only affects values that the earlier pairs left missing, because fillna never
# overwrites existing values.
#
# Example for ("lead_owner", "customer_country"):
#   before                              after
#      lead_owner  customer_country        lead_owner  customer_country
#   0      owner1               USA     0      owner1               USA
#   1      owner2               NaN     1      owner2               NaN
#   2      owner1               NaN     2      owner1               USA
#   3      owner2               NaN     3      owner2               NaN
# owner2 has no known country at all, so its rows stay missing.
fill_plan = [
    ("lead_owner", "customer_country"),
    ("lead_owner", "customer_country.1"),
    ("customer_position", "inquiry_type"),
    ("customer_type", "inquiry_type"),
]

for group_col, target_col in fill_plan:
    mode_fill = _group_mode(df_train, group_col, target_col)
    df_train[target_col] = df_train[target_col].fillna(mode_fill)

# Report how many values are still missing after the group-mode fill.
for label, column in [
    ("Number of missing values in customer_country:", "customer_country"),
    ("Number of missing values in customer_country.1:", "customer_country.1"),
    ("Number of missing values in inquiry_type: ", "inquiry_type"),
]:
    missing_values_count = df_train[column].isnull().sum()
    print(label, missing_values_count)


Number of missing values in customer_country: 6
Number of missing values in customer_country.1: 6
Number of missing values in inquiry_type:  8


### 레이블 인코딩

In [112]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical Series into integer codes.

    Every element is first cast to ``str``; the distinct strings are then sorted
    and numbered 0, 1, 2, ... in that order.

    Args:
        series: Values to encode.

    Returns:
        A Series in which each element is replaced by the code of its string form.
    """
    as_text = series.astype(str)

    # Sorted distinct string -> its position in the sorted order
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}

    return as_text.map(code_of)

In [113]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

print(df_train.isna().sum()/len(df_train))

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

bant_submit                0.000000
customer_country           0.000101
business_unit              0.000000
com_reg_ver_win_rate       0.754330
customer_idx               0.000000
customer_type              0.741345
enterprise                 0.000000
historical_existing_cnt    0.768023
id_strategic_ver           0.941921
it_strategic_ver           0.981096
idit_strategic_ver         0.923017
customer_job               0.315908
lead_desc_length           0.000000
inquiry_type               0.000135
product_category           0.326717
product_subcategory        0.844264
product_modelname          0.844365
customer_country.1         0.000101
customer_position          0.000000
response_corporate         0.000000
expected_timeline          0.520464
ver_cus                    0.000000
ver_pro                    0.000000
ver_win_rate_x             0.689421
ver_win_ratio_per_bu       0.741918
business_area              0.689421
business_subarea           0.906811
lead_owner                 0

### 학습, 검증 데이터 분리

In [114]:
# df_all was built as [train rows, test rows], so split it back at len(df_train)
n_train_rows = len(df_train)
encoded_train_part = df_all.iloc[:n_train_rows]
encoded_test_part = df_all.iloc[n_train_rows:]

for col in label_columns:
    df_train[col] = encoded_train_part[col]
    df_test[col] = encoded_test_part[col]


In [115]:
# NOTE(review): this is a preview only. The mean-filled frame is NOT written
# back to df_train, so the split and model cells below still see the original
# NaNs (they call .fillna(0) themselves). Assign it to df_train only if
# mean imputation is really what the model should be trained on.
# numeric_only=True keeps mean() from raising if a non-numeric column is present.
df_train_mean_filled = df_train.fillna(df_train.mean(numeric_only=True))
df_train_mean_filled

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,9070,0,0.066667,32160,10,0,19.912184,1.0,1.0,...,33,246,1,0,0.003079,0.026846,0,28,0,True
1,1.00,8406,0,0.066667,23122,10,0,12.000000,1.0,1.0,...,33,246,1,0,0.003079,0.026846,0,0,1,True
2,1.00,6535,0,0.088889,1755,10,0,144.000000,1.0,1.0,...,21,246,1,0,0.003079,0.026846,0,17,2,True
3,1.00,3388,0,0.088889,4919,10,0,19.912184,1.0,1.0,...,21,246,1,0,0.003079,0.026846,0,44,3,True
4,1.00,5799,0,0.088889,17126,29,0,19.912184,1.0,1.0,...,21,246,0,0,0.003079,0.026846,0,86,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,10287,0,0.091685,33747,9,1,19.912184,1.0,1.0,...,34,8,0,0,0.000026,0.028777,8,62,694,False
59295,0.75,3490,0,0.040000,35420,29,0,19.912184,1.0,1.0,...,7,15,0,0,0.000026,0.028777,8,86,39,False
59296,0.75,8799,0,0.040000,19249,29,0,19.912184,1.0,1.0,...,35,246,0,0,0.000026,0.028777,8,86,125,False
59297,1.00,12795,0,0.040000,40327,33,0,19.912184,1.0,1.0,...,35,267,0,0,0.000026,0.028777,8,86,134,False


In [116]:
# Separate the target column from the features, then hold out 20% for validation
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=400
)

### 모델 정의 

In [117]:
# Baseline classifier with library-default hyperparameters; the seed is fixed for repeatable fits
model = RandomForestClassifier(
    random_state=2024,
)

In [118]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of binary predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels. Required; the ``None`` default is kept only so
            the signature stays unchanged.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred must be provided")

    # Rows and columns are ordered True, then False.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # `labels` only applies when average != 'binary'. These calls use the
    # default binary averaging, so it is omitted to match recall_score.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))


In [119]:
# Replace any NaN in the training features with 0, then fit the baseline model
x_train_filled = x_train.fillna(0)
model.fit(x_train_filled, y_train)

In [120]:
# Score the baseline model on the validation split (NaN -> 0, same as for training)
x_val_filled = x_val.fillna(0)
pred = model.predict(x_val_filled)
get_clf_eval(y_val, pred)

오차행렬:
 [[  668   279]
 [   25 10888]]

정확도: 0.9744
정밀도: 0.9639
재현율: 0.7054
F1: 0.8146


### RandomizedSearchCV를 이용한 하이퍼파라미터 튜닝

In [123]:
# Search space for the random forest hyperparameters
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

rf_clf = RandomForestClassifier(random_state=2024)
# RandomizedSearchCV samples a subset of the combinations above (n_iter is left
# at its default). random_state pins which ones are drawn, so a rerun evaluates
# the same candidates. Candidates are ranked by recall with 3-fold CV.
grid_cv = RandomizedSearchCV(
    rf_clf,
    params,
    cv=3,
    scoring='recall',
    random_state=2024,
)
grid_cv.fit(x_train.fillna(0), y_train)

# Best candidate found by the search; `pred` now holds its validation
# predictions and replaces the baseline predictions.
estimator = grid_cv.best_estimator_
pred = estimator.predict(x_val.fillna(0))

In [122]:
# Print validation metrics for the predictions currently stored in `pred`
get_clf_eval(y_test=y_val, y_pred=pred)

오차행렬:
 [[  532   415]
 [   17 10896]]

정확도: 0.9636
정밀도: 0.9690
재현율: 0.5618
F1: 0.7112


### 테스트 데이터 예측