# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    make_scorer,
)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import KFold
from collections import Counter
from catboost import Pool,CatBoostClassifier

### 데이터 셋 읽어오기

In [3]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

In [5]:
# 바꿔줄 단어들을 리스트로 나열
End = ['End-user', 'End-Customer', 'Commercial end-user']
Specifier = ['Specifier / Influencer']
Solution  = ['Meeting Solution', 'Software / Solution Provider', 'Software/Solution Provider']
Service = ['Authorized Service Center', 'Authorized Service Dealer']
Other = ['Other', 'Others', 'Etc.']

In [6]:
df_train.loc[df_train['customer_type'].isin(End), 'customer_type'] = 'End Customer'
df_train.loc[df_train['customer_type'].isin(Specifier), 'customer_type'] = 'Specifier/ Influencer'
df_train.loc[df_train['customer_type'].isin(Solution), 'customer_type'] = 'Solution Eco-Partner'
df_train.loc[df_train['customer_type'].isin(Service), 'customer_type'] = 'Service Partner'
df_train.loc[df_train['customer_type'].isin(Other), 'customer_type'] = 'Other'

In [7]:
## df_train에 적용된 코드와 동일한 처리를 df_test에도 적용
df_test.loc[df_test['customer_type'].isin(End), 'customer_type'] = 'End Customer'
df_test.loc[df_test['customer_type'].isin(Specifier), 'customer_type'] = 'Specifier/ Influencer'
df_test.loc[df_test['customer_type'].isin(Solution), 'customer_type'] = 'Solution Eco-Partner'
df_test.loc[df_test['customer_type'].isin(Service), 'customer_type'] = 'Service Partner'
df_test.loc[df_test['customer_type'].isin(Other), 'customer_type'] = 'Other'

In [8]:
# 결과확인
df_train['customer_type'].value_counts()

customer_type
End Customer             10652
Specifier/ Influencer     2568
Channel Partner           1368
Service Partner            349
Solution Eco-Partner       154
Installer/Contractor        52
Corporate                   31
HVAC Engineer               23
Other                       20
Engineer                    20
Developer                   17
Technician                  16
Consultant                  15
Home Owner                  10
Manager / Director           8
Homeowner                    5
Installer                    5
Architect/Consultant         5
Reseller                     5
Interior Designer            5
Distributor                  4
System Integrator            2
Dealer/Distributor           2
Technical Assistant          1
Administrator                1
Name: count, dtype: int64

In [9]:
def assign_id_strategic_ver(row):
    if row['business_unit'] == 'ID' and row['business_area'] in ['corporate / office', 'retail', 'hotel & accommodation', 'education']:
        return 1
    else:
        return 0

df_train['id_strategic_ver'] = df_train.apply(assign_id_strategic_ver, axis=1)
df_test['id_strategic_ver'] = df_test.apply(assign_id_strategic_ver, axis=1)

In [10]:
def assign_it_strategic_ver(row):
    if row['business_unit'] == 'IT' and row['business_area'] in ['corporate / office', 'retail', 'hotel & accommodation', 'education']:
        return 1
    else:
        return 0

df_train['it_strategic_ver'] = df_train.apply(assign_it_strategic_ver, axis=1)
df_test['it_strategic_ver'] = df_test.apply(assign_it_strategic_ver, axis=1)

In [11]:
# 'id_strategic_ver'나 'it_strategic_ver' 중 하나가 1인 경우에는 'idit_strategic_ver'에 1을 할당
df_train['idit_strategic_ver'] = df_train.apply(lambda row: 1 if row['id_strategic_ver'] == 1 or row['it_strategic_ver'] == 1 else 0, axis=1)

# 'id_strategic_ver'나 'it_strategic_ver' 중 하나가 1인 경우에는 'idit_strategic_ver'에 1을 할당
df_test['idit_strategic_ver'] = df_test.apply(lambda row: 1 if row['id_strategic_ver'] == 1 or row['it_strategic_ver'] == 1 else 0, axis=1)


In [12]:
### customer_job과 customer_position 의 값을 배열과 비교하여 변경하기 ###
def search_func_pos(df):
    # 대소문자를 구분하지 않기 위해 소문자로 변환
    df['customer_position'] = df['customer_position'].str.lower()
    df['customer_job'] = df['customer_job'].str.lower()

    # job function과 job seniority 배열 소문자 변환하여 비교
    job_function = ["3d/vfx art", "accounting", "administrative", "arts and design", "business development", "clinical specialist", "community and social services",
                    "consulting", "education", "engineering", "entrepreneurship", "film production", "finance", "graphic/color art", "healthcare services",
                    "human resources", "information technology", "legal", "marketing", "media and communication", "medical imaging specialist", "medical solution",
                    "military and protective services", "operations", "pathologist", "product management", "program and project management",
                    "purchasing", "quality assurance", "radiology professional", "real estate", "research", "sales", "support", "surgery professional", "others", "other" ]

    job_seniority = ["ceo/founder", "partner", "c-level executive", "vice president", "director", "manager",
                     "associate/analyst", "entry level", "trainee", "intern","others"]

    # customer_position과 customer_job을 비교하여 조건에 맞는 행을 선택합니다.
    condition = (df['customer_position'].isin(job_function)) & (df['customer_job'].isin(job_seniority))
    condition_T = df[condition]
    # DataFrame으로 변환
    condition_T = pd.DataFrame(condition_T, columns=['customer_position', 'customer_job'])
    print(condition_T)

    # 조건에 맞는 행의 customer_position과 customer_job 값을 서로 변경합니다.
    df.loc[condition, ['customer_position', 'customer_job']] = df.loc[condition, ['customer_job', 'customer_position']].values

    return df

# 함수를 호출하여 데이터프레임을 변경합니다.
df_train = search_func_pos(df_train)
df_test = search_func_pos(df_test)

            customer_position       customer_job
14048             engineering  associate/analyst
19019                  others             others
19021                  others             others
19124                  others             others
19129                  others             others
...                       ...                ...
58189    surgery professional            manager
58190             pathologist            manager
58196                research             intern
58245  radiology professional            manager
58532                  others             others

[496 rows x 2 columns]
     customer_position customer_job
10              others       others
49              others       others
55              others       others
63              others       others
68              others       others
...                ...          ...
5131            others       others
5209            others       others
5238            others       others
5242            others      

In [13]:
### job function에 해당하지 않는 컬럼 확인 ###
# 37개
job_function = ["3d/vfx art", "accounting", "administrative", "arts and design", "business development", "clinical specialist", "community and social services",
                "consulting", "education", "engineering", "entrepreneurship", "film production", "finance", "graphic/color art", "healthcare services",
                "human resources", "information technology", "legal", "marketing", "media and communication", "medical imaging specialist", "medical solution",
                "military and protective services", "operations", "pathologist", "product management", "program and project management",
                "purchasing", "quality assurance", "radiology professional", "real estate", "research", "sales", "support", "surgery professional", "others", "other" ]

# customer_job 컬럼에서 job_function에 해당하는 카테고리를 필터링
function_job = set(job_function)
matching_job = set(df_train['customer_job']).intersection(function_job)

# job_function에 해당하지 않는 나머지 카테고리들을 출력
other_job = set(df_train['customer_job']) - matching_job

In [14]:
# 각 카테고리에 대한 매핑 딕셔너리 생성
three_d_vfx_art = ["3d/vfx art"]

accounting = ["accounting", "accounts payable", "account exec/manager", "account management"]

administrative = ["administrative", "platform administrator", "imaging administrator", "pacs administrator", "systems administrator", "facility administrator", "network administrator", "admin", "admin assistant", "administración", "administration", "administrative assistant", "adminisztráció", "amministrativo"]

arts_and_design = ["arts and design", "support/facilitator, designer", "design and provide equipment", "design/build", "design/decision maker", "design/install/training/support", "kreation und design", "kreation_und_design", "lead designer", "művészet_és_design", "interior designer", "art and design", "art installation", "arte y diseño", "arte_e_design", "artist, lead on equipment selection", "arts_and_design", "design", "designer", "designer, creative technologist", "designer, producer", "designer/installer", "designere / budget", "designers"]

business_development = ["business development", "sourcing & quoting for end user", "distributor quotation", "developer/property", "curation", "quotation curator", "quote gathering/proposer to owner", "quoting project", "business_development", "development coordinator/procurement"]

clinical_specialist = ["clinical specialist", "clinic", "mental health"]

community_and_social_services = ["community and social services", "community_and_social_services"]

consulting = ["consulting", "technical advisor, reseller", "consultent", "consultant", "consultant / purchaser", "consultant,cabinet fabricator", "strategic communications", "strategy & operations specialist", "solution advisor", "solutions architect", "technology consultant"]

education = ["education", "teacher", "teaching", "educator", "k12 school", "higher education (college & university)", "institute & academy", "instructor"]

engineering = ["engineering", "tech", "technical", "electrical contractor", "implement", "senior design engineer", "solution engineer", "system engineer", "systems engineer", "lead engineer", "electronics & telco", "engineer", "engineering & technical", "engineering & technical executive", "engineering director", "engineering, design, and install", "system designer, integrator", "systems design", "systems designer", "principal engineer", "hardware", "hardware design engineer", "hardware selection", "chief eng.", "chief engineer", "chief of engineering", "design engineer", "director of engineering"]

entrepreneurship = ["business owner", "ceo", "director comercial", "head", "engagement executive", "execution", "executive", "owner","owner representation", "owning company", "ownner-marketing director", "ceo/founder", "lead", "organizer", "leader", "vice president", "vp/gm", "underboss", "the big boss"]

film_production = ["film production", "home theater", "community theater"]

finance = ["finance", "finanzas", "finanzen", "pénzügy", "finance executive"]

graphic_color_art = ["graphic/color art", "colorist", "gc", "graphic design"]

healthcare_services = ["healthcare services", "healthcare professionals", "healthcare_services", "healthcare"]

human_resources = ["human resources", "hr posting", "hr", "human_resources"]

information_technology = ["information technology", "software developer", "emerging technology / innovation", "informatics, touch capability", "information technology\u200b", "information_technology", "infrastructure", "it", "it - information technology", "it admin", "it administrator", "it dairector", "it department", "it director", "it hardware technician", "it integrator", "it manager", "it project lead", "it specialist", "it support", "it tech.", "it/software", "application development", "cloud / mobility", "collaboration & web apps", "computing & it"],

legal = ["legal"]

marketing = ["marketing", "technical marketing", "advertising", "product marketing", "advertising and promotions team", "event marketing", "field marketing", "marketing coordinator", "marketing executive", "marketing operations"]

media_and_communication = ["media and communication", "broadcasting & media", "media and communications", "media_and_communication", "media_e_comunicazione", "medien_und_kommunikation", "medios_de_comunicación", "média_és_kommunikáció"]

medical_imaging_specialist = ["medical imaging specialist", "medical imaging  specialist", "spécialiste_en_imagerie_médicale"]

medical_solution = ["medical solution", "doctor", "tierarzt", "medical solution  provider", "medical solution provider", "medical solution provider\u200b"]

military_and_protective_services = ["military and protective services", "military_and_protective_services"]

operations = ["coo", "director of operations","regional director of operations", "operations executive", "operations manager", "operations", "facilities and operations", "üzemeltetés"]

pathologist = ["pathologist"]

product_management = ["product management", "product_management"]

program_and_project_management = ["program and project management", "av project manager", "signage subcontractor p/m", "general manager - project manager", "digital project manager", "program directors", "gestión_de_proyectos", "program-_és_projektmenedzsment", "program_and_project_management", "program_and_project_manager", "programm- und projektmanagement", "programm-_und_projektmanagement", "project administrator", "project coordinator", "project director", "project facilitator", "project head", "project lead", "project manage", "project manager", "project manager / estimator", "project manager / principal", "project manager/designer", "project researcher", "project sales/manage", "projection manager", "projectr mgmt", "projektmenedzsment\tprogram and project management", "planner", "planner/purchaser", "planning and installation", "pm", "a/v project manager", "project manager", "owner / project manager", "producer/project manager"]

purchasing = ["purchasing", "buyer", "buyer, coordinating", "obtain quotes, process purchase", "requirements and buyer", "ordering manager", "requisition", "purchase", "purchase and install", "purchase dept", "purchaser", "purchaser, it and installer", "purchasers", "purchasing agent", "purchasing authority", "purchasing coordinator", "purchasing director", "purchasing manager", "purchasing supervisor", "purchsing", "director purchaser", "drop, purchase maxhub", "public bidder", "bidder"]

quality_assurance = ["quality assurance", "quality_assurance"]

radiology_professional = ["radiology professional", "profesional de radiología", "radiology  professional", "radiology_professional"]

real_estate = ["real estate", "building owner", "property owner"]

research = ["research", "associate/analyst", "r&d project manager", "research & development", "research and developement", "research products and prices", "research/install", "product research", "product researcher"]

sales = ["sales", "asking for quote for client", "field / outside sales", "sourcing / procurement", "sourcing/procurement", "reseller/integrator", "procurement", "procurement specialist", "procurment", "revendedor", "car dealership", "vendor / reseller", "vendite", "értékesítés", "technical sales", "reseller", "sale", "sales engineering", "sales executive", "sales manager", "sales operations", "sales rep", "salesman"]

support = ["support", "help desk / desktop services", "helpdesk specialist", "post install support and service", "supplier and installation"]

surgery_professional = ["surgery professional", "profesional de cirugía", "surgery professional\u200b"]


In [15]:
df_train.loc[df_train['customer_job'].isin(three_d_vfx_art), 'customer_job'] = 'three_d_vfx_art'
df_train.loc[df_train['customer_job'].isin(accounting), 'customer_job'] = 'accounting'
df_train.loc[df_train['customer_job'].isin(administrative), 'customer_job'] = 'administrative'
df_train.loc[df_train['customer_job'].isin(arts_and_design), 'customer_job'] = 'arts_and_design'
df_train.loc[df_train['customer_job'].isin(business_development), 'customer_job'] = 'business_development'
df_train.loc[df_train['customer_job'].isin(clinical_specialist), 'customer_job'] = 'clinical_specialist'
df_train.loc[df_train['customer_job'].isin(community_and_social_services), 'customer_job'] = 'community_and_social_services'
df_train.loc[df_train['customer_job'].isin(consulting), 'customer_job'] = 'consulting'
df_train.loc[df_train['customer_job'].isin(education), 'customer_job'] = 'education'
df_train.loc[df_train['customer_job'].isin(engineering), 'customer_job'] = 'engineering'
df_train.loc[df_train['customer_job'].isin(entrepreneurship), 'customer_job'] = 'entrepreneurship'
df_train.loc[df_train['customer_job'].isin(film_production), 'customer_job'] = 'film_production'
df_train.loc[df_train['customer_job'].isin(finance), 'customer_job'] = 'finance'
df_train.loc[df_train['customer_job'].isin(film_production), 'customer_job'] = 'film_production'
df_train.loc[df_train['customer_job'].isin(graphic_color_art), 'customer_job'] = 'graphic_color_art'
df_train.loc[df_train['customer_job'].isin(healthcare_services), 'customer_job'] = 'healthcare_services'
df_train.loc[df_train['customer_job'].isin(human_resources), 'customer_job'] = 'human_resources'
df_train.loc[df_train['customer_job'].isin(information_technology), 'customer_job'] = 'information_technology'
df_train.loc[df_train['customer_job'].isin(legal), 'customer_job'] = 'legal'
df_train.loc[df_train['customer_job'].isin(marketing), 'customer_job'] = 'marketing'
df_train.loc[df_train['customer_job'].isin(media_and_communication), 'customer_job'] = 'media_and_communication'
df_train.loc[df_train['customer_job'].isin(medical_imaging_specialist), 'customer_job'] = 'medical_imaging_specialist'
df_train.loc[df_train['customer_job'].isin(medical_solution), 'customer_job'] = 'medical_solution'
df_train.loc[df_train['customer_job'].isin(military_and_protective_services), 'customer_job'] = 'military_and_protective_services'
df_train.loc[df_train['customer_job'].isin(operations), 'customer_job'] = 'operations'
df_train.loc[df_train['customer_job'].isin(pathologist), 'customer_job'] = 'pathologist'
df_train.loc[df_train['customer_job'].isin(product_management), 'customer_job'] = 'product_management'
df_train.loc[df_train['customer_job'].isin(program_and_project_management), 'customer_job'] = 'program_and_project_management'
df_train.loc[df_train['customer_job'].isin(purchasing), 'customer_job'] = 'purchasing'
df_train.loc[df_train['customer_job'].isin(quality_assurance), 'customer_job'] = 'quality_assurance'
df_train.loc[df_train['customer_job'].isin(radiology_professional), 'customer_job'] = 'radiology_professional'
df_train.loc[df_train['customer_job'].isin(real_estate), 'customer_job'] = 'real_estate'
df_train.loc[df_train['customer_job'].isin(research), 'customer_job'] = 'research'
df_train.loc[df_train['customer_job'].isin(sales), 'customer_job'] = 'sales'
df_train.loc[df_train['customer_job'].isin(support), 'customer_job'] = 'support'
df_train.loc[df_train['customer_job'].isin(surgery_professional), 'customer_job'] = 'surgery_professional'

In [16]:
df_test.loc[df_test['customer_job'].isin(three_d_vfx_art), 'customer_job'] = 'three_d_vfx_art'
df_test.loc[df_test['customer_job'].isin(accounting), 'customer_job'] = 'accounting'
df_test.loc[df_test['customer_job'].isin(administrative), 'customer_job'] = 'administrative'
df_test.loc[df_test['customer_job'].isin(arts_and_design), 'customer_job'] = 'arts_and_design'
df_test.loc[df_test['customer_job'].isin(business_development), 'customer_job'] = 'business_development'
df_test.loc[df_test['customer_job'].isin(clinical_specialist), 'customer_job'] = 'clinical_specialist'
df_test.loc[df_test['customer_job'].isin(community_and_social_services), 'customer_job'] = 'community_and_social_services'
df_test.loc[df_test['customer_job'].isin(consulting), 'customer_job'] = 'consulting'
df_test.loc[df_test['customer_job'].isin(education), 'customer_job'] = 'education'
df_test.loc[df_test['customer_job'].isin(engineering), 'customer_job'] = 'engineering'
df_test.loc[df_test['customer_job'].isin(entrepreneurship), 'customer_job'] = 'entrepreneurship'
df_test.loc[df_test['customer_job'].isin(film_production), 'customer_job'] = 'film_production'
df_test.loc[df_test['customer_job'].isin(finance), 'customer_job'] = 'finance'
df_test.loc[df_test['customer_job'].isin(film_production), 'customer_job'] = 'film_production'
df_test.loc[df_test['customer_job'].isin(graphic_color_art), 'customer_job'] = 'graphic_color_art'
df_test.loc[df_test['customer_job'].isin(healthcare_services), 'customer_job'] = 'healthcare_services'
df_test.loc[df_test['customer_job'].isin(human_resources), 'customer_job'] = 'human_resources'
df_test.loc[df_test['customer_job'].isin(information_technology), 'customer_job'] = 'information_technology'
df_test.loc[df_test['customer_job'].isin(legal), 'customer_job'] = 'legal'
df_test.loc[df_test['customer_job'].isin(marketing), 'customer_job'] = 'marketing'
df_test.loc[df_test['customer_job'].isin(media_and_communication), 'customer_job'] = 'media_and_communication'
df_test.loc[df_test['customer_job'].isin(medical_imaging_specialist), 'customer_job'] = 'medical_imaging_specialist'
df_test.loc[df_test['customer_job'].isin(medical_solution), 'customer_job'] = 'medical_solution'
df_test.loc[df_test['customer_job'].isin(military_and_protective_services), 'customer_job'] = 'military_and_protective_services'
df_test.loc[df_test['customer_job'].isin(operations), 'customer_job'] = 'operations'
df_test.loc[df_test['customer_job'].isin(pathologist), 'customer_job'] = 'pathologist'
df_test.loc[df_test['customer_job'].isin(product_management), 'customer_job'] = 'product_management'
df_test.loc[df_test['customer_job'].isin(program_and_project_management), 'customer_job'] = 'program_and_project_management'
df_test.loc[df_test['customer_job'].isin(purchasing), 'customer_job'] = 'purchasing'
df_test.loc[df_test['customer_job'].isin(quality_assurance), 'customer_job'] = 'quality_assurance'
df_test.loc[df_test['customer_job'].isin(radiology_professional), 'customer_job'] = 'radiology_professional'
df_test.loc[df_test['customer_job'].isin(real_estate), 'customer_job'] = 'real_estate'
df_test.loc[df_test['customer_job'].isin(research), 'customer_job'] = 'research'
df_test.loc[df_test['customer_job'].isin(sales), 'customer_job'] = 'sales'
df_test.loc[df_test['customer_job'].isin(support), 'customer_job'] = 'support'
df_test.loc[df_test['customer_job'].isin(surgery_professional), 'customer_job'] = 'surgery_professional'

In [17]:
# 제품군 간단한 전처리 하기

# 각 카테고리에 대한 매핑 딕셔너리 생성
other = ["other", "others", "etc.", "khác", "outros", "lainnya", "אחר", "otros"]
commercial_tv = ["commercial tv", "commercial tv,tv", "commercial tv,audio/video", "commercial_tv", "tv,commercial tv", "comercial tv"]
heating = ["heating", "חימום" ,"حلول التدفئة", "isıtma", "ogrzewanie (pompy ciepła)", "calefacción"]
multi_split  = ["multi-split", "פיצול מרובה", "multi split"]
single_split = ["single-split", "split tunggal", "single split"]
chiller = ["chiller", "مبرد (تشيلر)", "soğutucu", "pendingin"]
video_wall_signage  = ["video wall signage", "videwall", "video wall"]
hotel_tv = ["hotel tv", "酒店電視"]
hospital_tv = ["hospital tv", "醫院電視"]


df_train.loc[df_train['product_category'].isin(other), 'product_category'] = 'other'
df_train.loc[df_train['product_category'].isin(commercial_tv), 'product_category'] = 'commercial_tv'
df_train.loc[df_train['product_category'].isin(heating), 'product_category'] = 'heating'
df_train.loc[df_train['product_category'].isin(multi_split), 'product_category'] = 'multi_split'
df_train.loc[df_train['product_category'].isin(single_split), 'product_category'] = 'single_split'
df_train.loc[df_train['product_category'].isin(chiller), 'product_category'] = 'chiller'
df_train.loc[df_train['product_category'].isin(video_wall_signage), 'product_category'] = 'video_wall_signage'
df_train.loc[df_train['product_category'].isin(hotel_tv), 'product_category'] = 'hotel_tv'

df_test.loc[df_test['product_category'].isin(other), 'product_category'] = 'other'
df_test.loc[df_test['product_category'].isin(commercial_tv), 'product_category'] = 'commercial_tv'
df_test.loc[df_test['product_category'].isin(heating), 'product_category'] = 'heating'
df_test.loc[df_test['product_category'].isin(multi_split), 'product_category'] = 'multi-split'
df_test.loc[df_test['product_category'].isin(single_split), 'product_category'] = 'single_split'
df_test.loc[df_test['product_category'].isin(chiller), 'product_category'] = 'chiller'
df_test.loc[df_test['product_category'].isin(video_wall_signage), 'product_category'] = 'video_wall_signage'
df_test.loc[df_test['product_category'].isin(hotel_tv), 'product_category'] = 'hotel_tv'

In [18]:
# 방법2 : 전처리만 (레이블 인코딩 X) VER
import re

# "less"가 포함되면 "Less than 3 months"을 반환
# "3"과 "6"이 모두 포함되면 "3 Months ~ 6 Months"을 반환
# "6"과 "9"가 모두 포함되면 "6 Months ~ 9 Months"를 반환
# "9"와 "1"이 모두 포함되면 "9 Months ~ 1 year"을 반환
# "more"가 포함되면 "More than a year"를 반환
# 그 외의 결측치에는 Not specified 을 반환하는 함수

def custom_replace(value):
    value_str = str(value)  # 정수형 데이터를 문자열로 변환
    if "less" in value_str.lower():
        return 'Less than 3 months'
    elif "3" in value_str and "6" in value_str:
        return '3 Months ~ 6 Months'
    elif "6" in value_str and "9" in value_str:
        return '6 Months ~ 9 Months'
    elif "9" in value_str and "1" in value_str:
        return '9 Months ~ 1 year'
    elif "more" in value_str.lower():
        return 'More than a year'
    else: # 941개 NaN값 처리
#         return -1 # 전부 -1
        return 'Not specified' #전부 Not specified
#         return value  #원래 결측치 값 -> 이후 결측치 0으로 채우기


# df_train/df_test에 함수 적용
df_train['expected_timeline'] = df_train['expected_timeline'].apply(custom_replace)
df_test['expected_timeline'] = df_test['expected_timeline'].apply(custom_replace)

In [19]:
### 결측치 처리 함수 작성 ###
def ver_cus_0to1(df):
  df.loc[(df['business_area'].isin(['corporate / office', 'retail', 'education', 'hotel & accommodation'])) &
         (df['customer_type'].isin(['End-Customer', 'End Customer', 'End-user'])),
         'ver_cus'] = 1
  return df

### df_train/df_test에 적용
df_train = ver_cus_0to1(df_train)
df_test = ver_cus_0to1(df_test)

In [20]:
### 결측치 처리 함수 ###
def ver_pro_0to1(df):
    df.loc[(df['business_area'].isin(['corporate / office', 'retail', 'hotel & accommodation','education'])) &
           (df['product_category'].str.contains('signage', case=False) |
            df['product_category'].str.contains('hotel tv', case=False)),
           'ver_pro'] = 1
    return df

### df_train/df_test에 적용
df_train = ver_pro_0to1(df_train)
df_test = ver_pro_0to1(df_test)

In [21]:
df_train.head() # 전처리된 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End Customer,Enterprise,,0,0,...,LGEPH,Less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End Customer,Enterprise,12.0,0,0,...,LGEPH,Less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End Customer,Enterprise,144.0,0,0,...,LGEIL,Less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End Customer,Enterprise,,0,0,...,LGEIL,Less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,0,0,...,LGEIL,Less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


In [22]:
df_test.head() # 전처리된 학습용 데이터 살펴보기

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.0,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,1,...,LGESP,Not specified,1,0,0.001183,0.04984,retail,Electronics & Telco,278,True
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,0,...,LGEUS,Not specified,0,0,1.3e-05,,transportation,Others,437,True
2,8491,1.0,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,0,...,LGEGF,Less than 3 months,0,0,6e-05,0.131148,hospital & health care,General Hospital,874,True
3,19895,0.5,/ Madison / United States,ID,0.118644,17204,,Enterprise,,1,...,LGEUS,More than a year,0,0,0.001183,0.04984,retail,,194,False
4,10465,1.0,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1,...,LGESP,Less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,True


In [23]:
df_train.iloc[1:6, df_train.columns.isin(['business_unit', 'business_area', 'product_category', 'customer_type', 'ver_pro', 'ver_cus'])]

Unnamed: 0,business_unit,customer_type,product_category,ver_cus,ver_pro,business_area
1,AS,End Customer,multi_split,1,0,corporate / office
2,AS,End Customer,single_split,1,0,corporate / office
3,AS,End Customer,vrf,1,0,corporate / office
4,AS,Specifier/ Influencer,multi_split,0,0,corporate / office
5,AS,End Customer,chiller,1,0,corporate / office


### 레이블 인코딩

In [24]:
# def label_encoding(series: pd.Series) -> pd.Series:
#     """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

#     my_dict = {}

#     # 모든 요소를 문자열로 변환
#     series = series.astype(str)

#     for idx, value in enumerate(sorted(series.unique())):
#         my_dict[value] = idx
#     series = series.map(my_dict)

#     return series

In [25]:
# # 레이블 인코딩할 칼럼들
# label_columns = [
#     "customer_country",
#     "business_subarea",
#     "business_area",
#     "business_unit",
#     "customer_type",
#     "enterprise",
#     "customer_job",
#     "inquiry_type",
#     "product_category",
#     "product_subcategory",
#     "product_modelname",
#     "customer_country.1",
#     "customer_position",
#     "response_corporate",
#     # "expected_timeline",
# ]

# df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# for col in label_columns:
#     df_all[col] = label_encoding(df_all[col])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [26]:
# for col in label_columns:
#     df_train[col] = df_all.iloc[: len(df_train)][col]
#     df_test[col] = df_all.iloc[len(df_train) :][col]

In [27]:
# 불필요해 보이는 열 제거
columns_to_drop = ["customer_country", "customer_country.1", "response_corporate","product_subcategory", "product_modelname", "business_subarea"]

df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [28]:
df_train.head() # 전처리된 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,...,product_category,customer_position,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,lead_owner,is_converted
0,1.0,AS,0.066667,32160,End Customer,Enterprise,,0,0,0,...,multi_split,entry level,Less than 3 months,1,0,0.003079,0.026846,corporate / office,0,True
1,1.0,AS,0.066667,23122,End Customer,Enterprise,12.0,0,0,0,...,multi_split,ceo/founder,Less than 3 months,1,0,0.003079,0.026846,corporate / office,1,True
2,1.0,AS,0.088889,1755,End Customer,Enterprise,144.0,0,0,0,...,single_split,partner,Less than 3 months,1,0,0.003079,0.026846,corporate / office,2,True
3,1.0,AS,0.088889,4919,End Customer,Enterprise,,0,0,0,...,vrf,ceo/founder,Less than 3 months,1,0,0.003079,0.026846,corporate / office,3,True
4,1.0,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,0,0,0,...,multi_split,partner,Less than 3 months,0,0,0.003079,0.026846,corporate / office,4,True


### 2-2. 학습, 검증 데이터 분리

In [29]:
# x_train, x_val, y_train, y_val = train_test_split(
#     df_train.drop("is_converted", axis=1),
#     df_train["is_converted"],
#     test_size=0.2,
#     shuffle=True,
#     random_state=400,
# )

In [30]:
is_holdout = False
iterations = 3000
patience = 100

# Train/Val 데이터를 5-fold로 나누기
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold에 대한 예측값을 저장할 리스트
ensemble_preds = []

## 3. 모델 학습

### 모델 학습

In [31]:
# # 랜덤 서치
# XGB = XGBClassifier(random_state=42, tree_method='hist', device='cuda')

# param_dist = {
#     'n_estimators': randint(50, 1001),
#     'max_depth': randint(3, 10),
#     'min_child_weight': randint(1, 5),
#     'learning_rate': (0.01, 0.2),
#     'subsample': (0.6, 1.0),
#     'colsample_bytree': (0.6,1.0),
#     'gamma': (0, 3),
#     'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
# }

# scoring = make_scorer(f1_score)
# random_search = RandomizedSearchCV(XGB, param_distributions=param_dist, n_iter=150, cv=5, scoring=scoring, random_state=42)

In [32]:
# # ramdom search로 파라미터 찾기
# best_models = []

# for train_index, val_index in kf.split(df_train):
#     # Train/Val 데이터 분할
#     x_train_fold, x_val_fold = df_train.drop("is_converted", axis=1).iloc[train_index], df_train.drop("is_converted", axis=1).iloc[val_index]
#     y_train_fold, y_val_fold = df_train["is_converted"].iloc[train_index], df_train["is_converted"].iloc[val_index]

#     # Train 데이터 imputation
#     imputer = SimpleImputer(strategy='most_frequent')
#     x_train_fold_imputed = imputer.fit_transform(x_train_fold)

#     # 모델 훈련
#     random_search.fit(x_train_fold_imputed, y_train_fold)
#     print(random_search.best_score_)
#     print(random_search.best_params_)
#     best_params_xgb = random_search.best_params_
#     best_model_xgb = XGBClassifier(**best_params_xgb, random_state=42)
#     # best_model_xgb = XGBClassifier(random_state=42, colsample_bytree=1.0, gamma=0, learning_rate=0.2, max_depth=6, min_child_weight=2, n_estimators=926, subsample=1.0)
#     # best_model_lgb = lgb.LGBMClassifier(random_state=42, colsample_bytree=0.8, learning_rate=0.2, max_depth=4, min_child_samples=16, n_estimators=871, subsample=0.9)
#     best_model_xgb.fit(x_train_fold_imputed, y_train_fold)

#     # Validation 데이터 imputation 및 예측
#     x_val_fold_imputed = imputer.transform(x_val_fold)
#     fold_pred = best_model_xgb.predict(x_val_fold_imputed)


#     # fold별 예측값 저장
#     ensemble_preds.append(fold_pred)

#     # 훈련된 모델 저장
#     best_models.append(best_model_xgb)

In [33]:
# # 이미 찾은 파라미터로 학습만 하기
# best_models = []

# # 파라미터 설정
# params_list = [
#     {'colsample_bytree': 1.0, 'gamma': 3, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 677, 'reg_alpha': 0.1, 'subsample': 0.6},
#     {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 995, 'reg_alpha': 0.01, 'subsample': 0.6},
#     {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 695, 'reg_alpha': 1, 'subsample': 1.0},
#     {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 581, 'reg_alpha': 1e-05, 'subsample': 1.0},
#     {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 702, 'reg_alpha': 0.01, 'subsample': 1.0}
# ]


# for i, (train_index, val_index) in enumerate(kf.split(df_train)):
#     # Train/Val 데이터 분할
#     x_train_fold, x_val_fold = df_train.drop("is_converted", axis=1).iloc[train_index], df_train.drop("is_converted", axis=1).iloc[val_index]
#     y_train_fold, y_val_fold = df_train["is_converted"].iloc[train_index], df_train["is_converted"].iloc[val_index]

#     # Train 데이터 imputation
#     imputer = SimpleImputer(strategy='most_frequent')
#     x_train_fold_imputed = imputer.fit_transform(x_train_fold)

#     # 각 fold에 대해 해당하는 파라미터 딕셔너리 선택
#     params = params_list[i % len(params_list)]

#     # 모델 훈련
#     best_model_xgb = XGBClassifier(**params, random_state=42)
#     # best_model_xgb = XGBClassifier(random_state=42, colsample_bytree=1.0, gamma=0, learning_rate=0.2, max_depth=6, min_child_weight=2, n_estimators=926, subsample=1.0)
#     best_model_xgb.fit(x_train_fold_imputed, y_train_fold)

#     # Validation 데이터 imputation 및 예측
#     x_val_fold_imputed = imputer.transform(x_val_fold)
#     fold_pred = best_model_xgb.predict(x_val_fold_imputed)

#     # fold별 예측값 저장
#     ensemble_preds.append(fold_pred)


#     # 훈련된 모델 저장
#     best_models.append(best_model_xgb)

In [34]:
# 클래스 0과 클래스 1의 비율에 따라 scale_pos_weight 설정
df_train['is_converted'].value_counts()

scale_pos_weight = 54449/4850
scale_pos_weight
# 11.22


11.22659793814433

In [35]:
cat_features = ['business_unit', 'customer_idx', 'customer_type', 'enterprise', 'customer_job', 'inquiry_type', 'product_category', 'customer_position', 'business_area', 'lead_owner', 'expected_timeline']  # 실제 범주형 특성 이름으로 변경

In [36]:
# CatBoost 학습
best_models = []

models = []
for train_index, val_index in kf.split(df_train):
    print("="*50)
    
    # Train/Val 데이터 분할
    x_train_fold, x_val_fold = df_train.drop("is_converted", axis=1).iloc[train_index], df_train.drop("is_converted", axis=1).iloc[val_index]
    y_train_fold, y_val_fold = df_train["is_converted"].iloc[train_index], df_train["is_converted"].iloc[val_index]

    # cat_features = x_train_fold.columns[x_train_fold.nunique() > 2].tolist()
    # cat_feature_indices = [x_train_fold.columns.get_loc(feature) for feature in cat_features]
    # cat_feature_indices = [str(int(idx)) for idx in cat_feature_indices]

    x_train_fold[cat_features] = x_train_fold[cat_features].astype('str')
    x_val_fold[cat_features] = x_val_fold[cat_features].astype('str')

    
    # Train 데이터 imputation
    imputer = SimpleImputer(strategy='most_frequent')
    x_train_fold_imputed = pd.DataFrame(imputer.fit_transform(x_train_fold), columns=x_train_fold.columns)

    # Validation 데이터 imputation 및 예측
    x_val_fold_imputed = pd.DataFrame(imputer.transform(x_val_fold), columns=x_val_fold.columns)
    
    x_train_fold_imputed[cat_features] = x_train_fold_imputed[cat_features].astype('str')
    x_val_fold_imputed[cat_features] = x_val_fold_imputed[cat_features].astype('str')
    
    
    model = CatBoostClassifier(iterations=iterations,random_state=42,scale_pos_weight = 11, eval_metric="F1",cat_features=cat_features,one_hot_max_size=4)
    model.fit(x_train_fold_imputed, y_train_fold, 
            eval_set=[(x_val_fold_imputed, y_val_fold)], 
            early_stopping_rounds=patience ,
            verbose = 100
        )
     
    fold_pred = model.predict(x_val_fold_imputed)
    
    # fold별 예측값 저장
    ensemble_preds.append(fold_pred)
    
    best_models.append(model)
    
    if is_holdout:
        break    

Learning rate set to 0.050976
0:	learn: 0.8389758	test: 0.8529728	best: 0.8529728 (0)	total: 149ms	remaining: 7m 27s
100:	learn: 0.9406396	test: 0.9448980	best: 0.9448980 (100)	total: 5.44s	remaining: 2m 36s
200:	learn: 0.9479151	test: 0.9459949	best: 0.9469929 (125)	total: 10.4s	remaining: 2m 24s
300:	learn: 0.9549505	test: 0.9475790	best: 0.9476860 (231)	total: 15.5s	remaining: 2m 18s
400:	learn: 0.9593172	test: 0.9484969	best: 0.9489995 (339)	total: 20.6s	remaining: 2m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9489995347
bestIteration = 339

Shrink model to first 340 iterations.
Learning rate set to 0.050976
0:	learn: 0.8391963	test: 0.8541770	best: 0.8541770 (0)	total: 88.9ms	remaining: 4m 26s
100:	learn: 0.9403311	test: 0.9453684	best: 0.9454562 (85)	total: 4.64s	remaining: 2m 13s
200:	learn: 0.9460518	test: 0.9478435	best: 0.9492282 (169)	total: 9.43s	remaining: 2m 11s
300:	learn: 0.9521016	test: 0.9488147	best: 0.9502578 (225)	total: 14.6s	remainin

In [37]:
# iterations = 2500

# # Train 데이터 imputation
# imputer = SimpleImputer(strategy='most_frequent')
# x_train_imputed = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns)
# x_train_imputed[cat_features] = x_train_imputed[cat_features].astype('str')

# # Train 데이터의 타깃 변수
# # y_train = df_train["is_converted"]

# model = CatBoostClassifier(iterations=iterations, random_state=42, task_type="GPU", eval_metric="F1", cat_features=cat_features, one_hot_max_size=4)
# model.fit(x_train_imputed, y_train, verbose=100)

# best_model = model

In [38]:
for idx, preds in enumerate(ensemble_preds):
    print(f"Fold {idx+1} 예측값 개수:", len(preds))

Fold 1 예측값 개수: 11860
Fold 2 예측값 개수: 11860
Fold 3 예측값 개수: 11860
Fold 4 예측값 개수: 11860
Fold 5 예측값 개수: 11859


In [39]:
# # 앙상블을 통한 최종 예측/다수결 적용
# final_preds = []

# for i in range(min(map(len, ensemble_preds))):
#     # 각 fold별 예측값 조합
#     combined_preds = [ensemble_preds[j][i] for j in range(len(ensemble_preds)) if len(ensemble_preds[j]) > i]

#     # 모든 fold에서 예측값이 있는 경우에만 다수결 적용
#     if combined_preds:
#         # 다수결을 통한 최종 예측
#         majority_vote = Counter(combined_preds).most_common(1)[0][0] == 'True'
#         final_preds.append(majority_vote)


In [40]:
# 앙상블을 통한 최종 예측/5개 모델 중 2개 이상 모델이 1로 예측할 경우
final_preds = []

for i in range(min(map(len, ensemble_preds))):
    # 각 fold별 예측값 조합
    combined_preds = [ensemble_preds[j][i] == 'True' for j in range(len(ensemble_preds)) if len(ensemble_preds[j]) > i]

    # 모든 fold에서 예측값이 있는 경우에만 다수결 적용
    if combined_preds:
        num_ones = sum(combined_preds)  # 1로 분류된 모델의 개수를 세기 위해 1의 개수를 계산
        if num_ones >= 2:  # 2개 이상의 모델이 1로 분류했을 때
          final_preds.append(True)
        else:
          final_preds.append(False)  # 그 외의 경우에는 0으로 예측

In [41]:
# # 그리드 서치
# XGB = XGBClassifier(random_state=42)

# param_grid = {
#     'n_estimators': [50, 100, 300, 1000],
#     'max_depth': [3, 5, 7, 9],
#     'min_child_weight':[1,3],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.7, 0.8, 0.9, 1.0],
#     'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
#     'gamma' : [0, 0.5, 1]
# }

# scoring = make_scorer(f1_score)
# grid_search = GridSearchCV(XGB, param_grid, cv=5, scoring=scoring)
# grid_search.fit(x_train_imputed, y_train)

# print(grid_search.best_score_)
# print(grid_search.best_params_)

In [42]:
# # LightGBM 랜덤 서치
# LGBM = lgb.LGBMClassifier(random_state=42)

# # 탐색할 하이퍼파라미터 범위 지정
# param_dist = {
#     'n_estimators': randint(50, 1001),
#     'max_depth': randint(3, 15),
#     'min_child_samples': randint(15, 25),
#     'learning_rate': (0.01, 0.2),
#     'subsample': [0.7, 0.8, 0.9, 1.0],
#     'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
# }

# # 평가 지표 및 랜덤 서치 객체 생성
# scoring = make_scorer(f1_score)
# random_search = RandomizedSearchCV(LGBM, param_distributions=param_dist, n_iter=150, cv=5, scoring=scoring, random_state=42)


# # early stopping을 위한 검증 세트 설정
# # eval_set = [(x_val_imputed, y_val)]

# # 랜덤 서치 수행
# random_search.fit(x_train_imputed, y_train)
# # random_search.fit(x_train_imputed, y_train, eval_set=eval_set, eval_metric='f1', early_stopping_rounds=50, verbose=False)


# # 최적의 결과 출력
# print(random_search.best_score_)
# print(random_search.best_params_)

In [43]:
# # 그리드 서치로 찾은 최적의 파라미터
# # best_params_xgb = grid_search.best_params_

# # 랜덤 서치로 찾은 최적의 파라미터
# best_params_lgb = random_search.best_params_

# # 최적의 파라미터로 XGBoost 모델 생성
# best_model_lgb = lgb.LGBMClassifier(**best_params_lgb, random_state=42)
# best_model_lgb.fit(x_train_imputed, y_train)

### 모델 성능 보기

In [44]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [45]:
# Validation 데이터에 대한 평가
get_clf_eval(y_val_fold, final_preds)

오차행렬:
 [[ 755  183]
 [ 943 9978]]

정확도: 0.9051
정밀도: 0.4446
재현율: 0.8049
F1: 0.5728


In [46]:
# # 결측치를 최빈값으로 대체
# imputer = SimpleImputer(strategy='most_frequent')
# x_val_imputed = pd.DataFrame(imputer.fit_transform(x_val), columns=x_val.columns)
# x_val_imputed[cat_features] = x_val_imputed[cat_features].astype('str')


# # 예측 및 평가
# pred = best_model.predict(x_val_imputed)
# pred = pred.astype(bool)  # 예측값의 데이터 타입을 boolean으로 변환
# get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [47]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

x_test[cat_features] = x_test[cat_features].astype('str')


# 결측치를 최빈값으로 대체
imputer = SimpleImputer(strategy='most_frequent')
x_test_imputed = pd.DataFrame(imputer.fit_transform(x_test), columns=x_test.columns)

x_test_imputed[cat_features] = x_test_imputed[cat_features].astype('str')

In [48]:
# Test 데이터 예측
test_preds = []
for model in best_models:
    test_pred = model.predict(x_test_imputed)
    test_preds.append(test_pred)

In [49]:
# # 앙상블을 통한 최종 예측/다수결 적용
# final_test_preds = []

# for i in range(len(test_preds[0])):
#     combined_test_preds = [test_preds[j][i] for j in range(len(test_preds))]
#     # majority_vote_test = Counter(combined_test_preds).most_common(1)[0][0]
#     majority_vote_test = Counter(combined_test_preds).most_common(1)[0][0] == 'True'
#     final_test_preds.append(majority_vote_test)

In [50]:
# 앙상블을 통한 최종 예측/5개 모델 중 2개 이상 모델이 1로 예측할 경우        
final_test_preds = []

for i in range(len(test_preds[0])):
    # combined_test_preds = [test_preds[j][i] for j in range(len(test_preds))]
    combined_test_preds = [test_preds[j][i] == 'True' for j in range(len(test_preds))]
    num_ones = sum(combined_test_preds)  # 1로 분류된 모델의 개수를 세기 위해 True의 개수를 계산
    if num_ones >= 2:  # 2개 이상의 모델이 True로 분류했을 때
        final_test_preds.append(True)
    else:
        final_test_preds.append(False)  # 그 외의 경우에는 False로 예측


In [51]:
# test_pred = model.predict(x_test.fillna(0))
# test_pred = best_model_xgb.predict(x_test_imputed)
# sum(test_pred) # True로 예측된 개수
sum(final_test_preds) # True로 예측된 개수

2228

### 제출 파일 작성

In [52]:
# # 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
# df_sub = pd.read_csv("submission.csv")
# df_sub["is_converted"] = test_pred

# # 제출 파일 저장
# df_sub.to_csv("submission.csv", index=False)

In [53]:
# Submission 파일에 예측값 추가
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = final_test_preds
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**