# LG Aimers  영업 성공 여부 분류 경진대회
> 팀명 : *영업 챔피언스*   
> 팀원 : 김이정, 오인우, 이세희, 박주현

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
#catboost 모델 설치
!pip install catboost




[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
#사용할 라이브러리 import
import pandas as pd
import numpy as np
import re
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    make_scorer,
)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from catboost import Pool,CatBoostClassifier

### 데이터 셋 읽어오기

In [3]:
#제공된 csv 파일 읽어오기
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 2-1. 컬럼별 데이터 전처리
- Train데이터는 범주형 변수가 대부분이므로, 카테고리 재분류 및 내용수정/오타 수정 등의 전처리를 거쳤다.

# customer type 전처리
-  customer type은 5개의 카테고리('End Customer','Specifier/ Influencer','Solution Eco-Partner','Other','Home_Owner')에 해당하는 내용의 경우, 해당 카테고리에 재할당 해주었으며,
원본 데이터의 의미를 살리기 위해 나머지 데이터는 그대로 유지했다
- 여러 테스트 결과 나머지 데이터를 특정 카테고리로 분류하는 것보다, 그대로 유지하는 것이 성능이 더 뛰어났다.


In [5]:
# 바꿔줄 단어들을 리스트로 나열
End = ['End-user', 'End-Customer', 'Commercial end-user']
Specifier = ['Specifier / Influencer']
Solution  = ['Software / Solution Provider', 'Software/Solution Provider']
Other = ['Other', 'Others', 'Etc.']
Home_Owner = ['Home Owner', 'Homeowner']

In [6]:
# df_train 전처리
df_train.loc[df_train['customer_type'].isin(End), 'customer_type'] = 'End Customer'
df_train.loc[df_train['customer_type'].isin(Specifier), 'customer_type'] = 'Specifier/ Influencer'
df_train.loc[df_train['customer_type'].isin(Solution), 'customer_type'] = 'Solution Eco-Partner'
df_train.loc[df_train['customer_type'].isin(Other), 'customer_type'] = 'Other'
df_train.loc[df_train['customer_type'].isin(Home_Owner), 'customer_type'] = 'Home_Owner'

In [7]:
## df_train에 적용된 코드와 동일한 처리를 df_test에도 적용
df_test.loc[df_test['customer_type'].isin(End), 'customer_type'] = 'End Customer'
df_test.loc[df_test['customer_type'].isin(Specifier), 'customer_type'] = 'Specifier/ Influencer'
df_test.loc[df_test['customer_type'].isin(Solution), 'customer_type'] = 'Solution Eco-Partner'
df_test.loc[df_test['customer_type'].isin(Other), 'customer_type'] = 'Other'
df_test.loc[df_test['customer_type'].isin(Home_Owner), 'customer_type'] = 'Home_Owner'

In [8]:
# 결과확인
df_train['customer_type'].value_counts()

End Customer             10652
Specifier/ Influencer     2568
Channel Partner           1368
Service Partner            349
Solution Eco-Partner       154
Installer/Contractor        52
Corporate                   31
HVAC Engineer               23
Other                       20
Engineer                    20
Developer                   17
Technician                  16
Home_Owner                  15
Consultant                  15
Manager / Director           8
Installer                    5
Architect/Consultant         5
Reseller                     5
Interior Designer            5
Distributor                  4
System Integrator            2
Dealer/Distributor           2
Technical Assistant          1
Administrator                1
Name: customer_type, dtype: int64

# business unit에 따른 id_strategic_ver/it_strategic/idit_strataegic_ver 전처리
- 'id_strategic_ver','it_strategic_ver','idit_strategic_ver'는 'business_unit'컬럼의'ID,IT'값과 관계가 있었다.
따라서, 아래와 같이 가중치를 부여하는 방식으로 전처리를 했다
    - business_unit이 ID 이면서 business_area 가 특정 사업부인 경우 가중치 부여
    - business_unit이 IT 이면서 business_area 가 특정 사업부인 경우 가중치 부여
    - 'id_strategic_ver'나 'it_strategic_ver' 중 하나가 1인 경우에는 'idit_strategic_ver'에 1을 할당

In [9]:
# business_unit이 ID 이면서 business_area 가 특정 사업부인 경우 가중치 부여
def assign_id_strategic_ver(row):
    if row['business_unit'] == 'ID' and row['business_area'] in ['corporate / office', 'retail', 'hotel & accommodation', 'education']:
        return 1
    else:
        return 0

df_train['id_strategic_ver'] = df_train.apply(assign_id_strategic_ver, axis=1)
df_test['id_strategic_ver'] = df_test.apply(assign_id_strategic_ver, axis=1)

In [10]:
# business_unit이 IT 이면서 business_area 가 특정 사업부인 경우 가중치 부여
def assign_it_strategic_ver(row):
    if row['business_unit'] == 'IT' and row['business_area'] in ['corporate / office', 'retail', 'hotel & accommodation', 'education']:
        return 1
    else:
        return 0

df_train['it_strategic_ver'] = df_train.apply(assign_it_strategic_ver, axis=1)
df_test['it_strategic_ver'] = df_test.apply(assign_it_strategic_ver, axis=1)

In [11]:
# 'id_strategic_ver'나 'it_strategic_ver' 중 하나가 1인 경우에는 'idit_strategic_ver'에 1을 할당
df_train['idit_strategic_ver'] = df_train.apply(lambda row: 1 if row['id_strategic_ver'] == 1 or row['it_strategic_ver'] == 1 else 0, axis=1)

df_test['idit_strategic_ver'] = df_test.apply(lambda row: 1 if row['id_strategic_ver'] == 1 or row['it_strategic_ver'] == 1 else 0, axis=1)

# customer_job과 customer_position 전처리
- customer_job,customer_position컬럼은 각 컬럼별 카테고리에 해당하는 값들을 매핑하여 전처리했다.




In [12]:
# customer_job과 customer_position 의 값을 비교하여 변경하기
def search_func_pos(df):
    # 대소문자를 구분하지 않기 위해 소문자로 변환
    df['customer_position'] = df['customer_position'].str.lower()
    df['customer_job'] = df['customer_job'].str.lower()

    # job function과 job seniority 배열 소문자 변환하여 비교
    job_function = ["3d/vfx art", "accounting", "administrative", "arts and design", "business development", "clinical specialist", "community and social services",
                    "consulting", "education", "engineering", "entrepreneurship", "film production", "finance", "graphic/color art", "healthcare services",
                    "human resources", "information technology", "legal", "marketing", "media and communication", "medical imaging specialist", "medical solution",
                    "military and protective services", "operations", "pathologist", "product management", "program and project management",
                    "purchasing", "quality assurance", "radiology professional", "real estate", "research", "sales", "support", "surgery professional", "others", "other" ]

    job_seniority = ["ceo/founder", "partner", "c-level executive", "vice president", "director", "manager",
                     "associate/analyst", "entry level", "trainee", "intern","others"]

    # customer_position과 customer_job을 비교하여 조건에 맞는 행을 선택
    condition = (df['customer_position'].isin(job_function)) & (df['customer_job'].isin(job_seniority))
    condition_T = df[condition]
    # DataFrame으로 변환
    condition_T = pd.DataFrame(condition_T, columns=['customer_position', 'customer_job'])

    # 조건에 맞는 행의 customer_position과 customer_job 값을 서로 변경
    df.loc[condition, ['customer_position', 'customer_job']] = df.loc[condition, ['customer_job', 'customer_position']].values

    return df

# 함수를 호출하여 데이터프레임을 변경
df_train = search_func_pos(df_train)
df_test = search_func_pos(df_test)

In [13]:
# customer_job 의 각 카테고리에 대한 매핑 딕셔너리 생성
three_d_vfx_art = ["3d/vfx art"]

accounting = ["accounting", "accounts payable", "account exec/manager", "account management"]

administrative = ["administrative", "platform administrator", "imaging administrator", "pacs administrator", "systems administrator", "facility administrator", "network administrator", "admin", "admin assistant", "administración", "administration", "administrative assistant", "adminisztráció", "amministrativo"]

arts_and_design = ["arts and design", "support/facilitator, designer", "design and provide equipment", "design/build", "design/decision maker", "design/install/training/support", "kreation und design", "kreation_und_design", "lead designer", "művészet_és_design", "interior designer", "art and design", "art installation", "arte y diseño", "arte_e_design", "artist, lead on equipment selection", "arts_and_design", "design", "designer", "designer, creative technologist", "designer, producer", "designer/installer", "designere / budget", "designers"]

business_development = ["business development", "sourcing & quoting for end user", "distributor quotation", "developer/property", "curation", "quotation curator", "quote gathering/proposer to owner", "quoting project", "business_development", "development coordinator/procurement"]

clinical_specialist = ["clinical specialist", "clinic", "mental health"]

community_and_social_services = ["community and social services", "community_and_social_services"]

consulting = ["consulting", "technical advisor, reseller", "consultent", "consultant", "consultant / purchaser", "consultant,cabinet fabricator", "strategic communications", "strategy & operations specialist", "solution advisor", "solutions architect", "technology consultant"]

education = ["education", "teacher", "teaching", "educator", "higher education (college & university)", "institute & academy", "instructor"]

engineering = ["engineering", "tech", "technical", "electrical contractor", "implement", "senior design engineer", "solution engineer", "system engineer", "systems engineer", "lead engineer", "electronics & telco", "engineer", "engineering & technical", "engineering & technical executive", "engineering director", "engineering, design, and install", "system designer, integrator", "systems design", "systems designer", "principal engineer", "hardware", "hardware design engineer", "hardware selection", "chief eng.", "chief engineer", "chief of engineering", "design engineer", "director of engineering"]

entrepreneurship = ["business owner", "ceo", "director comercial", "head", "engagement executive", "execution", "executive", "owner","owner representation", "owning company", "ownner-marketing director", "ceo/founder", "lead", "organizer", "leader", "vice president", "vp/gm", "underboss", "the big boss"]

film_production = ["film production", "home theater", "community theater"]

finance = ["finance", "finanzas", "finanzen", "pénzügy", "finance executive"]

graphic_color_art = ["graphic/color art", "colorist", "gc", "graphic design"]

healthcare_services = ["healthcare services", "healthcare professionals", "healthcare_services"]

human_resources = ["human resources", "hr posting", "hr", "human_resources"]

information_technology = ["information technology", "it", "it - information technology", "it department", "information technology\u200b", "software developer", "emerging technology / innovation", "informatics, touch capability", "information technology\u200b", "information_technology", "infrastructure", "it", "it - information technology", "it admin", "it administrator", "it dairector", "it department", "it director", "it hardware technician", "it integrator", "it manager", "it project lead", "it specialist", "it support", "it tech.", "it/software", "application development", "cloud / mobility", "collaboration & web apps", "computing & it"],

legal = ["legal"]

marketing = ["marketing", "technical marketing", "advertising", "product marketing", "advertising and promotions team", "event marketing", "field marketing", "marketing coordinator", "marketing executive", "marketing operations"]

media_and_communication = ["media and communication", "broadcasting & media", "media and communications", "media_and_communication", "media_e_comunicazione", "medien_und_kommunikation", "medios_de_comunicación", "média_és_kommunikáció"]

medical_imaging_specialist = ["medical imaging specialist", "medical imaging  specialist", "spécialiste_en_imagerie_médicale"]

medical_solution = ["medical solution", "doctor", "tierarzt", "medical solution  provider", "medical solution provider", "medical solution provider\u200b"]

military_and_protective_services = ["military and protective services", "military_and_protective_services"]

operations = ["coo", "director of operations","regional director of operations", "operations executive", "operations manager", "operations", "facilities and operations", "üzemeltetés"]

pathologist = ["pathologist"]

product_management = ["product management", "product_management"]

program_and_project_management = ["program and project management", "av project manager", "signage subcontractor p/m", "general manager - project manager", "digital project manager", "program directors", "gestión_de_proyectos", "program-_és_projektmenedzsment", "program_and_project_management", "program_and_project_manager", "programm- und projektmanagement", "programm-_und_projektmanagement", "project administrator", "project coordinator", "project director", "project facilitator", "project head", "project lead", "project manage", "project manager", "project manager / estimator", "project manager / principal", "project manager/designer", "project researcher", "project sales/manage", "projection manager", "projectr mgmt", "projektmenedzsment\tprogram and project management", "planner", "planner/purchaser", "planning and installation", "pm", "a/v project manager", "project manager", "owner / project manager", "producer/project manager"]

purchasing = ["purchasing", "buyer", "buyer, coordinating", "obtain quotes, process purchase", "requirements and buyer", "ordering manager", "requisition", "purchase", "purchase and install", "purchase dept", "purchaser", "purchaser, it and installer", "purchasers", "purchasing agent", "purchasing authority", "purchasing coordinator", "purchasing director", "purchasing manager", "purchasing supervisor", "purchsing", "director purchaser", "drop, purchase maxhub", "public bidder", "bidder"]

quality_assurance = ["quality assurance", "quality_assurance"]

radiology_professional = ["radiology professional", "profesional de radiología", "radiology  professional", "radiology_professional"]

real_estate = ["real estate", "building owner", "property owner"]

research = ["research", "associate/analyst", "r&d project manager", "research & development", "research and developement", "research products and prices", "research/install", "product research", "product researcher"]

sales = ["sales", "asking for quote for client", "field / outside sales", "sourcing / procurement", "sourcing/procurement", "reseller/integrator", "procurement", "procurement specialist", "procurment", "revendedor", "car dealership", "vendor / reseller", "vendite", "értékesítés", "technical sales", "reseller", "sale", "sales engineering", "sales executive", "sales manager", "sales operations", "sales rep", "salesman"]

support = ["support", "help desk / desktop services", "helpdesk specialist", "post install support and service", "supplier and installation"]

surgery_professional = ["surgery professional", "profesional de cirugía", "surgery professional\u200b"]

other = ["egyéb", "other", "others", "otros", "autres"]

general_manager = ["general manager", "gm"]

In [14]:
# 직업군과 그에 해당하는 값들의 리스트들을 job_lists 로 묶음
job_lists = [
    ('three_d_vfx_art', three_d_vfx_art),
    ('accounting', accounting),
    ('administrative', administrative),
    ('arts_and_design', arts_and_design),
    ('business_development', business_development),
    ('clinical_specialist', clinical_specialist),
    ('community_and_social_services', community_and_social_services),
    ('consulting', consulting),
    ('education', education),
    ('engineering', engineering),
    ('entrepreneurship', entrepreneurship),
    ('film_production', film_production),
    ('finance',finance),
    ('graphic_color_art',graphic_color_art),
    ('healthcare_services',healthcare_services),
    ('human_resources',human_resources),
    ('information_technology',information_technology),
    ('legal',legal),
    ('marketing',marketing),
    ('media_and_communication',media_and_communication),
    ('medical_imaging_specialist',medical_imaging_specialist),
    ('medical_solution',medical_solution),
    ('military_and_protective_services',military_and_protective_services),
    ('operations',operations),
    ('pathologist',pathologist),
    ('product_management',product_management),
    ('program_and_project_management',program_and_project_management),
    ('purchasing',purchasing),
    ('quality_assurance',quality_assurance),
    ('radiology_professional',radiology_professional),
    ('real_estate',real_estate),
    ('research',research),
    ('sales',sales),
    ('support',support),
    ('surgery_professional',surgery_professional),
    ('other', other),
    ('general_manager', general_manager)
]

In [15]:
# customer_job을 매핑하는 함수 생성
def categorize_jobs(df, job_lists):
    for job_list in job_lists:
        job_name = job_list[0]
        job_values = job_list[1]
        df.loc[df['customer_job'].isin(job_values), 'customer_job'] = job_name
    return df

# 함수 호출 및 적용
df_train = categorize_jobs(df_train, job_lists)
df_test = categorize_jobs(df_test, job_lists)

In [16]:
# cutomer_position 의 각 카테고리에 대한 매핑 딕셔너리 생성

position_mapping = {
    "CEO/Founder": ["ceo/founder", "founder", "co-founder", "chief executive officer", "ceo/fundador"],
    "Partner": ["partner", "business partner"],
    "C-level Executive": ["c-level executive", "c-levelexecutive"],
    "Vice President": ["vice president", "vp", "vicepresident"],
    "Director": ["director", "business unit director", "director cum faculty at gaining apex coaching centre"],
    "Manager": ["manager", "av management", "product management"],
    "Associate/Analyst": ["associate/analyst"],
    "Entry Level": ["entrylevel", "entry level"],
    "Trainee": ["trainee"],
    "Intern": ["intern", "unpaid"],
    "other" : ["others", "other"]
    }

# 각 customer_position을 job_seniority 배열에 매핑
df_train['customer_position'] = df_train['customer_position'].apply(
    lambda x: next((seniority for seniority, positions in position_mapping.items() if any(pos in x.lower() for pos in positions)), x)
    # 매핑되는 항목이 없으면 원래 값(x)을 유지
)

# 각 customer_position을 job_seniority 배열에 매핑
df_test['customer_position'] = df_test['customer_position'].apply(
    lambda x: next((seniority for seniority, positions in position_mapping.items() if any(pos in x.lower() for pos in positions)), x)
    # 매핑되는 항목이 없으면 원래 값(x)을 유지
)

# 매핑 결과 확인
print(df_train['customer_position'].value_counts())

none                   19680
Manager                 8226
CEO/Founder             7993
other                   7465
Director                4849
                       ...  
associate professor        1
hon dean                   1
chairman                   1
pgt physics                1
entrepreneurship           1
Name: customer_position, Length: 96, dtype: int64


# product_category 전처리
-  product_category 컬럼은 컬럼의 전처리가 되지 않은, 다른 나라의 언어로 기입된 값들을 카테고리에 매핑하는 방식으로 전처리했다.

In [17]:
# product_category 다른 나라 언어로 기입된 내용 전처리 하기

# 각 카테고리에 대한 매핑 딕셔너리 생성
def preprocess_product_category(df):
    mapping_dict = {
        "other" : ["other", "others", "etc.", "khác", "outros", "lainnya", "אחר", "otros", "ฯลฯ", "inne", 'autre'],
        "commercial_tv" : ["commercial tv", "commercial tv,tv", "commercial tv,audio/video", "tv,commercial tv", "comercial tv"],
        "heating" : ["heating", "חימום" ,"حلول التدفئة", "isıtma", "ogrzewanie (pompy ciepła)", "calefacción", "aquecimento"],
        "multi_split"  : ["multi-split", "פיצול מרובה", "multi split", "multi-split (plusieurs pièces)"],
        "single_split" : ["single-split", "split tunggal", "single split"],
        "chiller" : ["chiller", "مبرد (تشيلر)", "soğutucu", "pendingin"],
        "video_wall_signage"  : ["video wall signage", "videwall", "video wall", "videowall signage"],
        "hotel_tv" : ["hotel tv", "酒店電視"],
        "hospital_tv" : ["hospital tv", "醫院電視"],
        "one_quick_series" : ["one:quick series", "lg one:quick series", "one:quick", "lg one:quick", "onequick series", "one quick:flex", "one:quick flex", 'one quick works', 'aio | one quick'],
        "medical_display" : ["medical display", "medical displays", "medical monitors", "medical- surgical", "medical monitor"],
        "air_conditioner" : ["מזגנים למקום מגורים", "điều hòa gia dụng", "เครื่องปรับอากาศเผื่อที่อยู่อาศัย", "ac rumah", "điều hòa cục bộ", "تكييفات", "climatiseur résidentiel", 'system air conditioner'],
        "interactive_digital_board" : ['idb', 'interactive digital board']
    }

    for category, keywords in mapping_dict.items():
            df.loc[df['product_category'].isin(keywords), 'product_category'] = category

    return df

# Train 및 Test 데이터 전처리
df_train = preprocess_product_category(df_train)
df_test = preprocess_product_category(df_test)

# inquiry_type 전처리
- inquiry_type은 LG공식 홈페이지의 Inquiry To buy의 inquiry_type을 기준으로, 해당 카테고리에 맞게 값들을 매핑하여 전처리했다.
- 해당 카테고리와 일치한다고 판단이 불가능한 데이터들은 원본 형태로 유지하는 것이, 성능이 가장 뛰어났다.

In [18]:
# 각 inquiry_type에 대한 딕셔너리 생성
inquiry_mapping = {
    "Quotation or Purchase Consultation": ['Quotation or purchase consultation', 'Quotation or Purchase Consultation', 'quotation_or_purchase_consultation',
                                           'quotation_or_purchase_consultation', 'Quotation or Purchase consultation', 'Purchase or Quotation',
                                          'Purchase', 'quotation_', 'Request for quotation or purchase' ] ,
    "Usage or Technical Consultation": ['Usage or Technical Consultation', 'Technical Consultation', 'technical_consultation', 'usage or technical consultation', 'usage_or_technical_consultation', 'Usage or technical consultation',
                                       'Technical Support', 'Request for technical consulting', 'technical'],
    "Request a Demo": ['Request a Demo'],
    "OEM/ODM Request": ['OEM/ODM Request'],
    "Request for Partnership": ['Request for Partnership' ],
    "Customer Suggestions": ['Customer Suggestions'],
    "Request for Distributorship" : ['Request for Distributorship'],
    "Sales Inquiry" : ['Sales inquiry','sales' ],
    "Others" : ['Other', 'other', 'other_', 'Others', 'others', 'Etc.']
}

# inquiry_type 매핑 함수
def map_inquiry_type(inquiry):
    for inquiry_type, inquiry_list in inquiry_mapping.items():
        if inquiry.lower() in [item.lower() for item in inquiry_list]:
            return inquiry_type
    return inquiry

# 각 inquiry_mapping을 inquiry_type 배열에 매핑
df_train['inquiry_type'] = df_train['inquiry_type'].astype(str).apply(map_inquiry_type)
df_test['inquiry_type'] = df_test['inquiry_type'].astype(str).apply(map_inquiry_type)

# expected_timeline 전처리

- inquiry_type은 LG공식 홈페이지의 Inquiry To Buy의 expected_timeline을 기준으로, 해당 카테고리에 맞게 값들을 매핑하여 전처리했다.
- 결측치는 'Not specified'값을 일괄적으로 부여했다.

In [19]:
# expected_timeline 전처리

def custom_replace(value):
    value_str = str(value)  # 정수형 데이터를 문자열로 변환
    if "less" in value_str.lower() and ("3" in value_str or "2" in value_str or "1" in value_str):
        return 'Less than 3 months'
    elif ("3" in value_str and "6" in value_str) or ("less" in value_str.lower() and ("6" in value_str or "5" in value_str or "4" in value_str)):
        return '3 Months ~ 6 Months'
    elif "6" in value_str and "9" in value_str:
        return '6 Months ~ 9 Months'
    elif "9" in value_str and "1" in value_str:
        return '9 Months ~ 1 year'
    elif "more" in value_str.lower():
        return 'More than a year'
    else:
        return 'Not specified'

# df_train/df_test에 함수 적용
df_train['expected_timeline'] = df_train['expected_timeline'].apply(custom_replace)
df_test['expected_timeline'] = df_test['expected_timeline'].apply(custom_replace)

# ver_cus와 ver_pro 전처리

- 기존 train의 ver_cus 데이터 확인 후 특정 business_unit을 ['corporate / office', 'retail', 'education', 'hotel & accommodation']의 네 가지로 설정했다.

- 기존 train의 ver_pro 데이터 확인 후 특정 category를 'signage'와 'hotel_tv' 로 설정했다.



In [20]:
# ver_cus 잘못 기입된 값 전처리 함수
# 가설로 설정한 4가지의 business_unit에 해당되면서, end-user인 경우 가중치 부여
def ver_cus_0to1(df):
  df.loc[(df['business_area'].isin(['corporate / office', 'retail', 'education', 'hotel & accommodation'])) &
         (df['customer_type'].isin(['End-Customer', 'End Customer', 'End-user'])),
         'ver_cus'] = 1
  return df

### df_train/df_test에 적용
df_train = ver_cus_0to1(df_train)
df_test = ver_cus_0to1(df_test)

In [21]:
# ver_pro 잘못 기입된 값 전처리 함수
# 가설로 설정한 4가지의 business_unit에 해당되면서, product_category에 signage와 hotel_tv가 포함되면 가중치 부여
def ver_pro_0to1(df):
    df.loc[(df['business_area'].isin(['corporate / office', 'retail', 'hotel & accommodation','education'])) &
           (df['product_category'].str.contains('signage', case=False) |
            df['product_category'].str.contains('hotel_tv', case=False)),
           'ver_pro'] = 1
    return df

### df_train/df_test에 적용
df_train = ver_pro_0to1(df_train)
df_test = ver_pro_0to1(df_test)

In [22]:
# 불필요해 보이는 열 설정 및 제거
columns_to_drop = ["customer_country", "customer_country.1", "response_corporate","product_subcategory", "product_modelname", "business_subarea"]

df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

### 2-2. 학습, 검증 데이터 분리

In [23]:
is_holdout = False
iterations = 3000
patience = 50     # 조기 종료 조건을 50으로 설정

# Train/Val 데이터를 10-fold로 나누기
# kf = KFold(n_splits=10, shuffle=True, random_state=42)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # 데이터 불균형을 고려하여 StratifiedKFold 사용

# 각 fold에 대한 예측값을 저장할 리스트
ensemble_preds = []

## 3. 모델 학습

### 모델 학습

In [24]:
# 타겟 값의 True와 False의 비율에 따라 scale_pos_weight 설정
df_train['is_converted'].value_counts()

scale_pos_weight = 54449/4850
scale_pos_weight
# 11.22

11.22659793814433

In [25]:
# 범주형 변수 지정
cat_features = ['business_unit', 
                'customer_idx', 
                'customer_type', 
                'enterprise',
                'customer_job',
                'inquiry_type', 
                'product_category',
                'customer_position',
                'business_area',
                'lead_owner', 
                'expected_timeline']

In [26]:
# CatBoost Model 학습
best_models = []

models = []
for train_index, val_index in kf.split(df_train.drop("is_converted", axis=1), df_train["is_converted"]):
    print("="*50)

    # Train/Val 데이터 분할
    x_train_fold, x_val_fold = df_train.drop("is_converted", axis=1).iloc[train_index], df_train.drop("is_converted", axis=1).iloc[val_index]
    y_train_fold, y_val_fold = df_train["is_converted"].iloc[train_index], df_train["is_converted"].iloc[val_index]


    x_train_fold[cat_features] = x_train_fold[cat_features].astype('str')
    x_val_fold[cat_features] = x_val_fold[cat_features].astype('str')


    # 결측치 처리를 위한 imputer 설정(train 데이터의 최빈값으로 설정)
    imputer = SimpleImputer(strategy='most_frequent')
    x_train_fold_imputed = pd.DataFrame(imputer.fit_transform(x_train_fold), columns=x_train_fold.columns)

    # Validation 데이터 imputation
    x_val_fold_imputed = pd.DataFrame(imputer.transform(x_val_fold), columns=x_val_fold.columns)

    x_train_fold_imputed[cat_features] = x_train_fold_imputed[cat_features].astype('str')
    x_val_fold_imputed[cat_features] = x_val_fold_imputed[cat_features].astype('str')

    # 모델 학습
    model = CatBoostClassifier(iterations=iterations,random_state=42,scale_pos_weight = 11, eval_metric="F1",cat_features=cat_features,one_hot_max_size=4)
    model.fit(x_train_fold_imputed, y_train_fold,
            eval_set=[(x_val_fold_imputed, y_val_fold)],
            early_stopping_rounds=patience ,
            verbose = 100
        )

    fold_pred = model.predict(x_val_fold_imputed)

    # fold별 예측값 저장
    ensemble_preds.append(fold_pred)

    # 학습한 모델 저장
    best_models.append(model)

    if is_holdout:
        break

Learning rate set to 0.052481
0:	learn: 0.8420409	test: 0.8486708	best: 0.8486708 (0)	total: 229ms	remaining: 11m 26s
100:	learn: 0.9396487	test: 0.9451025	best: 0.9452802 (94)	total: 9.74s	remaining: 4m 39s
200:	learn: 0.9471330	test: 0.9475772	best: 0.9484011 (174)	total: 20.4s	remaining: 4m 43s
300:	learn: 0.9530514	test: 0.9492966	best: 0.9508351 (270)	total: 31.6s	remaining: 4m 43s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.950835142
bestIteration = 270

Shrink model to first 271 iterations.
Learning rate set to 0.052481
0:	learn: 0.8543646	test: 0.8724523	best: 0.8724523 (0)	total: 97.1ms	remaining: 4m 51s
100:	learn: 0.9404636	test: 0.9540187	best: 0.9541079 (98)	total: 9.77s	remaining: 4m 40s
200:	learn: 0.9479717	test: 0.9550867	best: 0.9554451 (189)	total: 20.6s	remaining: 4m 46s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9554450802
bestIteration = 189

Shrink model to first 190 iterations.
Learning rate set to 0.052481
0:	lea

In [27]:
# 각 flod 별 예측값 개수 조회
for idx, preds in enumerate(ensemble_preds):
    print(f"Fold {idx+1} 예측값 개수:", len(preds))

Fold 1 예측값 개수: 5930
Fold 2 예측값 개수: 5930
Fold 3 예측값 개수: 5930
Fold 4 예측값 개수: 5930
Fold 5 예측값 개수: 5930
Fold 6 예측값 개수: 5930
Fold 7 예측값 개수: 5930
Fold 8 예측값 개수: 5930
Fold 9 예측값 개수: 5930
Fold 10 예측값 개수: 5929


## 4. 제출하기

### 테스트 데이터 예측

In [28]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

x_test[cat_features] = x_test[cat_features].astype('str')


# test 데이터 결측치 처리 (train 데이터로 학습시킨 imputer 사용)
x_test_imputed = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns)

x_test_imputed[cat_features] = x_test_imputed[cat_features].astype('str')

In [29]:
# Test 데이터 예측
test_preds = []
for model in best_models:
    test_pred = model.predict(x_test_imputed)
    test_preds.append(test_pred)

In [30]:
# 앙상블을 통한 최종 예측/10개 모델 중 5개 이상 모델이 True로 예측할 경우
final_test_preds = []

for i in range(len(test_preds[0])):
    combined_test_preds = [test_preds[j][i] == 'True' for j in range(len(test_preds))]
    num_ones = sum(combined_test_preds)  # True로 분류된 모델의 개수를 계산
    if num_ones >= 5:  # 5개 이상의 모델이 True로 분류했을 경우
        final_test_preds.append(True)   # 최종 예측을 Treu로 예측
    else:
        final_test_preds.append(False)  # 그 외의 경우에는 False로 예측

In [31]:
sum(final_test_preds) # True로 예측된 개수

1953

### 제출 파일 작성

In [62]:
# Submission 파일에 예측값 추가
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = final_test_preds
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**