목차

- 모듈 imprt 
- 결측치 채우기 (ver_win_rate_x 분류, ver_win_ratio_per_bu 분류, com_reg_ver_win_rate 회귀)
- 특정 범주 파생변수 생성
- business_area 결측치 채우기
- 학습코드 (인코딩, voting모델 정의 및 fit) 
- 제출파일 생성

# 모듈 import

In [1]:
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, mean_squared_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from xgboost import XGBClassifier, plot_importance as plot_importance_xgb
from lightgbm import LGBMClassifier, LGBMRegressor, plot_importance as plot_importance_lgb
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
from bayes_opt import BayesianOptimization

%matplotlib inline

def seed_everything(seed: int = 24):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    global SEED
    SEED = seed

seed_everything(24)


# 전처리

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('submission.csv')

In [None]:
# drop
def drop_columns(df):
    df.drop(columns=['product_subcategory', 'product_modelname', 'customer_country.1', 'business_subarea'], inplace=True, axis=1)
    if 'id' in df.columns:
        df.drop(columns=['id'], inplace=True, axis=1)

    return df

# 결측값 0으로 처리 : 'ver_win_ratio_per_bu', 'com_reg_ver_win_rate', 'ver_win_rate_x' 결측값은 모델로 예측
def fill_missing_values_with_0(df):
    column = ['it_strategic_ver', 'id_strategic_ver', 'idit_strategic_ver', 'historical_existing_cnt']
    df[column] = df[column].fillna(0)
    
    return df

# customer_country
import googlemaps
gmaps = googlemaps.Client(key='AIzaSyAVUPrLICIAfdLfYEJDlc84qgzFX8noGWg')

def preprocess_country(df):
    primary_countries = [
       'Philippines', 'India', 'Nigeria', 'Saudi Arabia', 'Singapore', 'Brazil', 'South Africa', 'United States', 'Colombia',
       'Mexico', 'Ghana', 'Egypt', 'Rwanda', 'Ethiopia', 'Australia', 'Kenya', 'Indonesia', 'Oman', 'Pakistan', 'United Kingdom',
       'Guatemala', 'Panama', 'Canada', 'Bangladesh', 'Guinea', 'United Republic of Tanzania', 'Qatar', 'Afghanistan', 'Chile',
       'Mozambique', 'Türkiye', 'El Salvador', 'Togo', 'Jordan', 'Iraq', 'Israel', 'Sri Lanka', 'South Korea', 'Portugal', 'Mauritania',
       'Uruguay', 'Peru', 'Germany', 'Romania', 'Norway', 'Jamaica', 'Hungary', 'Poland', 'Spain', 'Argentina', 'Ecuador',
       'Senegal', 'Hong Kong', 'Malaysia', 'Japan', 'Kuwait', 'Ireland', 'Albania', 'Greece', 'Algeria', 'Nicaragua', 'Slovenia', 'Italy',
       'Netherlands', 'Dominican Republic', 'France', 'Uganda', 'Iran', 'Paraguay', 'Bolivia', 'Namibia', 'Tunisia', 'Puerto Rico',
       'Anguilla', 'Croatia', 'Fiji', 'Denmark', 'Sweden', 'Cyprus', 'Belgium', 'Venezuela', 'Maldives', 'Morocco', 'Switzerland',
       'Honduras', 'Austria', 'Russia', 'Burkina Faso', 'Thailand', 'Bahamas', "Côte d'Ivoire", 'Saint Lucia',
       'Democratic Republic of the Congo', 'Cambodia', 'Zimbabwe', 'Vietnam', 'Barbados', 'Suriname', 'Costa Rica', 'Botswana',
       'Curaçao', 'Guyana', 'Mali', 'China', 'Latvia', 'Libya', 'Central African Republic', 'Turks and Caicos Islands',
       'Azerbaijan', 'Yemen', 'Antigua', 'Lebanon', 'Angola', 'Bulgaria', 'Mongolia', 'Armenia', 'Trinidad and Tobago', 'Northern Mariana Islands', 
       'Nepal', 'Luxembourg', 'Somalia', 'Bahrain', 'Georgia', 'Mauritius', 'Uzbekistan', 'Taiwan', 'Iceland', 'Czechia', 'Monaco', 'Brunei', 'Malta',
       'Saint Kitts and Nevis', 'Myanmar', 'Sierra Leone', 'Sudan', 'Cameroon', 'Syria', 'The Gambia', 'Gabon', 'Montenegro', 'Laos',
       'Lithuania', 'Zambia', 'Estonia', 'Serbia', 'Benin', 'Macedonia', 'Bosnia and Herzegovina', 'Bermuda', 'Lesotho',
       'New Zealand', 'Ukraine', 'Republic of the Congo',  'Kazakhstan', 'Belarus', 'Palestine',  'Cayman Islands', 'Eswatini', 'Finland',  'Kosovo',
       'Djibouti', 'Belize', 'Saint Martin', 'U.S. Virgin Islands', 'United Arab Emirates', 'Aruba', 'Cuba', 'Haiti', 'Isle of Man', 'Slovakia'
    ]
    
    df['customer_country'] = df['customer_country'].replace('//', np.nan)
    
    for country in primary_countries:
        df.loc[train['customer_country'].str.contains(country, na=False), 'customer_country'] = country
    
    for index, loc in df.loc[df['customer_country'].isna() | ~df['customer_country'].isin(primary_countries), 'customer_country'].items():
        if pd.isna(loc):
            continue

        geocode_result = gmaps.geocode(loc)
        if geocode_result:
            for component in geocode_result[0]['address_components']:
                if 'country' in component['types']:
                    country_name = component['long_name']
                    train.at[index, 'customer_country'] = country_name
                    break
        else:
            continue
    
    df['customer_country'].fillna('Others', inplace=True)
    df.loc[~df['customer_country'].isin(primary_countries + ['Others']), 'customer_country'] = 'Others'

    return df

# customer_continent (파생변수)
def get_continent(country):
    customer_continent_mapping = {
        'Asia' : ['Philippines',  'Saudi Arabia', 'Singapore', 'United Arab Emirates', 
                  'Indonesia', 'Qatar','Israel', 'Sri Lanka', 'Malaysia', 'Kuwait', 
                  'Hong Kong', 'Uzbekistan', 'Brunei', 'Nepal', 'Maldives', 'Armenia', 'Myanmar', 'Cambodia', 
                  'Vietnam', 'Laos', 'Kazakhstan', 'Thailand', 'Syria'],
        'Africa' : ['Nigeria', 'South Africa', 'Ghana', 'Egypt', 'Rwanda', 'Ethiopia', 'Kenya', 'Guinea', 'Morocco',
                    'United Republic of Tanzania', 'Mozambique', 'Mauritania', 'Senegal', 'Algeria', 'Uganda', 'Mauritius',
                    'Namibia', 'Tunisia', 'Angola', 'Burkina Faso', "Côte d'Ivoire", 'Democratic Republic of the Congo', 
                    'Republic of the Congo', 'Zimbabwe', 'Botswana', 'Mali', 'Libya', 'Central African Republic', 'Somalia', 'Sierra Leone', 
                    'Sudan', 'Cameroon', 'The Gambia', 'Gabon', 'Zambia', 'Eswatini', 'Djibouti', 'Lesotho', 'Benin'],
        'Europe' : ['United Kingdom', 'Portugal', 'Germany', 'Romania', 'Norway', 'Hungary', 'Poland', 'Slovakia',
                    'Czechia', 'Spain', 'Ireland', 'Albania', 'Greece', 'Slovenia', 'Italy', 'Netherlands', 
                    'Croatia', 'Denmark', 'Sweden', 'Cyprus', 'Belgium', 'Switzerland', 'Austria', 'Russia', 
                    'Bulgaria', 'Luxembourg', 'Iceland', 'Monaco', 'Malta', 'Estonia', 'Serbia', 'France', 'Latvia',
                    'Macedonia', 'Bosnia and Herzegovina', 'Montenegro', 'Lithuania', 'Finland', 'Kosovo', 'Belarus', 'Ukraine'],
        'North America' : ['United States', 'Canada', 'Guatemala', 'Panama', 'Mexico', 'Colombia', 'Jamaica', 'Saint Martin',
                           'Puerto Rico', 'Anguilla', 'Dominican Republic', 'Bahamas', 'Barbados', 'Costa Rica', 'Aruba',
                           'Curaçao', 'Guyana', 'Northern Mariana Islands', 'U.S. Virgin Islands', 'Cayman Islands', 
                           'Bermuda', 'Belize', 'Cuba', 'Haiti', 'Isle of Man', 'Nicaragua', 'Honduras', 'Saint Lucia',
                           'Turks and Caicos Islands', 'Antigua', 'Saint Kitts and Nevis', 'Trinidad and Tobago'],
        'South America' : ['Brazil', 'Chile', 'El Salvador', 'Togo', 'Uruguay', 'Peru', 'Argentina', 'Ecuador', 
                           'Paraguay', 'Bolivia', 'Venezuela', 'Suriname'],
        'Oceania' : ['Australia', 'Fiji', 'New Zealand'],
        'Others' : ['Others'],
    }
    
    for continent, countries in customer_continent_mapping.items():
        if country in countries:
            return continent
    return 'Others'

# customer_type
def preprocess_customer_type(customer_type):
    customer_type_mapping = {
        'End-User' : ['End-user'],
        'End-Customer' : ['End-Customer', 'End Customer'],
        'Specifier/Influencer' : ['Specifier/Influencer', 'Specifier / Influencer', 'Specifier/ Influencer'],
        'Others' : ['Other', 'Others', 'Etc.', np.nan],
        'Software/Solution Provider' : ['Software/Solution Provider', 'Software / Solution Provider'],
        'Home Owner' : ['Homeowner', 'Home Owner'],
        'Manager/Director' : ['Manager / Director']
    }
    
    for category, jobs in customer_type_mapping.items():
        if customer_type in jobs:
            return category
    return customer_type

# customer_job
def get_customer_job(customer_job):
    customer_job_mapping = {
        'purchase' : ['purchasing', 'purchase', 'purchasing manager', 'purchaser', 'purchasing agent', 'drop, purchase maxhub', 'purchasing authority', 'purchasers', 'purchase dept', 'purchsing', 'requirements and buyer', 'buyer'],
        'director/purchase' : ['director purchaser', 'purchasing director', 'director purchaser', 'purchasing supervisor'],
        'coordinator/purchase' : ['purchasing coordinator', 'buyer, coordinating'],
        'install/purchase' : ['purchase and install', 'installation and purchaser'],
        'design/purchase' : ['designer purchaser', 'design/purchaser'],
        'install/designer' : ['design and install', 'designer/installer'],
        'media/communication' : ['media and communication', 'media and communications', 'broadcasting & media', 'media_e_comunicazione', 'média_és_kommunikáció', 'media_and_communication', 'medien_und_kommunikation', 'medios_de_comunicación'],
        'engineering' : ['engineering', 'engineer', 'engineering & technical', 'project engineer'],
        'director/engineering' : ['engineering director', 'director of engineering', 'chief of engineering', 'lead engineer', 'engineering & technical executive', 'chief engineer', 'principal engineer'],
        'system/engineering' : ['systems engineer', 'system engineer', 'systems administrator', 'systems design'],
        'design/engineering' : ['designer/ engineer', 'design engineer'],
        'consulting' : ['consulting', 'consultant', 'consultent', 'content creation, eq consultant'],
        'project_manager' : ['program and project management', 'project manager', 'project coordinator', 'project lead', 'project facilitator', 'producer/project manager', 'project director', 'gestión_de_proyectos', 
                             'project head', 'programm-_und_projektmanagement', 'program_and_project_management', 'program_and_project_manager', 'projectr mgmt', 'owner / project manager', 'project manage', 
                             'project sales/manage', 'project administrator', 'programm- und projektmanagement', 'projektmenedzsment\tprogram and project management', 'digital project manager', 'program-_és_projektmenedzsment', 
                             'projection manager'],
        'designer/project_manager' : ['designer/ project manager', 'project manager/designer'],
        'project_architect' : ['project architect', 'project designer'],
        'member' : ['project team member', 'mindenes'],
        'sales' : ['sales', 'sales manager', 'sales executive', 'salesman', 'technical sales', 'sale', 'sales rep', 'sales operations', 'field / outside sales', 'vendite', 'vertrieb', 'értékesítés'],
        'operation' : ['operations', 'strategy & operations specialist', 'facilities and operations', 'regional director of operations', 'operations executive', 'operaciones', 'üzemeltetés'],
        'director/operation' : ['operations manager', 'director of operations'],
        'administrative' : ['administrative', 'admin', 'administration', 'authorize (you are responsible for making the final decision)', 'adminisztráció', 'amministrativo', 'administración'],
        'administrative assistant' : ['administrative assistant', 'admin assistant'],
        'it' : ['information technology', 'it integrator', 'it department', 'it - information technology', 'computing & it', 'it/software', 'it',  'it tech.', 'it support', 'information technology\u200b', 'information_technology'],
        'director/it' : ['director it', 'it director', 'it specialist', 'it manager', 'director,it', 'director of it', "i'm directing it", 'it dairector', 'it project lead', 'it admin', 'it administrator', 
                         'deputy cio', 'it project lead'],
        'account manager' : ['account management', 'account exec/manager'],
        'education' : ['education', 'educator', 'higher education (college & university)', 'teacher', 'teaching', 'institute & academy'],
        'hr' : ['human resources', 'human_resources', 'hr posting', 'hr'],
        'finance' : ['finance', 'finanzen', 'finanzas', 'pénzügy'],
        'finance manager' : ['director of finance', 'finance executive'],
        'marketing' : ['marketing', 'marketing coordinator', 'event marketing', 'field marketing', 'marketing operations', 'marketing executive', 'technical marketing', 'product marketing'],
        'si' : ['si', 'system installer', 'installer/ system integrater'],
        'general manger' : ['general manager', 'gm', 'general manager - project manager', 'general manager (decision maker)', 'general management', 'genel müdür', 'genera manager'],
        'manager' : ['managgere', 'ordering manager', 'comanager', 'managing director', 'management', 'manger', 'managing contractor', 'managing partner', 'ops mgr'],
        'contractor' : ['general contractor', 'sub contractor', 'federal government contractor', 'contractor', 'electrical contractor', 'cintractor', 'managing contractor'],
        'owner' : ['owning company', 'owner', 'gm/part owner', 'product owner', 'business owner', 'owner representation'],
        'military and protective services' : ['military and protective services', 'military_and_protective_services'],
        'artist' : ['artist, lead on equipment selection','3d/vfx art'],
        'art/design' : ['arts and design', 'arts_and_design', 'art and design', 'arte_e_design', 'arte y diseño', 'művészet_és_design'],
        'medical imaging' : ['medical imaging specialist', 'spécialiste_en_imagerie_médicale', 'medical imaging  specialist', 'radiology professional', 'radiology  professional', 
                             'radiology_professional', 'profesional de radiología'],
        'medical solution' : ['medical solution provider', 'medical solution  provider', 'medical solution provider\u200b', 'medical solution'],
        'doctor' : ['surgery professional', 'doctor', 'surgery professional\u200b', 'főorvos', 'profesional de cirugía', 'cirugano', 'chirurgien'],
        'property owner' : ['property owner' 'building owner', 'proprietário(a)', 'building owner'],
        'ceo' : ['ceo', 'ceo/founder', 'chief eng.''c-level executive'],
        'end-user' : ['end user', 'primary end-user', 'main end user of the product', 'user', 'cliente final'],
        'recommender' : ['recommend', 'recommendation', 'recommend (you recommend specific products or technologies for the solution)', 'recommender'],
        'purchase/planner' : ['planner/purchaser', 'purchase/planner'],
        'install/planner' : ['planning and installation', 'planning and installation', 'install/planner'],
        'technical' : ['technical', 'tech service', 'tech', 'maintenance technician'],
        'technical/director' : ['head of technology', 'technical director', 'directeur technique'],
        'technical/designer' : ['technology designer', 'designer, creative technologist'],
        'av' : ['av technician', 'av tech','costar av team'],
        'av manager' : [ 'av project manager', 'av estimator', 'a/v project manager'],
        'bidder' : ['public bidder', 'bidder'],
        'installer' : ['installer.', 'installer','facilitator installation services'],
        'design/install' : ['install/designer', 'design and installation company', 'install/designer'],
        'research/install' : ['research/install', 'research and instalaltion'],
        'advertising' : ['advertising and promotions team', 'advertising'],
        'reseller' : ['vendor / reseller', 'revendedor', 'reseller', 'reseller/integrator', 'var'],
        'community/social services' : ['community and social services', 'community_and_social_services'],
        'video wall' : ['wall mounted screen mirroring', 'video wall', 'part of video wall', 'component of video wall', 'videowall'],
        'tv' : ['need 1 tv 55" edge led 4k uhd', 'replacing tv', 'tv studio manager', 'change tv', 'need one tv', 'hotel tv', 'fixing tv', 'replacement tv', 'guestroom tv'],
        'cctv' : ['cctv monetoring', 'cctv view'],
        'display/signage' : ['signage subcontractor p/m', 'digital signage', 'signage manager', 'signage for an attraction', 'sliding pictures of beauty salon', 'using for window display', 'signage subcontractor p/m',
                             'display screen from control', 'display our products', 'display screen', 'display screen from control', 'restaurant display', 'display', 'sign company', 'informatics, touch capability'],
        'repair' : ['repair uhd 120 hz units'],
        'manufacturer' : ['manufacturer', 'manufacturing factory / plant'],
        'procurement' : ['procurement', 'procurement specialist', 'procurment'],
        'sourcing/procurement' : ['sourcing/procurement', 'sourcing / procurement'],
        'supervisor' : ['maintenance supervisor', 'supervisor', 'overseer'],
        'testing' : ['testing and troubleshooting', 'tester', 'inquiry-to-buy/contact-us test', 'test4'],
        'solution' : ['solution provider', 'solution advisor', 'software solution', 'solution engineer'],
        'r&d' : ['research and developement',  'research & development', 'r&d project manager'],
        'research' : ['research', 'product research', 'product research', 'research products and prices', 'product researcher', 'project researcher'],
        'architect' : ['solutions architect', 'architect ass interiores'],
        'interior designer' : ['interior designer', 'interior stylist'],
        'integrator' : ['specifier/integrator', 'integration', 'integrator', 'integrador', 'intergrator'],
        'quoter' : ['sourcing & quoting for end user', 'asking for quote for client', 'quotation curator', 'quote gathering/proposer to owner', 'distributor quotation', 'customer experience', 'quoting project'],
        'leader' : ['lead', 'team leader', 'leader', 'team lead'],
        'technical design' : ['technical designer', 'technical design'],
        'creation and design' : ['kreation und design', 'kreation_und_design'],
        'designer' : ['designer', 'designer, producer', 'designers', 'graphic design'],
        'helpdesk' : ['helpdesk specialist', 'helpdesk specialist', 'help desk / desktop services'],
        'energy' : ['energy', 'renewable energy'],
        'distributor' : ['distributor', 'distribuidor'],
        'theater' : ['community theater', 'home theater'],
        'vice president' : ['vp/gm', 'vice president', 'underboss'],
        'distributor' : ['distribuidor', 'distributor'],
        'decision maker' : ['decision maker', 'design/decision maker', 'decider'],
        'equipment' : ['equipment custodian', 'equipment and app provider', 'equipment selection'],
        'photographer' : ['photos', 'photographer'],
        'quality assurance' : ['quality assurance', 'quality_assurance'],
        'healthcare services' : ['healthcare services', 'healthcare_services', 'mental health', 'healthcare professionals', 'healthcare'],
        'conference' : ['conference room', 'conference room', 'conference table', 'for confrence', 'for presentations'],
        'electronics' : ['electronics & telco', 'electronics evaluator'],
        'facilitator' : ['facilitator', 'facility administrator', 'facilities', 'facilitator installation services'],
        'coordinator' : ['coordinator', 'service coordinator', 'parts coordinator'],
        'developer' : ['application development', 'software developer', 'developer'],
        'business development' : ['business development', 'business_development'],
        'serving' : ['serving', 'serving robot', 'serving food', 'assist in serving food', 'waiter'],
        'exhibition' : ['museum / gallery', 'exhibition / convention center'],
        'clinic' : ['clinical specialist', 'clinic'],
        'office' : ['office', 'corporate / office', 'office it'],
        'executive' : ['execution', 'engagement executive'],
        'veterinarian' : ['tierarzt'],
        'principal' : ['principal', 'principal in charge'],
        'events' : ['store promotions', 'tradeshow event'],
        'others' : ['others', 'other', '5% of hotel needs', 'otro', 'otros', 'n.a', 'digital display vs signage need', 'no respoxse on phone will try again', 'we are in iceland', 'no requirment', 
                    'requirement close', 'the person with the credit card', 'nothing', 'other stores', 'sho lyrics', 'sonstiges', 'altro', 'autres', 'egyéb', 'ranger 2', 'menu', np.nan]
    }
    
    for category, jobs in customer_job_mapping.items():
        if customer_job in jobs:
            return category
    return customer_job

# customer_position
def get_customer_position_category(customer_position):
    customer_position_mapping = {
        'entry level' : ['entry level', 'entrylevel'], 
        'none' : ['none', 'this is a consume display requirement for home purpose.', 'not applicable', 'no influence', 'other - please specify - cedia association'], 
        'teacher' : ['teacher', 'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)'],
        'math/physics teacher' : ['math and physics teacher', 'physics and mathematics teacher'],
        'professor' : ['professor', 'prof.', 'education professional'],
        'assistant professor' : ['asst prof.', 'assistant professor'],
        'associate professor' : ['associate professor', 'associate professor in electronics engg'],
        'ceo/founder' : ['ceo/founder', 'ceo/fundador'],
        'c-level executive' : ['c-level executive', 'c-levelexecutive'],
        'architecture/consult' : ['architecture/consult', 'architect/consultant'],
        'decision-maker' : ['decision maker', 'decision-maker'],
        'decision-influencer' : ['decision-influencer', 'decision influencer'],
        'partner' : ['partner', 'business partner'],
        'vice president' : ['vice president', 'vicepresident', 'vp'], 
        'consultant' : ['consultant', 'consulting'],
        'business development' : ['business development', 'business development'],
        'president' : ['president', 'the big boss', 'chairman'],
        'exhibition' : ['exhibitiontv', 'exhibition'],
        'technical' : ['technical', 'técnico'],
        'owner' : ['proprietário(a)'],
        'sales' : ['subsidiary sales (ise)', 'sales'],
        'other' : ['other', 'others', 'bulgaria'],
        'developer' : ['lider de desarrollo'],
        'employee' : ['employee', 'mindenes'],
        'administrative' : ['administrative', 'gerente', 'genel müdür'], 
        'hospital' : ['hospital', 'főorvos'],
        'veterinarian' : ['tierarzt']
    }
    
    for category, customer_positions in customer_position_mapping.items():
        if customer_position in customer_positions:
            return category
    return customer_position

# product_category
def get_product_category(product_category):
    product_category_mapping = {
        'sinage' : ['signage', 'tv', 'ur640', '43us660h0sd.awz', '32lq621cbsb.awz', '32lq621cbsb.awz'],
        'special display' : ['特別顯示屏'],
        'standard display' : ['標準顯示屏'],
        'hospital display' : ['醫院電視'],
        'hotel display' : ['酒店電視'],
        'high brightness' : ['互動式顯示屏', 'high brightness', '高亮度顯示屏'],
        'multi divisions' : ['פיצול מרובה'],
        'board' : ['idb', 'board'],
        'monitor' : ['monitor', '28mq780'],
        'software' : ['軟體'],
        'all-in-one' : ['aio', 'allinone', 'leadallin'],
        'digital retail' : ['retaildigital'],
        'air conditioner' : ['air condition', 'split', 'ac', 'מזגנים', 'تكييفات', 'điều hòa', 'standard'],
        'residential air conditioner' : ['rac', 'ar condicionado residencial', 'résidentiel', 'เครื่องปรับอากาศเผื่อที่อยู่อาศัย'],
        'air handling unit' : ['ahu'],
        'multi air conditioner' : ['multi'],
        'single air conditioner' : ['single package'],
        'cassete air conditioner' : ['teto ou cassete inverter'],
        'heat pump' : ['pompy ciepła'],
        'heater' : ['heating', 'heater', 'isıtma', 'calefacción', 'حلول التدفئة', 'חימום', 'aquecimento'],
        'refrigerator' : ['refrigerator', 'soğutucu'],
        'cooling' : ['réfrigérant', 'pendingin'],
        'air conditiner/cooling' : ['تكييف وتبريد', 'مبرد'],
        'others': ['other', 'otros', 'outros', 'אחר', 'ฯลฯ', 'آخر', 'lainnya', 'not specified', 'inne', 'autre', 'khác', 'etc', np.nan]
    }
    
    for category, product_categories in product_category_mapping.items():
        if product_category in product_categories:
            return category
    return product_category

# lead_desc_length
from scipy import stats

def get_lead_desc_length_transformed(df):
    df['lead_desc_length'], fitted_lambda = stats.boxcox(df['lead_desc_length'])
    return df

# inquiry_type
def get_inquiry_type(inquiry_type):
    inquiry_type_mapping = {
        'others' : ['Other', 'other', 'other_', 'Others', 'others', 'ETC.', 'not specified', 'Not specified', '(Select ID_Needs)', np.nan],
        'quotation or purchase consultation' : ['Quotation or Purchase Consultation', 'Quotation or purchase consultation', 'Quotation or Purchase consultation', 'quotation_or_purchase_consultation', 
                                                'Request for quotation or purchase', 'Purchase or Quotation'],
        'usage or technical consultation' : ['Usage or technical consultation', 'Usage or Technical Consultation', 'usage or technical consultation', 'usage_or_technical_consultation'],
        'event inquiry' : ['Event Inquiry', 'Evento_SdelEstero'],
        'technical consultation' : ['Technical Consultation', 'Request for technical consulting', 'technical_consultation'],
        'lg magnit micro led inquiry' : ['estoy buscando para ecuador este producto lg magnit micro led, para un cliente de 138 pulgadas, con envió marítimo.'],
        'interactive screens quotation' : ['hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en guayaquil -ecuador.'],
        'body temperature measurement device inquiry' : ['Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn'],
        'probeam pricing inquiry' : ['probeam precio', 'Probeam precio'],
        'interactive screens for clinics' : ['Pantallas Interactivas para Clinicas'],
        'one quick support' : ['solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución one quick:', 'One Quick:Flex', 
                               'Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución\xa0One Quick:\xa0'],
        'george v historical integrator' : ['intégrateur historique du george v'],
        'school inquiry' : ['for school'],
        'sales inquiry' : ['Sales Inquiry', 'Sales inquiry', 'sales'],
        'technical information and pricing inquiry' : ['toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'],
        'lg product pricing and solutions inquiry' : ['tôi cần tham khảo giá và giải pháp từ lg'],
        'medical monitor for conventional and tomography inquiry' : ['preciso de um monitor médico para radiografia convencional e tomogrtafia.'],
        'lg magnit micro led inquiry' : ['estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.'],
        'george v historical integrator' : ['Intégrateur historique du George V'],
        'technical information and pricing inquiry' : ['Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung'],
        'lg product pricing and solutions inquiry' : ['tôi cần tham khảo giá và giải pháp từ LG'],
        'medical monitor for conventional and tomography inquiry' : ['Preciso de um monitor médico para radiografia convencional e tomogrtafia.'],
        'interactive screens quotation' : ['Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.']
    }
    for category, inquiry_types in inquiry_type_mapping.items():
        if inquiry_type in inquiry_types:
            return category
    return inquiry_type

# expected_timeline
def get_expected_timeline(df):
    mapping_dict = {
        'less_than_3_months': 'less than 3 months',
        '3_months_~_6_months': '3 months ~ 6 months',
        '9_months_~_1_year': '9 months ~ 1 year',
        '6_months_~_9_months': '9 months ~ 1 year',
        'more_than_a_year': 'more than a year',
        'less then 6 months': '3 months ~ 6 months',
        'less than 5 months': '3 months ~ 6 months',
        'more then 3 months': '3 months ~ 6 months',
        'less than 3 months. customer not answered . to call back': 'less than 3 months',
        'one month': 'less than 3 months',
        'duplicate lead - il220100042906. less than 3 months': 'less than 3 months',
        '9 months - 1 year': '9 months ~ 1 year',
        'less than 3 months ,meeting with the customer for the more details and tentative boq will ne 32 and 43': 'less than 3 months',
        'less than 3 months- outdoor led requiment': 'less than 3 months'
    }

    valid_values = ['less than 3 months', '3 months ~ 6 months', '6 months ~ 9 months', '9 months ~ 1 year', 'more than a year']

    df['expected_timeline'] = df['expected_timeline'].replace(mapping_dict)
    df['expected_timeline'] = df['expected_timeline'].apply(lambda x: x if x in valid_values else 'unknown')

    return df

# converted_rate 한번에 생성하는 함수
def get_converted_rate(columns, train, test):
    for col in columns:
        conversion_rates = {}
        for uni in train[f'{col}'].unique():
            conversions = train[(train[f'{col}'] == uni) & (train['is_converted'] == True)].shape[0]
            total = train[train[f'{col}'] == uni].shape[0]
            conversion_rates[uni] = conversions / total if total > 0 else 0

        train[f'{col}_converted_rate'] = train[f'{col}'].map(conversion_rates).fillna(0)
        test[f'{col}_converted_rate'] = test[f'{col}'].map(conversion_rates).fillna(0)
    
    return train, test

# business_area
def get_business_area(df):
    df['business_area'].fillna('others', inplace=True)
    return df

# business_area 가중치 부여 : 'corporate / office', 'retail', 'hotel & accommodation' (파생변수)
def get_ver_business_area(df):
    ver_business_area = ['corporate / office', 'retail', 'hotel & accommodation']
    df['ver_business_area'] = np.where(df['business_area'].isin(ver_business_area), 1, 0)
    return df

In [None]:
def preprocess(train, test):
    train = drop_columns(train)
    test = drop_columns(test)
    print("== [ drop ] complete == ", flush=True)
    
    train = fill_missing_values_with_0(train)
    test = fill_missing_values_with_0(test)
    print("== [ fill values with 0 ] complete == ", flush=True)
    
    train = preprocess_country(train)
    test = preprocess_country(test)
    print("== [ preprocess_country ] complete == ", flush=True)
    
    train['customer_continent'] = train['customer_country'].apply(get_continent)
    test['customer_continent'] = test['customer_country'].apply(get_continent)
    print("== [ customer_continent ] complete == ", flush=True)
    
    train['customer_type'] = train['customer_type'].apply(preprocess_customer_type)
    test['customer_type'] = test['customer_type'].apply(preprocess_customer_type)
    print("== [ customer_type ] complete == ", flush=True)
    
    train['customer_job'] = train['customer_job'].apply(get_customer_job)
    test['customer_job'] = test['customer_job'].apply(get_customer_job)
    print("== [ customer_job ] complete == ", flush=True)
    
    train['customer_position'] = train['customer_position'].apply(get_customer_position_category)
    test['customer_position'] = test['customer_position'].apply(get_customer_position_category)
    print("== [ customer_position ] complete == ", flush=True)
    
    train['product_category'] = train['product_category'].apply(get_product_category)
    test['product_category'] = test['product_category'].apply(get_product_category)
    print("== [ product_category ] complete == ", flush=True)
    
    train = get_lead_desc_length_transformed(train)
    test = get_lead_desc_length_transformed(test)
    print("== [ lead_desc_length ] complete == ", flush=True)
    
    train['inquiry_type'] = train['inquiry_type'].apply(get_inquiry_type)
    test['inquiry_type'] = test['inquiry_type'].apply(get_inquiry_type)
    print("== [ inquiry_type ] complete == ", flush=True)
    
    train = get_expected_timeline(train)
    test = get_expected_timeline(test)
    print("== [ expected_timeline ] complete == ", flush=True)
    
    train = get_business_area(train)
    test = get_business_area(test)
    print("== [ business_area ] complete == ", flush=True)
    
    train = get_ver_business_area(train)
    test = get_ver_business_area(test)
    print("== [ ver_business_area ] complete == ", flush=True)
    
    # 전환율 한번에 구하기
    columns = ['customer_continent', 'customer_idx', 'customer_type', 'customer_position', 'business_unit', 'response_corporate', 'lead_owner', 'product_category', 'inquiry_type', 'business_area',
               'bant_submit', 'expected_timeline', 'enterprise'] # 'ver_win_rate_x', 'ver_win_ratio_per_bu'는 학습으로 결측 처리한 후에 전환율 구하도록.
    train, test = get_converted_rate(columns, train, test)
    print("== [ converted_rate ] complete == ", flush=True)
    
    return train, test

train, test = preprocess(train, test)

In [None]:
train.to_csv('train_preprocessed.csv', index=False)
test.to_csv('test_preprocessed.csv', index=False)

# 결측치 채우기 (ver_win_rate_x, ver_win_ratio_per_bu, com_reg_ver_win_rate)

In [2]:
def encode_categorical_variables(train, test):                                        # 인코더 정의
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
    
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        
        train_unique_labels = train[col].astype(str).unique()
        le.fit(train_unique_labels)
        
        test_unique_labels = test[col].astype(str).unique()
        unseen_labels = set(test_unique_labels) - set(train_unique_labels)
        
        if unseen_labels:
            le_classes = le.classes_.tolist()  
            le_classes.append('other') 
            le.classes_ = np.array(le_classes)  
            test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
        else:

            test[col] = test[col].astype(str)
        
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col])
        
        label_encoders[col] = le
    
    return train, test, label_encoders

## 1. train data의 ver_win_rate_x 분류

In [3]:
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [4]:
#예측에 사용할 변수 선택
train_rate = train.drop(columns=['ver_win_ratio_per_bu', 'com_reg_ver_win_rate'])

In [5]:
#결측을 기준으로 train test 분리
test = train_rate[train_rate['ver_win_rate_x'].isnull()]
train = train_rate[~train_rate['ver_win_rate_x'].isnull()]

In [6]:
train.shape, test.shape

((18417, 38), (40882, 38))

In [7]:
train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)


In [8]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [9]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_rate_x', axis=1)
train_y = train['ver_win_rate_x']

test_x = test.drop(columns='ver_win_rate_x', axis=1)   

In [10]:
X_train, X_valid, y_train, y_valid = tts(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [11]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1746
[LightGBM] [Info] Number of data points in the train set: 14733, number of used features: 37
[LightGBM] [Info] Start training from score -2.899577
[LightGBM] [Info] Start training from score -2.316459
[LightGBM] [Info] Start training from score -2.253126
[LightGBM] [Info] Start training from score -2.366558
[LightGBM] [Info] Start training from score -2.210755
[LightGBM] [Info] Start training from score -1.814205
[LightGBM] [Info] Start training from score -1.504994
[LightGBM] [Info] Start training from score -4.084416
[LightGBM] [Info] Start training from score -5.098035
[LightGBM] [Info] Start training from score -3.567160
[LightGBM] [Info] Start training from score -2.723647
[LightGBM] [Info] Start training from score -3.323083

1.0

In [None]:
#train_rate['ver_win_rate_x'].info()

In [12]:
test_pred = model.predict(test_x)
test_pred= label_encoders['ver_win_rate_x'].inverse_transform(test_pred)
test_pred = [float(x) for x in test_pred]  #type(test_pred[0])

In [13]:
train_rate.loc[test.index, 'ver_win_rate_x'] = test_pred        #예측벡터 입력

In [None]:
#train_rate['ver_win_rate_x'].info()

In [14]:
train_rate.to_csv('train_rate_x.csv', index=False) 

## 2. test data의 ver_win_rate_x 분류

In [15]:
train = pd.read_csv('train_rate_x.csv')
test = pd.read_csv('test_preprocessed.csv')

#예측에 사용할 변수 선택
train_rate = train.drop(columns=["is_converted"])
test_org = test.drop(columns=['ver_win_ratio_per_bu', 'com_reg_ver_win_rate',"is_converted"])

In [16]:
train_rate.shape, test_org.shape

((59299, 37), (5271, 37))

In [17]:
#결측을 기준으로 train test 분리
train = train_rate
test = test_org[test_org['ver_win_rate_x'].isnull()]       #주의!

In [18]:
train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #분류일때 실행
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)


In [19]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [20]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_rate_x', axis=1)
train_y = train['ver_win_rate_x']

test_x = test.drop(columns='ver_win_rate_x', axis=1)   

X_train, X_valid, y_train, y_valid = tts(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [21]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1929
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 36
[LightGBM] [Info] Start training from score -4.038571
[LightGBM] [Info] Start training from score -3.489261
[LightGBM] [Info] Start training from score -3.431566
[LightGBM] [Info] Start training from score -3.537361
[LightGBM] [Info] Start training from score -3.373937
[LightGBM] [Info] Start training from score -2.993185
[LightGBM] [Info] Start training from score -2.678024
[LightGBM] [Info] Start training from score -0.363119
[LightGBM] [Info] Start training from score -6.245411
[LightGBM] [Info] Start training from score -4.770748
[LightGBM] [Info] Start training from score -3.904442
[LightGBM] [Info] Start training from score -4.484933

1.0

In [22]:
#test_org['ver_win_rate_x'].info()

In [23]:
test_pred = model.predict(test_x)
test_pred= label_encoders['ver_win_rate_x'].inverse_transform(test_pred)
test_pred = [float(x) for x in test_pred]  #type(test_pred[0])

test_org.loc[test.index, 'ver_win_rate_x'] = test_pred        #예측벡터 입력

In [24]:
#test_org['ver_win_rate_x'].info()

In [25]:
test_org.to_csv('test_rate_x.csv', index=False) 

## 1. train data의 ver_win_ratio_per_bu 분류

In [26]:
train_bu = pd.read_csv('train_rate_x.csv')   
a = pd.read_csv('train.csv')
train_bu["ver_win_ratio_per_bu"]=a["ver_win_ratio_per_bu"]    #ver_win_ratio_per_bu복구하기

In [27]:
#결측을 기준으로 train test 분리
test = train_bu[train_bu['ver_win_ratio_per_bu'].isnull()]
train = train_bu[~train_bu['ver_win_ratio_per_bu'].isnull()]
train.shape, test.shape

((15304, 39), (43995, 39))

In [28]:
train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)


In [29]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [30]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_ratio_per_bu', axis=1)
train_y = train['ver_win_ratio_per_bu']

test_x = test.drop(columns='ver_win_ratio_per_bu', axis=1)  

X_train, X_valid, y_train, y_valid = tts(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [31]:
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_valid)
f1_score(y_valid, pred, average='macro')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 12243, number of used features: 37
[LightGBM] [Info] Start training from score -2.948121
[LightGBM] [Info] Start training from score -2.429847
[LightGBM] [Info] Start training from score -3.067073
[LightGBM] [Info] Start training from score -4.449865
[LightGBM] [Info] Start training from score -2.515005
[LightGBM] [Info] Start training from score -4.362854
[LightGBM] [Info] Start training from score -4.046734
[LightGBM] [Info] Start training from score -6.193834
[LightGBM] [Info] Start training from score -3.679368
[LightGBM] [Info] Start training from score -3.586710
[LightGBM] [Info] Start training from score -2.430775
[LightGBM] [Info] Start training from score -2.001153

1.0

In [32]:
#train_bu['ver_win_ratio_per_bu'].info()

In [33]:
test_pred = model.predict(test_x)
test_pred= label_encoders['ver_win_ratio_per_bu'].inverse_transform(test_pred)
test_pred = [float(x) for x in test_pred]  #type(test_pred[0])

train_bu.loc[test.index, 'ver_win_ratio_per_bu'] = test_pred        #예측벡터 입력

In [34]:
#train_bu['ver_win_ratio_per_bu'].info()

In [35]:
train_bu.to_csv('train_bu.csv', index=False) 

## 2. test data의 ver_win_ratio_per_bu 분류

In [36]:
train_bu = pd.read_csv('train_bu.csv')
test_bu = pd.read_csv('test_rate_x.csv')  

In [37]:
b = pd.read_csv('submission.csv')
test_bu["ver_win_ratio_per_bu"]=b["ver_win_ratio_per_bu"]    #ver_win_ratio_per_bu복구 

#예측에 사용할 변수 선택
train_bu = train_bu.drop(columns=["is_converted"])
test_bu = test_bu

In [38]:
#결측을 기준으로 train test 분리
train = train_bu
test = test_bu[test_bu['ver_win_ratio_per_bu'].isnull()]       #주의!

In [39]:
train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  #분류일때 실행
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)


In [40]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [41]:
def oversample_train_data(train):                                                    #오버샘플링 (4가지 중 한개) 비추천!
    X = train.drop('ver_win_ratio_per_bu', axis=1)
    y = train['ver_win_ratio_per_bu']

    # 오버샘플링 방법 선택
    # smote = SMOTE()
    # X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # adasyn = ADASYN()
    # X_resampled, y_resampled = adasyn.fit_resample(X, y)

    # borderline_smote = BorderlineSMOTE()
    # X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
    
    ros = RandomOverSampler()
    X_resampled, y_resampled = ros.fit_resample(X, y)

    train_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    train_resampled['ver_win_ratio_per_bu'] = y_resampled

    return train_resampled


train = oversample_train_data(train)

In [42]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='ver_win_ratio_per_bu', axis=1)
train_y = train['ver_win_ratio_per_bu']

test_x = test.drop(columns='ver_win_ratio_per_bu', axis=1)   

train_x, valid_x, train_y, valid_y = tts(train_x, train_y, test_size=0.2, random_state=24, shuffle=True
                                                      #stratify=train_y
                                                     )  

In [44]:
lgbm_clf = LGBMClassifier(n_estimators=500, objective='multiclass',n_jobs= -1,random_state=24)               # macro f1 
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)
# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_clf.fit(train_x, train_y, callbacks=[early_stopping_callback], eval_metric="multi_logloss", eval_set=eval_set)

# F1 Score 계산
f1 = f1_score(valid_y, lgbm_clf.predict(valid_x), average='macro')
print('Macro F1 Score: {0:.4f}'.format(f1))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1937
[LightGBM] [Info] Number of data points in the train set: 645306, number of used features: 37
[LightGBM] [Info] Start training from score -3.135344
[LightGBM] [Info] Start training from score -3.134489
[LightGBM] [Info] Start training from score -3.128065
[LightGBM] [Info] Start training from score -3.142497
[LightGBM] [Info] Start training from score -3.135522
[LightGBM] [Info] Start training from score -3.127040
[LightGBM] [Info] Start training from score -3.135843
[LightGBM] [Info] Start training from score -3.136735
[LightGBM] [Info] Start training from score -3.136592
[LightGBM] [Info] Start training from score -3.135843
[LightGBM] [Info] Start training from score -3.131822
[LightGBM] [Info] Start training from score -3.13887

In [None]:
#test_bu['ver_win_ratio_per_bu'].info()

In [45]:
test_pred = lgbm_clf.predict(test_x)
test_pred= label_encoders['ver_win_ratio_per_bu'].inverse_transform(test_pred)
test_pred = [float(x) for x in test_pred]  #type(test_pred[0])

test_bu.loc[test.index, 'ver_win_ratio_per_bu'] = test_pred        #예측벡터 입력

In [None]:
#test_bu['ver_win_ratio_per_bu'].info()

In [46]:
test_bu.to_csv('test_bu.csv', index=False) 

## 1. train data의 com_reg_ver_win_rate 회귀

In [47]:
train_rate2 = pd.read_csv('train_bu.csv')
c = pd.read_csv('train.csv')
train_rate2["com_reg_ver_win_rate"]=c["com_reg_ver_win_rate"]    #com_reg_ver_win_rate복구하기

In [48]:
#결측을 기준으로 train test 분리
test = train_rate2[train_rate2['com_reg_ver_win_rate'].isnull()]
train = train_rate2[~train_rate2['com_reg_ver_win_rate'].isnull()]
train.shape, test.shape

((14568, 40), (44731, 40))

In [49]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str).apply(lambda x: x if x in train_unique_labels else 'other')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = le.transform(train[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be s

In [50]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='com_reg_ver_win_rate', axis=1)
train_y = train['com_reg_ver_win_rate']

test_x = test.drop(columns='com_reg_ver_win_rate', axis=1)  

train_x, valid_x, train_y, valid_y = tts(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [55]:
lgbm_regressor = LGBMRegressor(n_estimators=500, objective='regression', n_jobs=-1,random_state=24)  # 회귀 모델 설정
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_regressor.fit(train_x, train_y, callbacks=[early_stopping_callback], eval_metric="rmse", eval_set=eval_set)

# RMSE 계산
preds = lgbm_regressor.predict(valid_x)
mse = mean_squared_error(valid_y, preds)
rmse = np.sqrt(mse)  
print('RMSE: {0:.4f}'.format(rmse))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1687
[LightGBM] [Info] Number of data points in the train set: 11654, number of used features: 39
[LightGBM] [Info] Start training from score 0.091061
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.00427199	training's l2: 1.82499e-05	valid_1's rmse: 0.0208495	valid_1's l2: 0.000434703
RMSE: 0.0208


In [56]:
test_pred = lgbm_regressor.predict(test_x)
train_rate2.loc[test.index, 'com_reg_ver_win_rate'] = test_pred        #예측벡터 입력

In [57]:
train_rate2.to_csv('train_rate2.csv', index=False) 

## 2. test data의 com_reg_ver_win_rate 회귀

In [58]:
train_rate2 = pd.read_csv('train_rate2.csv')
test_rate2 = pd.read_csv('test_bu.csv')  

In [59]:
d = pd.read_csv('submission.csv')
test_rate2["com_reg_ver_win_rate"]=d["com_reg_ver_win_rate"]    #com_reg_ver_win_rate 복구 

#예측에 사용할 변수 선택
train_rate2 = train_rate2.drop(columns=["is_converted"])
test_rate2 = test_rate2

In [60]:
#결측을 기준으로 train test 분리
train = train_rate2
test = test_rate2[test_rate2['com_reg_ver_win_rate'].isnull()]       #주의!

In [61]:
#인코딩 
train, test, label_encoders = encode_categorical_variables(train, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = le.transform(test[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [62]:
#목적 변수에 따라 X, Y 분리 
train_x = train.drop(columns='com_reg_ver_win_rate', axis=1)
train_y = train['com_reg_ver_win_rate']

test_x = test.drop(columns='com_reg_ver_win_rate', axis=1)  

train_x, valid_x, train_y, valid_y = tts(train_x, train_y, test_size=0.2, random_state=42, shuffle=True,
                                                      #stratify=train_y
                                                     )  

In [64]:
lgbm_regressor = LGBMRegressor(n_estimators=500, objective='regression', n_jobs=-1,random_state=24)  # 회귀 모델 설정
early_stopping_callback = lgb.early_stopping(stopping_rounds=50)

# 모델 학습
eval_set = [(train_x, train_y), (valid_x, valid_y)]
lgbm_regressor.fit(train_x, train_y, callbacks=[early_stopping_callback], eval_metric="rmse", eval_set=eval_set)

# RMSE 계산
preds = lgbm_regressor.predict(valid_x)
mse = mean_squared_error(valid_y, preds)
rmse = np.sqrt(mse)  
print('RMSE: {0:.4f}'.format(rmse))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1966
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 38
[LightGBM] [Info] Start training from score 0.115437
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.00478254	training's l2: 2.28727e-05	valid_1's rmse: 0.00761541	valid_1's l2: 5.79945e-05
RMSE: 0.0076


[495]	training's rmse: 0.0048886	training's l2: 2.38984e-05	valid_1's rmse: 0.00815684	valid_1's l2: 6.6534e-05
[496]	training's rmse: 0.00488523	training's l2: 2.38655e-05	valid_1's rmse: 0.00815573	valid_1's l2: 6.65159e-05
[497]	training's rmse: 0.00487993	training's l2: 2.38137e-05	valid_1's rmse: 0.00815261	valid_1's l2: 6.6465e-05
[498]	training's rmse: 0.00487599	training's l2: 2.37753e-05	valid_1's rmse: 0.00815326	valid_1's l2: 6.64756e-05
[499]	training's rmse: 0.00487194	training's l2: 2.37358e-05	valid_1's rmse: 0.00815061	valid_1's l2: 6.64324e-05
[500]	training's rmse: 0.00486899	training's l2: 2.37071e-05	valid_1's rmse: 0.00815085	valid_1's l2: 6.64363e-05
RMSE: 0.0082


In [65]:
test_pred = lgbm_regressor.predict(test_x)
test_rate2.loc[test.index, 'com_reg_ver_win_rate'] = test_pred        #예측벡터 입력

In [66]:
test_rate2.to_csv('test_rate2.csv', index=False) 

In [67]:
train = pd.read_csv('train_rate2.csv')
test = pd.read_csv('test_rate2.csv')

e = pd.read_csv('submission.csv')
test["is_converted"]=e["is_converted"]           #drop 했던 is_converted 복구

train['ver_win_rate_x'] = train['ver_win_rate_x'].astype(str)  #이 두 변수는 범주형으로 취금하기위해 object로 변환
test['ver_win_rate_x'] = test['ver_win_rate_x'].astype(str)

train['ver_win_ratio_per_bu'] = train['ver_win_ratio_per_bu'].astype(str)  
test['ver_win_ratio_per_bu'] = test['ver_win_ratio_per_bu'].astype(str)

In [68]:
train.shape, test.shape

((59299, 40), (5271, 40))

# 특정 범주 파생변수 생성

In [69]:
base= ['inquiry_type','inquiry_type','inquiry_type','inquiry_type',
       'bant_submit',   
       'product_category', 'product_category', 'product_category', 'product_category',
       'product_category','product_category','product_category',
       'expected_timeline',     
       'customer_continent',    
       'customer_country', 'customer_country',
       'customer_idx', 'customer_idx', 'customer_idx', 'customer_idx',
       'customer_idx', 'customer_idx', 'customer_idx', 'customer_idx',
       'customer_idx', 'customer_idx', 'customer_idx',
       'customer_position', 'customer_position', 'customer_position',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner', 'lead_owner', 'lead_owner',
       'lead_owner', 'lead_owner'
      ]

new= [ 'is_usage','is_product_info','is_sercvices','is_demo',
       'is_bant_0.5',   
       'ver_video', 'ver_signage', 'ver_led','ver_hotel_tv',
       'ver_inter_board','ver_one_quick','ver_sinage',
       'is_time_year',     
       'is_Asia',  
       'is_hongkong', 'is_us',
       'is_47466', 'is_37680', 'is_18030', 'is_21321',
       'is_33350', 'is_25309', 'is_32240', 'is_31864',
       'is_19804', 'is_40344', 'is_7195',
       'is_intern', 'is_entry_level', 'is_trainee',
       'isow_97', 'isow_437', 'isow_487', 'isow_831',
       'isow_480', 'isow_375', 'isow_589', 'isow_166',
       'isow_279', 'isow_833', 'isow_4', 'isow_570',
       'isow_147', 'isow_148'
      ]

category= ['usage or technical consultation','Product Information','Services','Request a Demo',
        0.5,   
       'video wall signage', 'interactive signage', 'led signage','hotel tv',
       'interactive digital board', 'one:quick series','sinage', 
       'more than a year',
       'Asia',
       'Hong Kong', 'United States',
        47466, 37680, 18030, 21321,
        33350, 25309, 32240, 31864,
        19804, 40344, 7195,
       'intern', 'entry level',"trainee",
        97, 437, 487, 831,
        480, 375, 589, 166,
        279, 833, 4, 570,
        147, 148
      ]

def add_multiple_category_flags(train, test, base_variables, new_variables, category_names):
    
    if not (len(base_variables) == len(new_variables) == len(category_names)):
        raise ValueError("The length of base_variables, new_variables, and category_names must be the same.")
    for base_var, new_var, cat_name in zip(base_variables, new_variables, category_names):
        train[new_var] = (train[base_var] == cat_name).astype('int8')
        test[new_var] = (test[base_var] == cat_name).astype('int8')
    return train, test

train, test = add_multiple_category_flags(train, test, base, new ,category)
# sums = train[new].sum()    #파생변수가 유의미한지 확인
# sums

In [70]:
train.shape, test.shape

((59299, 84), (5271, 84))

# business_area 결측치 채우기

In [71]:
train_org = train
test_org = test

In [72]:
a = pd.read_csv('train.csv')
b = pd.read_csv('submission.csv')
train_org["business_area"]=a["business_area"] 
test_org["business_area"]=b["business_area"] 

train_org['business_area'].fillna('transportation', inplace=True)
test_org['business_area'].fillna('transportation', inplace=True)

train=train_org
test=test_org

In [73]:
train.shape, test.shape

((59299, 84), (5271, 84))

# 학습코드

In [74]:
def evaluate_model_performance(model, X_test, y_test):                #평가함수 정의
    pred = model.predict(X_test)
    # pred 객체의 dtype이 object라면 bool type으로 변경
    if pred.dtype == object:
        # "True" 문자열을 True로, "False" 문자열을 False로 변환
        pred = np.where(pred == 'True', True, False)

    pred_proba = model.predict_proba(X_test)[:, 1]

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, zero_division=0)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred_proba)

    metrics = {'recall': recall, 'precision' : precision, 'accuracy': accuracy,'f1': f1, 'auc': roc_auc}
    return metrics

In [75]:
def encode_categorical_variables(train, test):                                           #인코딩
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
    
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        le = le.fit(train[col])
        train[col] = le.transform(train[col])
        
        for label in np.unique(test[col]):
            if label not in le.classes_:
                le.classes_ = np.append(le.classes_, label)
        test[col] = le.transform(test[col])
    
    return train, test

train, test = encode_categorical_variables(train, test)

def encode_with_other_category(train, test, column_name):
    le = LabelEncoder()
    train[column_name] = train[column_name].astype(str)  # int를 str로 변환
    le.fit(train[column_name])
    
    # 테스트 데이터에 대해 'other' 카테고리 처리
    # train에 없는 라벨을 기준으로 한다.
    test[column_name] = test[column_name].astype(str)  # int를 str로 변환
    test[column_name] = test[column_name].apply(lambda x: x if x in le.classes_ else 'other')
    
    # 'other' 라벨 추가
    le_classes = np.append(le.classes_, 'other')
    le.classes_ = le_classes
    
    # 라벨 인코딩 적용
    train[column_name] = le.transform(train[column_name])
    test[column_name] = le.transform(test[column_name])
    
    return train, test

train, test = encode_with_other_category(train, test, "customer_idx")
train, test = encode_with_other_category(train, test, "lead_owner")

train.shape, test.shape

((59299, 84), (5271, 84))

In [76]:
def drop_columns_and_return(train, test, cols):                                #특정 변수 drop
    train = train.drop(cols, axis=1)
    test = test.drop(cols, axis=1)
    return train, test

cols= ['customer_idx_converted_rate']

train, test= drop_columns_and_return(train, test, cols)
train.shape, test.shape

((59299, 83), (5271, 83))

In [77]:
train = train[train['historical_existing_cnt'] < 1000].reset_index(drop=True)        #이상치 제거 

In [78]:
train.loc[train['com_reg_ver_win_rate']<0, 'com_reg_ver_win_rate'] = 0                #비율변수 음수제거 
test.loc[test['com_reg_ver_win_rate']<0, 'com_reg_ver_win_rate'] = 0

## valid set 분리

In [79]:
TARGET='is_converted'                      #타겟변수 설정

x_test = test.drop([TARGET], axis=1)      

X = train.drop([TARGET], axis=1)        
y = train[TARGET]
train_x, val_x, train_y, val_y = tts(X, y, test_size=0.2,  shuffle=True, stratify=y, random_state = 1323)          #독립적인 valid set 분리

df = pd.concat([train_x, train_y], axis=1)
X = df.drop(columns=[TARGET])                  #튜닝에 사용할 데이터 분리
y = df[TARGET]

## 하이퍼파라미터 딕셔너리

In [80]:
best_lgbm = {'bagging_fraction': 0.22856079462337447, 'feature_fraction': 0.4685080392510799, 'lambda_l1': 0.08646922336364364, 'lambda_l2': 0.6791678142223252, 'learning_rate': 0.0635789922898151, 'max_bin': 497.0083792968057, 'min_child_weight': 30.46339220546423, 'min_data_in_leaf': 133.2338884253692, 'num_leaves': 53.606003580706314}

In [81]:
best_xgb = {'colsample_bytree': 0.6289435661861347, 'gamma': 1.316515005466068, 'learning_rate': 0.028887347188029905, 'max_depth': 12.879601798124325, 'min_child_weight': 9.7769937118953, 'reg_alpha': 8.089313835645845, 'reg_lambda': 0.6088517056470933, 'subsample': 0.7261564417044092}

In [82]:
best_cat = {'bagging_temperature': 0.5842169744343242, 'colsample_bylevel': 0.3112738802618131, 'depth': 11.176354970758771, 'iterations': 3030.2393068060537, 'leaf_estimation_iterations': 10.318537978123299, 'learning_rate': 0.09662488471087746, 'min_data_in_leaf': 20.22730448655225, 'od_wait': 202.68681713197077, 'random_strength': 3.4962556309576995, 'reg_lambda': 20.928935878120313, 'subsample': 0.2421578301404889}

## voting 모델 정의

In [83]:
lgbm = LGBMClassifier(    
    
    n_jobs= -1,                  #cpu 코어수
    n_estimators=800, 
    learning_rate=round(best_lgbm['learning_rate'], 5),
    #max_depth=int(best_lgbm['max_depth']),
    num_leaves=int(best_lgbm['num_leaves']),
    colsample_bytree = round(best_lgbm['feature_fraction'], 5),
    subsample=round(best_lgbm['bagging_fraction'], 5),
    max_bin=int(best_lgbm['max_bin']),
    reg_alpha=round(best_lgbm['lambda_l1'], 5),
    reg_lambda=round(best_lgbm['lambda_l2'], 5),
    min_child_weight= int(best_lgbm['min_child_weight']), 
    min_child_samples=int(best_lgbm['min_data_in_leaf']), 
    #class_weight= 'balanced',  #라벨 불균형 고려
    
    class_weight= {0: 1, 1: 2},
    verbose=-1,                #학습정보 미출력                          
    objective='binary',
    random_state=48,
) 
                          
xgb =XGBClassifier(
    
    n_estimators=500,
    learning_rate=round(best_xgb['learning_rate'], 5),
    max_depth=int(best_xgb['max_depth']),
    min_child_weight=int(best_xgb['min_child_weight']),
    gamma=best_xgb['gamma'],
    subsample=round(best_xgb['subsample'], 5),
    colsample_bytree=round(best_xgb['colsample_bytree'], 5),
    reg_alpha=round(best_xgb['reg_alpha'], 5),
    reg_lambda=round(best_xgb['reg_lambda'], 5),
    
    #scale_pos_weight=round(best_xgb['scale_pos_weight'], 5),
    scale_pos_weight=2,
    
    use_label_encoder=False,
    objective='binary:logistic',
    eval_metric="logloss",
    #early_stopping_rounds=100,
    random_state=47,
)

cbc = CatBoostClassifier(
    
    iterations=int(best_cat['iterations']),
    learning_rate=best_cat['learning_rate'],
    depth=int(best_cat['depth']),
    l2_leaf_reg=best_cat['reg_lambda'],
    bagging_temperature=best_cat['bagging_temperature'],
    random_strength=best_cat['random_strength'],
    min_data_in_leaf=int(best_cat['min_data_in_leaf']),
    leaf_estimation_iterations=int(best_cat['leaf_estimation_iterations']),
    subsample=best_cat['subsample'],
    colsample_bylevel=best_cat['colsample_bylevel'],
    od_wait=int(best_cat['od_wait']),
    eval_metric="Logloss",
    
    #scale_pos_weight=best_cat['scale_pos_weight'],
    class_weights = [1, 2.5],
    
    verbose=100,
    #early_stopping_rounds=50,    #od_wait랑 동시에 사용불가
    random_seed=42,  # 필요한 경우 주석 해제
    task_type='CPU'  # 'GPU'로 설정 가능, GPU 사용 가능 환경에서
)



# rf = RandomForestClassifier(                                        #class_weight= {0: 1, 1: 2}  쓸지말지 결정
    
#     n_estimators=int(best_rf['n_estimators']),
#     max_depth=int(best_rf['max_depth']),
#     min_samples_split=int(best_rf['min_samples_split']),
#     min_samples_leaf=int(best_rf['min_samples_leaf']),
#     max_features=best_rf['max_features'],
#     max_samples=best_rf['max_samples'],
#     random_state=42,  # Uncomment this if you want reproducible results
#     n_jobs=-1,  # Use all CPU cores
#     class_weight= {0: 1, 1: 7}
# )

# # lr = LogisticRegression(
# #         C=0.41994536370093083, 
# #         penalty='l2', 
# #         l1_ratio=0.7209270556562788,
# #         max_iter=1000,
# #         solver='saga',
# #         random_state=SEED)


# # bagging= BaggingClassifier(
# #     n_estimators=int(best_bagging['n_estimators']),
# #     max_samples=best_bagging['max_samples'],
# #     max_features=best_bagging['max_features'],
# #     random_state=int(best_bagging['random_state']),
# #     n_jobs=-1,
    
# #     bootstrap= True,
# #     oob_score= True,                        #붓스트랩이 true일때만 적용가능  
# #     bootstrap_features= False,
# # )

# bagging = BaggingClassifier(random_state=SEED)
# models = [lgbm, xgb, cbc, rf, bagging]

## fit

In [84]:
estimators = [('lgbm', lgbm), ('xgb', xgb), ('cbc', cbc)]
best_model = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1]) 
best_model.fit(X, y)

0:	learn: 0.5872382	total: 98.3ms	remaining: 4m 57s
100:	learn: 0.0898590	total: 2.67s	remaining: 1m 17s
200:	learn: 0.0564486	total: 5.14s	remaining: 1m 12s
300:	learn: 0.0420784	total: 7.68s	remaining: 1m 9s
400:	learn: 0.0333365	total: 10.3s	remaining: 1m 7s
500:	learn: 0.0269603	total: 12.8s	remaining: 1m 4s
600:	learn: 0.0227485	total: 15.4s	remaining: 1m 2s
700:	learn: 0.0195975	total: 17.9s	remaining: 59.5s
800:	learn: 0.0172774	total: 20.5s	remaining: 57s
900:	learn: 0.0155328	total: 23.1s	remaining: 54.5s
1000:	learn: 0.0139654	total: 25.6s	remaining: 52s
1100:	learn: 0.0128023	total: 28.2s	remaining: 49.4s
1200:	learn: 0.0119452	total: 30.7s	remaining: 46.8s
1300:	learn: 0.0110750	total: 33.3s	remaining: 44.3s
1400:	learn: 0.0102027	total: 35.9s	remaining: 41.7s
1500:	learn: 0.0095597	total: 38.4s	remaining: 39.1s
1600:	learn: 0.0090532	total: 40.9s	remaining: 36.5s
1700:	learn: 0.0086178	total: 43.4s	remaining: 33.9s
1800:	learn: 0.0081974	total: 45.9s	remaining: 31.3s
1900:

In [85]:
evaluation_results = evaluate_model_performance(best_model, val_x, val_y)    #최종 valid f1
print(evaluation_results)

{'recall': 0.8824742268041237, 'precision': 0.8806584362139918, 'accuracy': 0.9806070826306914, 'f1': 0.8815653964984551, 'auc': 0.9936049813978587}


In [86]:
# 제출용 prediction
best_threshold = 0.058
pred_probs = best_model.predict_proba(x_test)[:, 1]
final_preds = pred_probs >= best_threshold
print(sum(final_preds) / len(final_preds)) 

0.33428191993929046


# 제출파일 생성

In [87]:
# True, False로 수정
def convert_to_boolean_vector(input_vector):
    return np.array(input_vector == 1, dtype=bool)          
    
result_vector = convert_to_boolean_vector(final_preds)

In [None]:
submission = pd.read_csv('submission.csv')
submission['is_converted'] = result_vector
submission.to_csv('submission.csv')

## Thank you