In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
import warnings
import h5py
warnings.filterwarnings(action='ignore')
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
# Competition metadata tables; both CSV files sit in the same Kaggle input folder.
DATA_DIR='/kaggle/input/isic-2024-challenge'
train=pd.read_csv(f'{DATA_DIR}/train-metadata.csv')
test=pd.read_csv(f'{DATA_DIR}/test-metadata.csv')

In [3]:
# Every column available in the test metadata, followed by the label column.
main_features=[*test.columns.to_list(),'target']

In [4]:
# Partition main_features by whether the training column parses as numeric:
# columns that pd.to_numeric accepts go to num_col, the rest to cat_col.
cat_col=[]
num_col=[]
for i in main_features:
    try:
        pd.to_numeric(train[i])
    except (ValueError, TypeError):
        # Only conversion failures mark a column as categorical; anything else
        # (a missing column, a keyboard interrupt, ...) is allowed to surface.
        cat_col.append(i)
    else:
        num_col.append(i)

In [5]:
from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest,VotingClassifier,StackingClassifier
from sklearn.cluster import DBSCAN,KMeans
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.feature_selection import chi2,f_classif,mutual_info_classif,SelectKBest,SequentialFeatureSelector,VarianceThreshold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,Normalizer,StandardScaler,QuantileTransformer,PolynomialFeatures,OrdinalEncoder,PowerTransformer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import confusion_matrix

In [6]:
# Features are every training column except the label; the label is 'target'.
X=train.drop(columns='target')
y=train['target']

# Stratified 80/20 hold-out split with a fixed seed.
X_train,X_valid,y_train,y_valid=train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

In [7]:
class custom_col_drop(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps and/or removes DataFrame columns by label.

    Parameters
    ----------
    drop_list : column label(s) or None
        Passed to ``X.drop(..., axis=1)`` after any selection. Skipped when
        None or empty.
    select_list : column label(s) or None
        Used as ``X[select_list]`` before any drop. Skipped when None or empty.

    Both arguments may be lists, pandas Index objects or NumPy arrays.
    """

    def __init__(self, drop_list=None, select_list=None):
        # Stored unmodified so that get_params()/clone() round-trip the values.
        self.drop_list = drop_list
        self.select_list = select_list

    @staticmethod
    def _is_given(labels):
        """Return True when ``labels`` holds at least one column label.

        A plain truthiness test raises ValueError for a pandas Index or NumPy
        array with more than one element, so sized containers are checked by
        length and everything else falls back to ``bool``.
        """
        if labels is None:
            return False
        try:
            return len(labels) > 0
        except TypeError:
            # Unsized value (e.g. a scalar label): keep the truthiness semantics.
            return bool(labels)

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``select_list`` and without ``drop_list``."""
        if self._is_given(self.select_list):
            X = X[self.select_list]
        if self._is_given(self.drop_list):
            X = X.drop(self.drop_list, axis=1)
        return X

In [8]:
# Column names of the test metadata, as a plain list.
features=list(test.columns)

In [9]:
# One seed shared by the three parameter sets below.
seed=42

# Keyword settings for the LightGBM classifier.
lgb_params=dict(
    objective='binary',
    verbosity=-1,
    n_iter=200,
    boosting_type='gbdt',
    random_state=seed,
    lambda_l1=0.08758718919397321,
    lambda_l2=0.0039689175176025465,
    learning_rate=0.03231007103195577,
    max_depth=4,
    num_leaves=103,
    colsample_bytree=0.8329551585827726,
    colsample_bynode=0.4025961355653304,
    bagging_fraction=0.7738954452473223,
    bagging_freq=4,
    min_data_in_leaf=85,
    scale_pos_weight=2.7984184778875543,
)

# Keyword settings for the CatBoost classifier.
cb_params=dict(
    loss_function='Logloss',
    iterations=250,
    verbose=False,
    random_state=seed,
    max_depth=7,
    learning_rate=0.06936242010150652,
    scale_pos_weight=2.6149345838209532,
    l2_leaf_reg=6.216113851699493,
    # subsample=0.6249261779711819,   (disabled)
    min_data_in_leaf=24,
    # cat_features=cat_cols,          (disabled)
    task_type='GPU',
)

# Keyword settings for the XGBoost classifier. 'lambda' is a reserved word in
# Python, so this set is written as a dict literal rather than dict(...).
xgb_params={
    # 'enable_categorical': True,     (disabled)
    'tree_method': 'hist',
    'random_state': seed,
    'learning_rate': 0.08501257473292347,
    'lambda': 8.879624125465703,
    'alpha': 0.6779926606782505,
    'max_depth': 6,
    'subsample': 0.6012681388711075,
    'colsample_bytree': 0.8437772277074493,
    'colsample_bylevel': 0.5476090898823716,
    'colsample_bynode': 0.9928601203635129,
    'scale_pos_weight': 3.29440313334688,
}

In [10]:
def custom_metric(estimator, X, y_true):
    """Partial area under the ROC curve restricted to TPR >= 0.80.

    Has the ``(estimator, X, y)`` scorer signature used by scikit-learn.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``
        Column 1 of its output is used as the positive-class score.
    X : feature data accepted by ``estimator.predict_proba``.
    y_true : array-like of 0/1 labels (list, ndarray or Series).

    Returns
    -------
    float
        Un-normalised partial AUC; with ``min_tpr = 0.80`` the largest
        attainable value is 0.2.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    y_hat = np.asarray(estimator.predict_proba(X))[:, 1]

    # Flip labels and scores: the region "TPR >= min_tpr" of the original
    # problem becomes "FPR <= max_fpr" of the flipped one, which is the region
    # roc_auc_score can be restricted to through its max_fpr argument.
    v_gt = np.abs(np.asarray(y_true) - 1)
    v_pred = 1.0 - y_hat

    # With max_fpr set, roc_auc_score returns a standardised score; the line
    # below maps it back to the raw area under the restricted curve.
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [11]:
# End-to-end model: column selection -> scaling/encoding -> imputation ->
# extra PCA and KMeans features -> resampling -> soft-voting ensemble.
# `Pipeline` is the imbalanced-learn one, which is what allows sampler steps.
model=Pipeline([
    # Keep the columns listed in `features`, then remove the three listed ones.
    ('col_drop',custom_col_drop(drop_list=['isic_id','patient_id','attribution'],select_list=features)),
    ('col_trans',ColumnTransformer([
        ('numerical_col',Pipeline([
            ('standard_scale',StandardScaler()),
            ('impute',SimpleImputer(strategy='most_frequent')),
        ]),
         make_column_selector(dtype_include=['int64','float64'])),
        # Categories that were not seen during fit are encoded as NaN instead of
        # raising at predict time; the imputer step that follows the
        # ColumnTransformer then fills them like any other missing value.
        ('cat_col_ord',
         OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan),
         make_column_selector(dtype_include=['object'])),
    ],remainder='drop')),
    ('impute',SimpleImputer(strategy='most_frequent')),

    # Augment the preprocessed matrix with 10 principal components and the
    # KMeans transform output (distances to the 30 cluster centres).
    ('feature_union', FeatureUnion([
        ('original_features', 'passthrough'),
        ('pca', Pipeline([
            ('pca_transform', PCA(n_components=10,random_state=42)),
        ])),
        ('kmeans', Pipeline([
            ('kmeans_transform', KMeans(n_clusters=30, random_state=42)),
            ('kmeans_labels', 'passthrough'),
        ])),
    ])),

    # Rebalance the classes: random over-sampling first, then random under-sampling.
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=42)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=0.01, random_state=42)),

    # Weighted average of the three boosters' predicted probabilities.
    ('voting',VotingClassifier([
        ('catboost',CatBoostClassifier(**cb_params)),
        ('lgbm',LGBMClassifier(**lgb_params)),
        ('xgb',XGBClassifier(**xgb_params))

    ],
    voting='soft',
    weights=[0.47,0.40,0.28]
    )),

    # Alternative final step (disabled): stacking over the same base learners.
#     ('stacking',StackingClassifier([
#         ('catboost',CatBoostClassifier(**cb_params)),
#         ('lgbm',LGBMClassifier(**lgb_params)),
#         ('xgb',XGBClassifier(**xgb_params))

#     ],
#         stack_method='predict_proba',
#     final_estimator=CatBoostClassifier(**cb_params),
#     ))
])

In [12]:
# model.fit(X_train,y_train)

In [13]:
# Final fit on the complete training data (X, y), which includes the rows that
# were split off as X_valid / y_valid.
model.fit(X,y)

In [14]:
# custom_metric(model,X_valid,y_valid)

In [15]:
# Score the test metadata and keep the second probability column
# (presumably the target == 1 class; confirm against model.classes_).
test_proba=model.predict_proba(test)
pred=test_proba[:,1]

In [16]:
# Submission frame starts as the id column. The explicit copy makes it
# independent of `test`, so adding a column to it later is not a chained
# assignment on a slice.
sub=test[['isic_id']].copy()

In [17]:
# Attach the predicted probabilities as the 'target' column of the submission frame.
sub['target']=pred

In [18]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf='submission.csv',index=False)