In [2]:
!pip install --user iterative-stratification



In [4]:
!pip install --upgrade pandas "dask[complete]"

Collecting dask[complete]
  Downloading dask-2023.7.0-py3-none-any.whl (1.2 MB)
                                              0.0/1.2 MB ? eta -:--:--
     --------------                           0.4/1.2 MB 13.9 MB/s eta 0:00:01
     --------------------------------         1.0/1.2 MB 10.0 MB/s eta 0:00:01
     ---------------------------------------- 1.2/1.2 MB 9.4 MB/s eta 0:00:00
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.2-cp39-cp39-win_amd64.whl (99 kB)
                                              0.0/99.8 kB ? eta -:--:--
     ---------------------------------------- 99.8/99.8 kB 6.0 MB/s eta 0:00:00
Collecting distributed==2023.7.0 (from dask[complete])
  Downloading distributed-2023.7.0-py3-none-any.whl (981 kB)
                                              0.0/981.6 kB ? eta -:--:--
     ------------------                    491.5/981.6 kB 15.5 MB/s eta 0:00:01
     ------------------------------------- 981.6/981.6 kB 12.5 MB/s eta 0:00:00
Installing c

In [5]:
import optuna
from optuna.samplers import TPESampler

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.pipeline import Pipeline

# Models
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, RepeatedMultilabelStratifiedKFold
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier

In [12]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np

# Read the three input tables and discard their identifier columns straight away.
train = pd.read_csv("data/train.csv").drop(columns=["id"])
test = pd.read_csv("data/test.csv").drop(columns=["id"])
mixed_desc = pd.read_csv("data/mixed_desc.csv").drop(columns=["CIDs"])

# In mixed_desc the labels live in one underscore-joined column; split that
# column on "_" into one integer column per label name, then drop the packed
# original.
col = "EC1_EC2_EC3_EC4_EC5_EC6"
ec_labels = col.split("_")
ec_values = mixed_desc[col].str.split("_", expand=True).astype(int)
mixed_desc[ec_labels] = ec_values
mixed_desc = mixed_desc.drop(columns=[col])

# Restrict the descriptor table to exactly the columns of `train` (same order)
# so that the two frames can be stacked row-wise.
original = mixed_desc[train.columns]

# Stack both sources, renumber the rows, and keep only the first two labels
# (every label name from the third onwards is dropped).
train = (
    pd.concat([train, original])
    .reset_index(drop=True)
    .drop(columns=ec_labels[2:])
)


In [13]:
def generate_features(train, test, cat_cols, num_cols):
    """Add value-count and group-mean features computed on train and test jointly.

    The two frames are stacked row-wise so that every statistic is computed on
    the combined rows, then split back at the original train/test boundary.

    Two families of columns are added:

    * ``count_<c>`` for every ``c`` in ``cat_cols + num_cols``: the number of
      combined rows sharing that row's value of ``c``.
    * ``mean_<n>_per_<c>`` for every ``c`` in ``cat_cols`` and ``n`` in
      ``num_cols``: the mean of ``n`` over the combined rows sharing that
      row's value of ``c``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column named in ``cat_cols`` and ``num_cols``.
        Neither input is modified.
    cat_cols, num_cols : list of str
        Grouping-key columns and columns to average, respectively.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_out, test_out)``: the first ``len(train)`` rows and the
        remaining rows of the combined, feature-extended frame. Each keeps
        the row index of its input and is an independent copy.
    """
    n_train = len(train)
    df = pd.concat([train, test], axis=0)

    # Collect all new columns first and attach them in a single concat below;
    # inserting a few hundred columns one at a time fragments the frame.
    # Values are stored as plain arrays (row order is preserved by transform)
    # so that the duplicated row labels of the stacked frame never need to be
    # aligned.
    new_cols = {}

    for c in cat_cols + num_cols:
        new_cols[f'count_{c}'] = df.groupby(c)[c].transform('count').to_numpy()

    for c in cat_cols:
        # Build the grouping once per key rather than once per (key, column) pair.
        grouped = df.groupby(c)
        for n in num_cols:
            new_cols[f'mean_{n}_per_{c}'] = grouped[n].transform('mean').to_numpy()

    feats = pd.DataFrame(new_cols, index=df.index)

    # A feature whose name already exists in the input (e.g. when the function
    # is applied to its own output) replaces that column in place, exactly as
    # plain column assignment would; only genuinely new names are appended.
    clashes = [name for name in feats.columns if name in df.columns]
    for name in clashes:
        df[name] = feats[name].to_numpy()

    df = pd.concat([df, feats.drop(columns=clashes)], axis=1)

    return df.iloc[:n_train, :].copy(), df.iloc[n_train:, :].copy()

In [14]:
# Label columns, and columns that must never be fed to the models.
target_cols = ['EC1', 'EC2']
cols_to_drop = ['id']

# Every other column of `train` is a model input.
excluded_cols = target_cols + cols_to_drop
features = [name for name in train.columns if name not in excluded_cols]

# Inputs passed to generate_features as `cat_cols`; all remaining features
# are passed as `num_cols`.
cat_cols = [
    'EState_VSA2',
    'HallKierAlpha',
    'NumHeteroatoms',
    'PEOE_VSA10',
    'PEOE_VSA14',
    'PEOE_VSA6',
    'PEOE_VSA7',
    'PEOE_VSA8',
    'SMR_VSA10',
    'SMR_VSA5',
    'SlogP_VSA3',
    'fr_COO',
    'fr_COO2',
]

num_cols = [name for name in features if name not in cat_cols]

In [15]:
# Model inputs and labels for the labelled rows; inputs only for the test rows.
X_train = train.loc[:, features]
Y_train = train.loc[:, target_cols]
X_test = test.loc[:, features]

In [16]:
# Replace the raw input frames with their feature-extended versions.
X_train, X_test = generate_features(
    train=X_train,
    test=X_test,
    cat_cols=cat_cols,
    num_cols=num_cols,
)

In [17]:
import numpy as np
from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Aliases for the full training labels / inputs. The fold loop below uses its
# own names (X_trn, y_trn, ...) and never rebinds X_train, so this cell and the
# feature-generation cell stay safe to re-run.
y = Y_train
X = X_train


def _positive_class_proba(model, features):
    """Return positive-class probabilities with one row per sample.

    ``model.predict_proba`` is expected to return one per-target probability
    array (as scikit-learn's multi-output classifiers do); these are stacked,
    the column for class index 1 is taken from each, and the result is
    transposed so that samples are rows and targets are columns.
    """
    return np.array(model.predict_proba(features))[:, :, 1].T


# XGBoost classifier parameters
xgb_params = {'n_estimators': 300,
              'tree_method': 'hist',
              'max_depth': 5,
              'reg_alpha': 0.06790740746476749,
              'reg_lambda': 0.03393770327994609,
              'min_child_weight': 1,
              'gamma': 2.5705812096617772e-05,
              'learning_rate': 0.07132617944894756,
              'colsample_bytree': 0.11664298814833247,
              'colsample_bynode': 0.9912092923877247,
              'colsample_bylevel': 0.29178614622079735,
              'subsample': 0.7395301853144935,
              'random_state': 42
              }

# LightGBM classifier parameters
lgbm_params = {'n_estimators': 300,
               'boosting_type': 'gbdt',
               'max_depth': 5,
               'reg_alpha': 6.720380454685094,
               'reg_lambda': 7.074828689930955e-05,
               'min_child_samples': 15,
               'subsample': 0.5182995486972547,
               'learning_rate': 0.027352422199502537,
               'colsample_bytree': 0.2257179878033366,
               'colsample_bynode': 0.7098194984886731,
               'random_state': 84315}

# One independent copy of each base model is fitted per target column.
xgb_classifier = MultiOutputClassifier(XGBClassifier(**xgb_params))
lgbm_classifier = MultiOutputClassifier(LGBMClassifier(**lgbm_params))

# Single-step pipelines around the two multi-output models.
xgb_clf = Pipeline([('classifier', xgb_classifier)])
lgbm_clf = Pipeline([('classifier', lgbm_classifier)])

# Cross-validation scheme.
n_splits = 5
n_repeats = 1
# Number of fitted models averaged into each test prediction. With a repeated
# splitter this is n_splits * n_repeats, not n_splits.
n_fits = n_splits * n_repeats
kf = RepeatedMultilabelStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

# Accumulators.
oof_preds_xgb = np.zeros(y.shape)
oof_preds_lgbm = np.zeros(y.shape)
test_preds_xgb = np.zeros((test.shape[0], y.shape[1]))
test_preds_lgbm = np.zeros((test.shape[0], y.shape[1]))
oof_losses_xgb = []
oof_losses_lgbm = []
# Per-model training scores are not computed; the lists are still defined so
# that any code expecting these names keeps working.
train_losses_xgb = []
train_losses_lgbm = []
over_train = []
over_valid = []

# NOTE: every "loss" below is a ROC AUC (higher is better), computed on the
# flattened label / prediction arrays, i.e. pooled over all target columns.
# NOTE(review): if a per-target mean AUC is the intended metric, pass the 2-D
# arrays to roc_auc_score instead of ravel()-ing them -- confirm.
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold:', fn)
    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # XGBoost: fit on the training part, score the validation part, and add
    # this model's share to the averaged test prediction.
    xgb_clf.fit(X_trn, y_trn)
    train_preds_xgb = _positive_class_proba(xgb_clf, X_trn)
    val_preds_xgb = _positive_class_proba(xgb_clf, X_val)
    # Each row is validated once per repeat, so average over repeats.
    oof_preds_xgb[val_idx] += val_preds_xgb / n_repeats
    loss_xgb = roc_auc_score(np.ravel(y_val), np.ravel(val_preds_xgb))
    oof_losses_xgb.append(loss_xgb)
    test_preds_xgb += _positive_class_proba(xgb_clf, X_test) / n_fits

    # LightGBM: same procedure.
    lgbm_clf.fit(X_trn, y_trn)
    train_preds_lgbm = _positive_class_proba(lgbm_clf, X_trn)
    val_preds_lgbm = _positive_class_proba(lgbm_clf, X_val)
    oof_preds_lgbm[val_idx] += val_preds_lgbm / n_repeats
    loss_lgbm = roc_auc_score(np.ravel(y_val), np.ravel(val_preds_lgbm))
    oof_losses_lgbm.append(loss_lgbm)
    test_preds_lgbm += _positive_class_proba(lgbm_clf, X_test) / n_fits

    # Score the equal-weight blend of both models on this fold.
    overall_train_preds = (train_preds_xgb + train_preds_lgbm) / 2
    overall_train_loss = roc_auc_score(np.ravel(y_trn), np.ravel(overall_train_preds))
    overall_valid_preds = (val_preds_xgb + val_preds_lgbm) / 2
    overall_valid_loss = roc_auc_score(np.ravel(y_val), np.ravel(overall_valid_preds))
    over_train.append(overall_train_loss)
    over_valid.append(overall_valid_loss)
    print("overall_train_loss", overall_train_loss)
    print("overall_valid_loss", overall_valid_loss)

print("over_train", np.mean(over_train))
print("over_valid", np.mean(over_valid))

Starting fold: 0
overall_train_loss 0.8784055236294086
overall_valid_loss 0.6889285570415727
Starting fold: 1
overall_train_loss 0.8781267148690797
overall_valid_loss 0.6915416227328217
Starting fold: 2
overall_train_loss 0.8762593265369617
overall_valid_loss 0.6735388939302278
Starting fold: 3
overall_train_loss 0.8782379780804732
overall_valid_loss 0.6836092836257309
Starting fold: 4
overall_train_loss 0.8754022657529807
overall_valid_loss 0.6871969744822807
over_train 0.8772863617737808
over_valid 0.6849630663625268


In [18]:
# Equal-weight blend of the two models' averaged test predictions, written
# into every column of the sample submission after the first one.
sample_submission = pd.read_csv("data/sample_submission.csv")
blended_test_preds = 0.5 * test_preds_xgb + 0.5 * test_preds_lgbm
sample_submission.iloc[:, 1:] = blended_test_preds

In [19]:
# Write the filled-in submission table to disk without the row index.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

