# Random UnderSampling 
With and Without replacement

In [1]:
import yaml
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_validate)
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import xgboost as xgb
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from src.utils import print_training_results

In [2]:
# Plot configs: seaborn "paper" context with enlarged fonts, no top/right
# spines, and a wide default figure.
sns.set_context("paper", font_scale=1.8)
plotsize = (22, 5)
plt.rcParams.update(
    {
        "axes.spines.right": False,
        "axes.spines.top": False,
        "figure.figsize": plotsize,
    }
)
# Pandas config: 4-decimal floats with thousands separators, and up to 200
# columns shown when a frame is displayed.
pd.set_option("display.float_format", "{:,.4f}".format)
pd.set_option("display.max_columns", 200)

## 1) Input Data
The input data can be found and downloaded here: [zenodo link](https://zenodo.org/records/10251230)

In [3]:
# Read the full table (feature columns plus the `hit` target) from disk,
# report its dimensions, and preview the first rows.
data_path = "../data/data_143_features.csv"
df = pd.read_csv(data_path)
print(df.shape)
df.head(5)

(50847, 144)


Unnamed: 0,log2_fpkm,near_fantom_enhancer,near_cancer_associated_snp,number_of_exons,within_pol2_loop,near_hnisz_super_enhancer,within_ctcf_loop,near_hnisz_enhancer,has_mouse_ortholog,locus_is_heterozygous_deleted,is_intergenic,transcript_length,locus_is_amplified,is_antisense,locus_locus_distance,tss_pc_distance,near_vista_enhancer,locus_is_homozygous_deleted,ARID3A,ATF1,ATF2,ATF3,BACH1,BCLAF1,BHLHE40,BRCA1,CBX3,CBX8,CEBPB,CEBPZ,CHD1,CHD2,CHD7,CREB1,CTBP2,CTCF,CTCFL,CUX1,E2F1,E2F4,E2F6,EGR1,ELF1,ELK1,EP300,ESRRA,ETS1,EZH2,FOS,FOSL1,FOSL2,FOXA1,FOXM1,GABPA,GATA1,GATA2,GATA3,GTF2F1,HCFC1,HDAC1,HDAC2,HDAC6,HSF1,IKZF1,IRF1,JUN,JUND,KDM1A,KDM5A,KDM5B,MAFF,MAFK,MAX,MAZ,MEF2A,MTA3,MXI1,MYBL2,MYC,NANOG,NCOR1,NFE2,NFIC,NFYA,NFYB,NR2C2,NR2F2,NR3C1,NRF1,PHF8,PML,POLR2A,POU5F1,RAD21,RBBP5,RCOR1,RELA,REST,RFX5,RNF2,RXRA,SAP30,SETDB1,SIN3A,SIX5,SMARCA4,SMARCB1,SMARCC2,SMC3,SP1,SPI1,SREBF1,SREBF2,SRF,STAT5A,SUPT20H,SUZ12,TAF1,TAF7,TAL1,TBL1XR1,TBP,TCF12,TCF7L2,TEAD4,THAP1,TRIM28,UBTF,USF1,USF2,YY1,ZBTB33,ZBTB7A,ZC3H11A,ZKSCAN1,ZMIZ1,ZNF143,ZNF217,ZNF263,ZNF274,ZNF384,ZZZ3,number_tfs,hit
0,2.9612,0,0,2.5,0,0,0,0,0,0,1,3019.5,0,0,43833,48473,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,-0.1795,0,0,2.0,0,0,0,0,0,0,1,609.0,0,0,42685,44966,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,-1.3535,0,1,2.0,0,0,0,0,1,0,1,582.0,0,0,0,22062,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,-1.0786,0,0,4.0,0,0,0,0,0,0,1,639.0,0,0,0,7516,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,-3.3219,0,0,2.0,0,0,0,0,0,0,1,295.0,0,0,92687,93681,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


## 2) Feature/Target Split

In [4]:
# Feature matrix: every column of `df` except the `hit` target, held as an
# independent copy so later changes to X never touch `df`.
X = df.drop("hit", axis="columns").copy()
print(X.shape)
X.head(5)

(50847, 143)


Unnamed: 0,log2_fpkm,near_fantom_enhancer,near_cancer_associated_snp,number_of_exons,within_pol2_loop,near_hnisz_super_enhancer,within_ctcf_loop,near_hnisz_enhancer,has_mouse_ortholog,locus_is_heterozygous_deleted,is_intergenic,transcript_length,locus_is_amplified,is_antisense,locus_locus_distance,tss_pc_distance,near_vista_enhancer,locus_is_homozygous_deleted,ARID3A,ATF1,ATF2,ATF3,BACH1,BCLAF1,BHLHE40,BRCA1,CBX3,CBX8,CEBPB,CEBPZ,CHD1,CHD2,CHD7,CREB1,CTBP2,CTCF,CTCFL,CUX1,E2F1,E2F4,E2F6,EGR1,ELF1,ELK1,EP300,ESRRA,ETS1,EZH2,FOS,FOSL1,FOSL2,FOXA1,FOXM1,GABPA,GATA1,GATA2,GATA3,GTF2F1,HCFC1,HDAC1,HDAC2,HDAC6,HSF1,IKZF1,IRF1,JUN,JUND,KDM1A,KDM5A,KDM5B,MAFF,MAFK,MAX,MAZ,MEF2A,MTA3,MXI1,MYBL2,MYC,NANOG,NCOR1,NFE2,NFIC,NFYA,NFYB,NR2C2,NR2F2,NR3C1,NRF1,PHF8,PML,POLR2A,POU5F1,RAD21,RBBP5,RCOR1,RELA,REST,RFX5,RNF2,RXRA,SAP30,SETDB1,SIN3A,SIX5,SMARCA4,SMARCB1,SMARCC2,SMC3,SP1,SPI1,SREBF1,SREBF2,SRF,STAT5A,SUPT20H,SUZ12,TAF1,TAF7,TAL1,TBL1XR1,TBP,TCF12,TCF7L2,TEAD4,THAP1,TRIM28,UBTF,USF1,USF2,YY1,ZBTB33,ZBTB7A,ZC3H11A,ZKSCAN1,ZMIZ1,ZNF143,ZNF217,ZNF263,ZNF274,ZNF384,ZZZ3,number_tfs
0,2.9612,0,0,2.5,0,0,0,0,0,0,1,3019.5,0,0,43833,48473,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,-0.1795,0,0,2.0,0,0,0,0,0,0,1,609.0,0,0,42685,44966,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,-1.3535,0,1,2.0,0,0,0,0,1,0,1,582.0,0,0,0,22062,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,-1.0786,0,0,4.0,0,0,0,0,0,0,1,639.0,0,0,0,7516,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,-3.3219,0,0,2.0,0,0,0,0,0,0,1,295.0,0,0,92687,93681,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
# Count missing values per feature: `isna().sum()` yields one NaN count per
# column, reshaped here into a two-column frame (`features`, `is_na`).
nan_features = X.isna().sum().reset_index().rename(columns={"index": "features", 0: "is_na"})
# Display every feature that has at least one missing value. `is_na` holds a
# count, not a flag, so comparing with `== 1` would hide any feature with two
# or more NaNs; `> 0` catches them all.
nan_features[nan_features["is_na"] > 0]

Unnamed: 0,features,is_na


In [6]:
# Target: the `hit` column kept as a single-column DataFrame of int64 labels.
y = df[["hit"]].astype("int64").copy()
# Tally how many rows fall into each class to expose the imbalance.
num_classses = Counter(y.to_numpy().ravel())
print(num_classses)
y.head(5)

Counter({0: 49936, 1: 911})


Unnamed: 0,hit
0,0
1,0
2,0
3,0
4,0


## 3) XGBoost model

In [7]:
# Model configs: parse the experiment settings from the YAML file.
with open("config_files/config_under_sampling.yaml", mode="r") as config_stream:
    config = yaml.safe_load(config_stream)

# ML configs: the random seed shared by the CV splitter and the sampler, and
# the keyword arguments forwarded to the XGBoost classifier.
SEED, xgboost_configs = config["SEED"]["seed"], config["xgboost"]

In [8]:
# CV: stratified 10-fold repeated 3 times (30 train/test splits per run),
# seeded so the splits are reproducible.
stratified_cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=SEED,
)

# scoring: display name -> scikit-learn scorer string.
scoring = dict(
    sensitivity="recall",
    auroc="roc_auc",
    f1_score="f1",
    precision="precision",
)

# XGBoost model built from the hyper-parameters in the config file.
xgboost_model = xgb.XGBClassifier(**xgboost_configs)

## 4) Random Undersampling

### 4.1) Random Undersampling Without Replacement

In [9]:
%%time

under_sampling_strategies = [0.03, 0.04, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50]

for i in under_sampling_strategies:
    print(f"Under Sampling strategy: {i: .0%}")
    random_pipeline = make_pipeline(RandomUnderSampler(sampling_strategy= i, random_state= SEED, replacement= False), xgboost_model)
    cv_result = cross_validate(random_pipeline, 
                                X, 
                                y.values.ravel(), 
                                cv= stratified_cv, 
                                scoring= list(scoring.values()), 
                                return_train_score= False, 
                                verbose=0)
    print_training_results(scoring_dict=scoring, input_model=xgboost_model, cv_results=cv_result)

Under Sampling strategy:  3%


----------
XGBClassifier mean-sensitivity: 0.1627 (+/-  0.04)
XGBClassifier mean-auroc: 0.8250 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2360 (+/-  0.05)
XGBClassifier mean-precision: 0.4363 (+/-  0.09)
----------
Under Sampling strategy:  4%
----------
XGBClassifier mean-sensitivity: 0.2001 (+/-  0.05)
XGBClassifier mean-auroc: 0.8270 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2621 (+/-  0.06)
XGBClassifier mean-precision: 0.3868 (+/-  0.08)
----------
Under Sampling strategy:  5%
----------
XGBClassifier mean-sensitivity: 0.2341 (+/-  0.05)
XGBClassifier mean-auroc: 0.8281 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2818 (+/-  0.06)
XGBClassifier mean-precision: 0.3583 (+/-  0.06)
----------
Under Sampling strategy:  10%
----------
XGBClassifier mean-sensitivity: 0.3556 (+/-  0.06)
XGBClassifier mean-auroc: 0.8270 (+/-  0.02)
XGBClassifier mean-f1_score: 0.3073 (+/-  0.04)
XGBClassifier mean-precision: 0.2716 (+/-  0.04)
----------
Under Sampling strategy:  20%
----------
XGBClassifi

### 4.2) Random Undersampling With Replacement

In [10]:
%%time

under_sampling_strategies = [0.03, 0.04, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50]

for i in under_sampling_strategies:
    print(f"Under Sampling strategy: {i: .0%}")
    random_pipeline = make_pipeline(RandomUnderSampler(sampling_strategy= i, random_state= SEED, replacement= True), xgboost_model)
    cv_result = cross_validate(random_pipeline, 
                                X, 
                                y.values.ravel(), 
                                cv= stratified_cv, 
                                scoring= list(scoring.values()), 
                                return_train_score= False, 
                                verbose=0)
    print_training_results(scoring_dict=scoring, input_model=xgboost_model, cv_results=cv_result)

Under Sampling strategy:  3%
----------
XGBClassifier mean-sensitivity: 0.1895 (+/-  0.05)
XGBClassifier mean-auroc: 0.8238 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2531 (+/-  0.06)
XGBClassifier mean-precision: 0.3858 (+/-  0.08)
----------
Under Sampling strategy:  4%
----------
XGBClassifier mean-sensitivity: 0.2301 (+/-  0.06)
XGBClassifier mean-auroc: 0.8257 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2815 (+/-  0.06)
XGBClassifier mean-precision: 0.3665 (+/-  0.07)
----------
Under Sampling strategy:  5%
----------
XGBClassifier mean-sensitivity: 0.2594 (+/-  0.05)
XGBClassifier mean-auroc: 0.8258 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2918 (+/-  0.05)
XGBClassifier mean-precision: 0.3356 (+/-  0.06)
----------
Under Sampling strategy:  10%
----------
XGBClassifier mean-sensitivity: 0.3604 (+/-  0.04)
XGBClassifier mean-auroc: 0.8307 (+/-  0.02)
XGBClassifier mean-f1_score: 0.2980 (+/-  0.04)
XGBClassifier mean-precision: 0.2546 (+/-  0.03)
----------
Under Sampling strategy