In [None]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [None]:
class config:
    """Notebook-wide settings shared by the experiment cells below."""

    # Number of stratified cross-validation folds.
    N_FOLD = 5
    # Random seed shared by the notebook's randomised steps (e.g. fold shuffling).
    SEED = 0
    # Kaggle input directory that holds the train/test CSV files.
    ROOT_DIR = "/kaggle/input/predicting-pulsar-starintermediate"

## Load and Arrange Dataset

In [None]:
# Resolve both CSV locations under the configured input directory, then load them.
train_csv = f"{config.ROOT_DIR}/pulsar_data_train.csv"
test_csv = f"{config.ROOT_DIR}/pulsar_data_test.csv"

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Impute missing feature values with per-column means learned from the
# TRAINING set only, and apply those same statistics to both frames.
# Filling the test set with its own means would preprocess train and test
# differently (and would let test-set statistics shape the test features).
#
# The label column is deliberately excluded: a missing `target_class` must not
# be replaced by a fractional "mean label". Any such row is left as NaN so the
# problem surfaces downstream instead of silently creating a bogus class.
feature_means = train_df.drop(columns="target_class").mean()

# fillna with a Series fills each column from the entry with the same label;
# columns absent from `feature_means` (the label) are left untouched.
train_df = train_df.fillna(feature_means)
test_df = test_df.fillna(feature_means)

In [None]:
# Training labels as a NumPy array.
y_train = train_df["target_class"].values

# Feature frames: every column except the label.
x_train_df = train_df.drop(columns="target_class")
x_test_df = test_df.drop(columns="target_class")

In [None]:
# Fit the standardiser on the training features only, then apply the same
# fitted transform to both the training and the test feature matrices.
scaler = StandardScaler().fit(x_train_df.values)

x_train, x_test = (
    scaler.transform(frame.values) for frame in (x_train_df, x_test_df)
)

## Original (no resampling)

In [None]:
# Baseline: stratified K-fold cross-validation of a logistic regression
# trained on the folds as they are (no resampling of the classes).
skf = StratifiedKFold(n_splits=config.N_FOLD, random_state=config.SEED, shuffle=True)

# Out-of-fold predictions: every training row is predicted exactly once, by
# the model fitted on the folds that did not contain it.
oof = np.zeros(len(y_train))
for fit_i, val_i in skf.split(x_train, y_train):

    x_fit = x_train[fit_i, :]
    y_fit = y_train[fit_i]
    x_val = x_train[val_i, :]

    model = LogisticRegression()
    model.fit(x_fit, y_fit)

    y_pred = model.predict(x_val)

    oof[val_i] = y_pred

# The report is keyed by str(label); the "1.0" key is the positive class
# as it appears in the float-valued `oof` array.
res_d = classification_report(y_train, oof, output_dict=True)
acc, pre, rec, f1 = res_d["accuracy"], res_d["1.0"]["precision"], res_d["1.0"]["recall"], res_d["1.0"]["f1-score"]
print(f"accuracy={acc:.4}, precision={pre:.4}, recall={rec:.4}, f1-score={f1:.4}")

# confusion_matrix puts TRUE labels on the rows and PREDICTED labels on the
# columns, both in sorted label order, so the negative class (0) comes first:
#   [[TN, FP],
#    [FN, TP]]
cm = confusion_matrix(y_train, oof)
display(pd.DataFrame(cm, columns=["N(Predict)", "P(predict)"], index=["N(True)", "P(True)"]))

## UnderSampling

In [None]:
# Random under-sampling: inside each fold, the negative class is down-sampled
# to the size of the positive class before the model is fitted. Only the
# fitting split is resampled; the validation split is left untouched.
skf = StratifiedKFold(n_splits=config.N_FOLD, random_state=config.SEED, shuffle=True)

# Out-of-fold predictions: every training row is predicted exactly once, by
# the model fitted on the folds that did not contain it.
oof = np.zeros(len(y_train))
for fit_i, val_i in skf.split(x_train, y_train):

    x_fit = x_train[fit_i, :]
    y_fit = y_train[fit_i]
    x_val = x_train[val_i, :]

    # UnderSampling
    # Labels are 0/1, so their sum is the number of positive samples in the fold.
    positive_count_train = int(y_fit.sum())
    rus = RandomUnderSampler(sampling_strategy={0:positive_count_train, 1:positive_count_train}, random_state=config.SEED)
    x_fit_resampled, y_fit_resampled = rus.fit_resample(x_fit, y_fit)

    model = LogisticRegression()
    model.fit(x_fit_resampled, y_fit_resampled)

    y_pred = model.predict(x_val)

    oof[val_i] = y_pred

# The report is keyed by str(label); the "1.0" key is the positive class
# as it appears in the float-valued `oof` array.
res_d = classification_report(y_train, oof, output_dict=True)
acc, pre, rec, f1 = res_d["accuracy"], res_d["1.0"]["precision"], res_d["1.0"]["recall"], res_d["1.0"]["f1-score"]
print(f"accuracy={acc:.4}, precision={pre:.4}, recall={rec:.4}, f1-score={f1:.4}")

# confusion_matrix puts TRUE labels on the rows and PREDICTED labels on the
# columns, both in sorted label order, so the negative class (0) comes first:
#   [[TN, FP],
#    [FN, TP]]
cm = confusion_matrix(y_train, oof)
display(pd.DataFrame(cm, columns=["N(Predict)", "P(predict)"], index=["N(True)", "P(True)"]))

## OverSampling (SMOTE)

In [None]:
# SMOTE over-sampling: inside each fold, the fitting split is resampled with
# SMOTE before the model is fitted. The validation split is left untouched so
# the scores are measured on real samples only.
skf = StratifiedKFold(n_splits=config.N_FOLD, random_state=config.SEED, shuffle=True)

# Out-of-fold predictions: every training row is predicted exactly once, by
# the model fitted on the folds that did not contain it.
oof = np.zeros(len(y_train))
for fit_i, val_i in skf.split(x_train, y_train):

    x_fit = x_train[fit_i, :]
    y_fit = y_train[fit_i]
    x_val = x_train[val_i, :]

    # OverSampling
    # Seed SMOTE explicitly: without random_state the synthetic samples (and
    # therefore the reported metrics) change on every run.
    sm = SMOTE(random_state=config.SEED)
    x_fit_resampled, y_fit_resampled = sm.fit_resample(x_fit, y_fit)

    model = LogisticRegression()
    model.fit(x_fit_resampled, y_fit_resampled)

    y_pred = model.predict(x_val)

    oof[val_i] = y_pred

# The report is keyed by str(label); the "1.0" key is the positive class
# as it appears in the float-valued `oof` array.
res_d = classification_report(y_train, oof, output_dict=True)
acc, pre, rec, f1 = res_d["accuracy"], res_d["1.0"]["precision"], res_d["1.0"]["recall"], res_d["1.0"]["f1-score"]
print(f"accuracy={acc:.4}, precision={pre:.4}, recall={rec:.4}, f1-score={f1:.4}")

# confusion_matrix puts TRUE labels on the rows and PREDICTED labels on the
# columns, both in sorted label order, so the negative class (0) comes first:
#   [[TN, FP],
#    [FN, TP]]
cm = confusion_matrix(y_train, oof)
display(pd.DataFrame(cm, columns=["N(Predict)", "P(predict)"], index=["N(True)", "P(True)"]))

## UnderSampling Ensemble

In [None]:
# Under-sampling ensemble: inside each fold, several logistic regressions are
# fitted on differently-seeded random under-samples of the fitting split, and
# their hard (0/1) predictions are combined by majority vote.
skf = StratifiedKFold(n_splits=config.N_FOLD, random_state=config.SEED, shuffle=True)

# Number of under-sampled models combined per fold.
n_models = 10

# Out-of-fold predictions: every training row is predicted exactly once, by
# the ensemble fitted on the folds that did not contain it.
oof = np.zeros(len(y_train))
for fit_i, val_i in skf.split(x_train, y_train):

    x_fit = x_train[fit_i, :]
    y_fit = y_train[fit_i]
    x_val = x_train[val_i, :]

    # Labels are 0/1, so their sum is the number of positive samples in the
    # fold. It does not depend on the ensemble member, so compute it once.
    positive_count_train = int(y_fit.sum())

    # UnderSampling
    y_preds = []
    for i in range(n_models):
        # A different seed per member yields a different subset of negatives.
        rus = RandomUnderSampler(sampling_strategy={0:positive_count_train, 1:positive_count_train}, random_state=config.SEED+i)
        x_fit_resampled, y_fit_resampled = rus.fit_resample(x_fit, y_fit)

        model = LogisticRegression()
        model.fit(x_fit_resampled, y_fit_resampled)

        _y_pred = model.predict(x_val)
        y_preds.append(_y_pred)

    # Majority vote over the members' 0/1 predictions. The comparison is
    # strict, so an exact tie (mean == 0.5) is resolved as negative (0).
    y_pred = (np.array(y_preds).mean(0) > 0.5).astype(int)
    oof[val_i] = y_pred

# The report is keyed by str(label); the "1.0" key is the positive class
# as it appears in the float-valued `oof` array.
res_d = classification_report(y_train, oof, output_dict=True)
acc, pre, rec, f1 = res_d["accuracy"], res_d["1.0"]["precision"], res_d["1.0"]["recall"], res_d["1.0"]["f1-score"]
print(f"accuracy={acc:.4}, precision={pre:.4}, recall={rec:.4}, f1-score={f1:.4}")

# confusion_matrix puts TRUE labels on the rows and PREDICTED labels on the
# columns, both in sorted label order, so the negative class (0) comes first:
#   [[TN, FP],
#    [FN, TP]]
cm = confusion_matrix(y_train, oof)
display(pd.DataFrame(cm, columns=["N(Predict)", "P(predict)"], index=["N(True)", "P(True)"]))