# WORK IN PROGRESS

Check back for updates!

## Import Packages

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import optuna

import gc; gc.enable()

## Downcasting

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int8/16/32/64 or float16/32/64 are inspected;
    every other column (objects, unsigned integers, ...) is left untouched.
    Integer columns are cast to the smallest signed integer type whose limits
    contain the column's min and max (limits inclusive). Float columns are cast
    to float16 or float32 when their range fits, otherwise to float64.

    Only the value *range* is checked, so narrowing a float column can lose
    precision.

    Args:
        df: pandas DataFrame. Its columns are reassigned in place.
        verbose: when true, print the final memory usage and the percentage saved.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate targets, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # An empty column yields NaN bounds; every comparison is then False
            # and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-sized frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

## Data Prep

In [None]:
# Load the feature table from the prepared parquet file.
# NOTE(review): presumably train and test rows are stacked in this one table
# (it is split with head/tail further down) — confirm against the data-prep step.
PATH = '../input/dataprep-nov21/gauss.parquet'
X = pd.read_parquet(PATH)

# Preview the first rows.
X.head()

In [None]:
# Load the target table; PATH is rebound to the new file.
PATH = '../input/dataprep-nov21/target.parquet'
pred = pd.read_parquet(PATH)

# Preview the first rows.
pred.head()

In [None]:
# Row counts: M for the feature table, N for the target table.
M = len(X)
N = len(pred)
# Display both counts and their difference.
(M, N, M - N)

In [None]:
# Take the labels from the target table and shrink the feature dtypes.
y = pred["target"]
X = reduce_memory_usage(X)

# The first N rows become the training frame, the last M - N rows the test frame.
train, test = X.head(N), X.tail(M - N)

# Drop the combined frame and reclaim memory.
del X
gc.collect()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

## Sample for Small-Scale Experiments

In [None]:
# Load the feature-importance table and preview it.
# NOTE(review): judging by the path this is the output of a ReliefE run — confirm.
PATH = '../input/reliefe-nov21/fi.parquet'
fi = pd.read_parquet(PATH)
fi.head()

In [None]:
# Keep the 20 rows with the smallest 'med_rank' (ascending sort) and read the
# column names to retain from their 'index' field.
keep_cols = fi.sort_values(by='med_rank').head(20)['index'].tolist()

In [None]:
# Restrict the training frame to the selected columns, then collect garbage.
train = train.loc[:, keep_cols]
gc.collect()

In [None]:
# Stratified subsample: the 20% "test" side of this split is kept as the working
# sample; the 80% side is bound to `_` and discarded.
_, X_train, _, y_train = train_test_split(train.values, y.values, 
                                          test_size=0.2, random_state=42, stratify=y.values)
del _; gc.collect()

# Split the working sample 50/50 (stratified) into a training and a test half.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, 
                                          test_size=0.5, random_state=42, stratify=y_train)
gc.collect()

In [None]:
# Wrap both sample arrays back into DataFrames that carry the column names
# of the training frame.
X_train, X_test = (
    pd.DataFrame(part, columns=list(train.columns))
    for part in (X_train, X_test)
)

In [None]:
# Preview the first rows of the training half of the sample.
X_train.head()

In [None]:
# Show (rows, columns) of the training half of the sample.
X_train.shape

## Label Denoising Experiments

In [None]:
# Install cleanlab for the label-noise experiments.
# NOTE(review): this notebook imports `LearningWithNoisyLabels`; newer cleanlab
# releases appear to expose that estimator as `CleanLearning` — confirm whether
# a version pin is needed here.
!pip install cleanlab

In [None]:
from cleanlab.classification import LearningWithNoisyLabels

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Baseline: robust scaling followed by a class-balanced logistic regression,
# fitted on the training half of the sample.
scaler = RobustScaler()
clf = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=-1)
pipe = make_pipeline(scaler, clf)

pipe.fit(X_train, y_train)

# predict on train/test (probability of class 1)
# NOTE(review): `pred` rebinds the name that held the target table loaded earlier,
# so the cell that reads `pred.target` cannot be re-run after this one.
y_hat = pipe.predict_proba(X_train)[:,1]
pred = pipe.predict_proba(X_test)[:,1]

# metrics (balanced accuracy uses a 0.5 probability threshold)
print("Orig. Train AUC:", roc_auc_score(y_train, y_hat))
print("Orig. Train Acc:", balanced_accuracy_score(y_train, y_hat > 0.5))
print()
print("Test AUC:", roc_auc_score(y_test, pred))
print("Test Acc:", balanced_accuracy_score(y_test, pred > 0.5))

In [None]:
# Start from the observed training labels.
y_new = y_train

# Repeated experiments with label updates, to see whether noisy labels can be
# filtered out and how that affects performance on the test half.
for k in range(5):
    print(f'Trial {k}:')
    scaler = RobustScaler()
    base = LogisticRegression(class_weight='balanced', random_state=42, n_jobs=-1)
    clf = LearningWithNoisyLabels(clf=base, seed=42, pulearning=True, prune_method='both', n_jobs=1)
    pipe = make_pipeline(scaler, clf)
    pipe.fit(X_train.values, y_new)

    # Predict on train/test with plain arrays, matching the input the pipeline
    # was fitted on.
    y_hat = pipe.predict_proba(X_train.values)[:, 1]
    pred = pipe.predict_proba(X_test.values)[:, 1]

    # Metrics against the original labels, the current relabelling, and the test half.
    print()
    print("Orig. Train AUC:", roc_auc_score(y_train, y_hat))
    print("Orig. Train Acc:", balanced_accuracy_score(y_train, y_hat > 0.5))
    print()
    print("New Train AUC:", roc_auc_score(y_new, y_hat))
    print("New Train Acc:", balanced_accuracy_score(y_new, y_hat > 0.5))
    print()
    print("Test AUC:", roc_auc_score(y_test, pred))
    print("Test Acc:", balanced_accuracy_score(y_test, pred > 0.5))
    print()
    print(f'\tMasked: {clf.noise_mask.sum()} out of {X_train.shape[0]}')
    print()
    print('=='*30)
    print()

    # Update labels: average the predicted probability with the current label
    # and threshold at 0.9. Assuming 0/1 labels, a sample stays positive only if
    # it is currently labelled 1 and y_hat > 0.8; labels never flip from 0 to 1.
    # The builtin `int` is used because the `np.int` alias no longer exists in
    # current NumPy releases.
    y_new = (0.5 * y_hat + 0.5 * y_new > 0.9).astype(int)

## Manifold Embedding w/ UMAP

In [None]:
from umap import UMAP

In [None]:
# Unsupervised 2-D UMAP embedding, fitted on the training half.
mapper = UMAP(n_components=2, n_neighbors=50, random_state=2021).fit(X_train)
embedded = mapper.transform(X_train)

# Figure size shared by the scatter plots in this notebook.
SIZE = (13, 8)

# Training-half embedding coloured by the training labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Project the held-out half into the current embedding.
embedded = mapper.transform(X_test)

# Colour by the held-out labels: the plotted points are X_test rows, so their
# colours must come from y_test (one label per plotted point).
plt.figure(figsize=SIZE)
plt.scatter(embedded[:,0], embedded[:,1], c=y_test, alpha=0.5, cmap='plasma')
plt.show()

## Metric Learning w/ UMAP (given noisy labels)

In [None]:
# Supervised UMAP: the fit is guided by the original training labels.
mapper = UMAP(n_components=2, n_neighbors=50, random_state=2021).fit(X_train, y_train)
embedded = mapper.transform(X_train)

# Training-half embedding coloured by the training labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Project the held-out half into the current embedding.
embedded = mapper.transform(X_test)

# Colour by the held-out labels: the plotted points are X_test rows, so their
# colours must come from y_test (one label per plotted point).
plt.figure(figsize=SIZE)
plt.scatter(embedded[:,0], embedded[:,1], c=y_test, alpha=0.5, cmap='plasma')
plt.show()

## Metric Learning w/ UMAP (given cleaned labels)

In [None]:
# Relabel the training half with the class predictions of whatever pipeline
# `pipe` currently holds.
y_new = pipe.predict(X_train.values)

# Supervised UMAP guided by those predicted labels.
mapper = UMAP(n_components=2, n_neighbors=50, random_state=2021).fit(X_train, y_new)
embedded = mapper.transform(X_train)

# Points are coloured by the original training labels, not by y_new.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Project the held-out half into the current embedding.
embedded = mapper.transform(X_test)

# Colour by the held-out labels: the plotted points are X_test rows, so their
# colours must come from y_test (one label per plotted point).
plt.figure(figsize=SIZE)
plt.scatter(embedded[:,0], embedded[:,1], c=y_test, alpha=0.5, cmap='plasma')
plt.show()

## Q: Can we detect the noisy labels in the test-set?

In [None]:
# Re-embed the training half with the current mapper; the result feeds the
# clustering step that follows.
embedded = mapper.transform(X_train)

# Scatter of the two embedding axes, coloured by the training labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
from sklearn.mixture import BayesianGaussianMixture

In [None]:
# Fit a two-component Bayesian Gaussian mixture on the current 2-D embedding.
gmm = BayesianGaussianMixture(n_components=2, random_state=42)
gmm.fit(embedded)

In [None]:
# Assign each embedded point to a mixture component and display the assignments.
labels = gmm.predict(embedded)
labels

In [None]:
from sklearn.metrics import homogeneity_score

In [None]:
# Homogeneity of the mixture assignments, with y_new passed as the reference labels.
homogeneity_score(y_new, labels)

In [None]:
# Supervised UMAP guided by the mixture-component assignments.
mapper = UMAP(n_components=2, n_neighbors=50, random_state=2021).fit(X_train, labels)
embedded = mapper.transform(X_train)

# Training-half embedding coloured by the original training labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Project the held-out half into the current embedding.
embedded = mapper.transform(X_test)

# Colour by the held-out labels: the plotted points are X_test rows, so their
# colours must come from y_test (one label per plotted point).
plt.figure(figsize=SIZE)
plt.scatter(embedded[:,0], embedded[:,1], c=y_test, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Stack both halves into one frame with a fresh 0..n-1 index. `pd.concat` is
# used because `DataFrame.append` is no longer available in pandas 2.x.
X = pd.concat([X_train, X_test], ignore_index=True)
# Label vector for the stacked frame: mixture-component ids for the training
# half, followed by -1 for every held-out row.
# NOTE(review): presumably -1 is UMAP's marker for unlabelled points — confirm.
y_mask = np.append(labels, y_test*0-1)

In [None]:
# UMAP fitted on the stacked frame, guided by the partially masked label vector.
mapper = UMAP(n_components=2, n_neighbors=50, random_state=2021).fit(X, y_mask)
embedded = mapper.transform(X_train)

# Training-half embedding coloured by the original training labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_train, alpha=0.5, cmap='plasma')
plt.show()

# Same points, coloured by the mixture-component assignments.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=labels, alpha=0.5, cmap='plasma')
plt.show()

In [None]:
# Project the held-out half into the current embedding.
embedded = mapper.transform(X_test)

# Held-out embedding coloured by the held-out labels.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=y_test, alpha=0.5, cmap='plasma')
plt.show()

# Same points, coloured by the mixture model's component predictions.
# NOTE(review): `gmm` was fitted on an embedding produced by an earlier mapper;
# confirm that its components are meaningful in this mapper's coordinates.
plt.figure(figsize=SIZE)
plt.scatter(*embedded.T, c=gmm.predict(embedded), alpha=0.5, cmap='plasma')
plt.show()