In [None]:
!python -m pip install doubtlab

In [None]:
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import CleanlabReason, ProbaReason, WrongPredictionReason
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


# Doubtlab


Doubtlab provides general tricks that may help you find bad, or noisy, labels in your dataset. You can use doubtlab to check your own datasets for bad labels. Many of the methods provided are based on the interaction between a dataset and a model trained on that dataset.

Doubtlab provides many methods for bad/noisy label detection.

### General Reasons
- **RandomReason:** assign doubt randomly, just to be sure
- **OutlierReason:** assign doubt when the model declares a row an outlier

### Classification Reasons
- **ProbaReason:** assign doubt when a model's confidence values are low
- **LongConfidenceReason:** assign doubt when a wrong class gains too much confidence
- **ShortConfidenceReason:** assign doubt when the correct class gains too little confidence
- **DisagreeReason:** assign doubt when two models disagree on a prediction
- **WrongPredictionReason:** assign doubt when a model cannot predict the listed label
- **OutlierReason:** assign doubt when the model declares a row an outlier
- **CleanlabReason:** assign doubt according to cleanlab

### Regression Reasons
- **AbsoluteDifferenceReason:** assign doubt when the absolute difference is too high
- **RelativeDifferenceReason:** assign doubt when the relative difference is too high

https://github.com/koaning/doubtlab <br>
https://koaning.github.io/doubtlab/ <br>


In [None]:
# Read the competition's train and test CSV files from the Kaggle input
# directory into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/test.csv',
    )
)

In [None]:
# Pick the feature columns by their 'f' prefix. A plain substring test
# ('f' in col) would also select any column that merely contains an "f"
# somewhere in its name, so match on the prefix instead.
# NOTE(review): assumes feature columns are named f0, f1, ... - confirm
# against the header of train.csv.
features = [col for col in train.columns if col.startswith('f')]

# Labels to audit and the feature matrix the model is trained on.
y = train['target']
X = train[features]

In [None]:
# Standardise the features; note that X is rebound to the scaler's output.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Fit a logistic-regression classifier on the scaled features. The fixed
# random_state keeps the liblinear solver reproducible between runs.
model = LogisticRegression(
    solver='liblinear',
    max_iter=1_000,
    random_state=42,
)
model.fit(X, y)

In [None]:
# Each keyword names one doubt "reason"; both reuse the fitted model.
reasons = dict(
    proba=ProbaReason(model=model),
    wrong_pred=WrongPredictionReason(model=model),
)
doubt = DoubtEnsemble(**reasons)

# Ordered indices of the examples worth checking again.
indices = doubt.get_indices(X, y)

# DataFrame holding the per-reason predicates behind that ordering
# (later cells read its 'predicate_wrong_pred' column).
predicates = doubt.get_predicates(X, y)

In [None]:
# Count the rows for each value of the wrong-prediction predicate.
# NOTE(review): 1 presumably marks rows flagged by WrongPredictionReason - confirm.
wrong_pred_flag = predicates['predicate_wrong_pred']
n_flag_0 = len(predicates[wrong_pred_flag == 0])
n_flag_1 = len(predicates[wrong_pred_flag == 1])

# Pie chart of the two shares, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(15, 7))
ax.pie(
    [n_flag_0, n_flag_1],
    labels=["0", "1"],
    autopct='%1.1f%%',
    colors=["#17becf", "#1f77b4"],
)
ax.set_title('Wrong Prediction Reason')

In [None]:
# Persist the predicate table to the working directory so it can be
# inspected outside the notebook.
predicates.to_csv(path_or_buf='predicates.csv')

In [None]:
# Show a random preview of up to 40 predicate rows.
# - Sampling 40 rows directly replaces drawing 100 and keeping the first 40.
# - A fixed random_state makes the preview identical on every run,
#   matching the seed used for the model.
# - Capping n at the frame length avoids a ValueError on small frames.
predicates.sample(n=min(40, len(predicates)), random_state=42)