# Summary

We used logistic regression to try to find and correct the mislabeled samples in the training set. We performed four iterations, each time relabelling the most discrepant samples.

The underlying model does not seem to be a simple logistic regression, so there is a risk that some of the corrections are wrong, especially in the later iterations.

We output the target values we corrected in each iteration as \*.npy files, together with the predictions of the logistic regression. The score rises from 0.74529 to 0.74637 between the first and fourth iteration.

Built upon the [notebook](https://www.kaggle.com/hamzaghanmi/make-it-simple).

# Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

# Load the data and scale the features

In [None]:
# Single place to change when the competition files live somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-nov-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# NOTE(review): `y` is taken straight from train['target'] (no .copy()).
# Whether it observes later in-place edits of that column depends on pandas'
# view/copy semantics - confirm before relying on it after any relabelling.
y = train['target']

In [None]:
# Names of the 100 feature columns: 'f0', 'f1', ..., 'f99'.
cols = [f'f{idx}' for idx in range(100)]

In [None]:
# Standardise the feature columns: the scaler's statistics are learned from
# the training features only and then applied to both train and test.
scaler = StandardScaler()
scaler.fit(train[cols])
train[cols] = scaler.transform(train[cols])
test[cols] = scaler.transform(test[cols])

# Helper functions

In [None]:
def fit_linear_regression(train, test):
    """
    Fits logistic regression to the train data, prints the accuracy and auc
    and returns the fitted model and test predictions.

    Despite its name, the model fitted here is a LogisticRegression classifier.

    The labels are read from train['target'] at call time rather than from a
    module-level alias, so the fit always uses whatever labels the passed-in
    frame currently holds (including any relabelling applied to it).

    Parameters
    ----------
    train : DataFrame containing the feature columns listed in the global
        `cols` and a 'target' column with the labels.
    test : DataFrame containing the feature columns listed in the global `cols`.

    Returns
    -------
    (model, preds_test) : the fitted LogisticRegression and the predicted
        probability of class 1 (second predict_proba column) per test row.
    """
    target = train['target']

    model = LogisticRegression(solver='liblinear')
    model.fit(train[cols], target)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_test = model.predict_proba(test[cols])[:, 1]
    preds_train = model.predict_proba(train[cols])[:, 1]

    # Both metrics are in-sample: they are computed on the data used to fit.
    auc = roc_auc_score(target, preds_train)
    acc = accuracy_score(target, model.predict(train[cols]))

    print(f"accuracy: {round(acc*100,3)} , auc: {round(auc*100,3)}")

    return model, preds_test

In [None]:
def plot_histograms(train, var = 'LRLC', b = np.arange(-4, 4, 0.1)):
    """
    Plots two histograms - distributions of "var" for training samples with target zero / one.

    Each class is drawn in its own figure, and every figure is flushed with
    plt.show() so that later plotting calls cannot draw onto the second one.

    Parameters
    ----------
    train : DataFrame with a 'target' column and a column named `var`.
    var : name of the column whose distribution is plotted.
    b : bin edges forwarded to plt.hist (the default array is built once,
        when the function is defined, and is never modified here).
    """
    for label in (0, 1):
        subset = train[train['target'] == label]

        plt.hist(subset[var], bins = b, alpha = 0.3)
        plt.title(f'target = {label}')
        plt.xlabel(var)
        # Reference line at x = 0.
        plt.axvline(0, color = 'k')

        plt.show()

# First round

Fit the model, save the resulting predictions

In [None]:
# Round 1: fit on the labels as loaded and export the test-set probabilities.
model, preds_test = fit_linear_regression(train=train, test=test)
sub['target'] = preds_test
sub.to_csv('submission.csv', index=False)

Add the logistic regression variable to the training set, plot its distribution on target == 0 and target == 1 subsets

In [None]:
# Linear predictor (log-odds) of the fitted model: intercept + coefficients . features.
c0 = model.intercept_[0]
ci = model.coef_[0]

# One vectorised matrix-vector product over the feature columns in `cols`
# (the same columns, in the same order, the model was fitted on) replaces the
# per-column accumulation loop and its hard-coded feature count.
train['LRLC'] = train[cols].to_numpy() @ ci + c0
test['LRLC'] = test[cols].to_numpy() @ ci + c0

plot_histograms(train, 'LRLC', np.arange(-3, 3, 0.1))

Correct the samples we believe are mislabeled. Save the result.

In [None]:
# Round-1 cut-off on 'LRLC': a sample is treated as mislabelled when its score
# lies more than THR on the opposite side of zero from its current label.
THR = 0.3
mislabelled_guys = (
    (train['LRLC'].gt(THR) & train['target'].eq(0))
    | (train['LRLC'].lt(-THR) & train['target'].eq(1))
)
# Flip 0 <-> 1 for the selected rows, then persist the corrected labels.
train.loc[mislabelled_guys, 'target'] = 1 - train.loc[mislabelled_guys, 'target']
np.save('targets1.npy', train['target'].values)

# Second round

From now on, we just repeat the same procedure three more times, always saving the results.

In [None]:
# Round 2: refit on the current labels and export the test-set probabilities.
model, preds_test = fit_linear_regression(train=train, test=test)
sub['target'] = preds_test
sub.to_csv('submission2.csv', index=False)

In [None]:
# Linear predictor (log-odds) of the round-2 model: intercept + coefficients . features.
c0 = model.intercept_[0]
ci = model.coef_[0]

# One vectorised matrix-vector product over the feature columns in `cols`
# (the same columns, in the same order, the model was fitted on) replaces the
# per-column accumulation loop and its hard-coded feature count.
train['LRLC2'] = train[cols].to_numpy() @ ci + c0
test['LRLC2'] = test[cols].to_numpy() @ ci + c0

In [None]:
# Per-class distribution of the round-2 score, using the default bins.
plot_histograms(train, var='LRLC2')

In [None]:
# Round-2 cut-off on 'LRLC2': a sample is treated as mislabelled when its score
# lies more than THR on the opposite side of zero from its current label.
THR = 1.2
mislabelled_guys = (
    (train['LRLC2'].gt(THR) & train['target'].eq(0))
    | (train['LRLC2'].lt(-THR) & train['target'].eq(1))
)
# Flip 0 <-> 1 for the selected rows, then persist the corrected labels.
train.loc[mislabelled_guys, 'target'] = 1 - train.loc[mislabelled_guys, 'target']
np.save('targets2.npy', train['target'].values)

# Third round

In [None]:
# Round 3: refit on the current labels and export the test-set probabilities.
model, preds_test = fit_linear_regression(train=train, test=test)
sub['target'] = preds_test
sub.to_csv('submission3.csv', index=False)

In [None]:
# Linear predictor (log-odds) of the round-3 model: intercept + coefficients . features.
c0 = model.intercept_[0]
ci = model.coef_[0]

# One vectorised matrix-vector product over the feature columns in `cols`
# (the same columns, in the same order, the model was fitted on) replaces the
# per-column accumulation loop and its hard-coded feature count.
train['LRLC3'] = train[cols].to_numpy() @ ci + c0
test['LRLC3'] = test[cols].to_numpy() @ ci + c0

plot_histograms(train, 'LRLC3')

In [None]:
# Round-3 cut-off on 'LRLC3': a sample is treated as mislabelled when its score
# lies more than THR on the opposite side of zero from its current label.
THR = 1.2
mislabelled_guys = (
    (train['LRLC3'].gt(THR) & train['target'].eq(0))
    | (train['LRLC3'].lt(-THR) & train['target'].eq(1))
)
# Flip 0 <-> 1 for the selected rows, then persist the corrected labels.
train.loc[mislabelled_guys, 'target'] = 1 - train.loc[mislabelled_guys, 'target']
np.save('targets3.npy', train['target'].values)

# Fourth round

In [None]:
# Round 4: refit on the current labels and export the test-set probabilities.
model, preds_test = fit_linear_regression(train=train, test=test)
sub['target'] = preds_test
sub.to_csv('submission4.csv', index=False)

In [None]:
# Linear predictor (log-odds) of the round-4 model: intercept + coefficients . features.
c0 = model.intercept_[0]
ci = model.coef_[0]

# One vectorised matrix-vector product over the feature columns in `cols`
# (the same columns, in the same order, the model was fitted on) replaces the
# per-column accumulation loop and its hard-coded feature count.
train['LRLC4'] = train[cols].to_numpy() @ ci + c0
test['LRLC4'] = test[cols].to_numpy() @ ci + c0

plot_histograms(train, 'LRLC4')

In [None]:
# Round-4 cut-off on 'LRLC4': a sample is treated as mislabelled when its score
# lies more than THR on the opposite side of zero from its current label.
THR = 1.2
mislabelled_guys = (
    (train['LRLC4'].gt(THR) & train['target'].eq(0))
    | (train['LRLC4'].lt(-THR) & train['target'].eq(1))
)
# Flip 0 <-> 1 for the selected rows, then persist the corrected labels.
train.loc[mislabelled_guys, 'target'] = 1 - train.loc[mislabelled_guys, 'target']

np.save('targets4.npy', train['target'].values)