# Using residuals to estimate a missing feature

## Set up notebook

In [1]:
import numpy as np
import sklearn as sk

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def standardise_data(X_train, X_test):
    """Standardise training and test features using training-set statistics.

    The scaler is fitted on ``X_train`` only and then applied to both sets,
    so the test set plays no part in computing the scaling.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the scaler.
    X_test : array-like
        Test features; transformed with the scaler fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(train_std, test_std)``: the scaled training and test features.
    """
    # Fit the scaler on the training set only
    scaler = StandardScaler().fit(X_train)

    # Apply the same fitted scaler to the training and test sets
    return scaler.transform(X_train), scaler.transform(X_test)

## Generate synthetic dataset for classification

In [3]:
# Build a synthetic two-class dataset of 5000 samples with three features,
# all of them informative (none redundant). The seed is fixed so the same
# data is generated on every run.
# Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html

X_full, y = make_classification(
    n_samples=5000,
    n_classes=2,
    n_features=3, n_informative=3, n_redundant=0,
    class_sep=1.5,
    random_state=42,
)

In [4]:
# Hold out 25% of the samples as a test set. The seed is fixed so that the
# split (and every result derived from it) is the same on each run; without
# it the data is reshuffled differently every time the cell is executed.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.25, random_state=42)

Keep the first two features as the model inputs, and hold the third one back as 'Z' (the "missing" feature).

In [5]:
# Columns 0 and 1 become the model inputs (X); column 2 is the held-back
# feature (Z). The same column split is applied to the training and test sets.
X_train, Z_train = X_train_full[:, :2], X_train_full[:, 2]
X_test, Z_test = X_test_full[:, :2], X_test_full[:, 2]

## Test model based on all features

In [6]:
# Scale all three features (standardise_data fits on the training set only)
X_train_std, X_test_std = standardise_data(X_train_full, X_test_full)

# Train a logistic regression classifier on the scaled training data
model = LogisticRegression().fit(X_train_std, y_train)

# Label the test set and report the share of correct predictions
y_pred_test = model.predict(X_test_std)
accuracy = (y_pred_test == y_test).mean()
print(f'Accuracy: {accuracy:0.3f}')

Accuracy: 0.973


## Test model based on two features

In [7]:
# Standardise the two retained features (scaler is fitted on the training set only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Fit model. LogisticRegression is imported directly at the top of the
# notebook, so use that name rather than reaching through the `sk` alias,
# which only resolves when the linear_model submodule is already loaded.
model = LogisticRegression()
model.fit(X_train_std, y_train)

# Predict test set labels and report the fraction predicted correctly
y_pred_test = model.predict(X_test_std)
accuracy = np.mean(y_pred_test == y_test)
print (f'Accuracy: {accuracy:0.3f}')

Accuracy: 0.686


In [8]:
# Show the predicted class probabilities for the test set from the most
# recently fitted model: one row per test sample, one column per class
# (presumably ordered as in model.classes_ — verify if column order matters).
model.predict_proba(X_test_std)

array([[0.80551727, 0.19448273],
       [0.50251199, 0.49748801],
       [0.16500738, 0.83499262],
       ...,
       [0.27287238, 0.72712762],
       [0.79231367, 0.20768633],
       [0.41547021, 0.58452979]])