Imputation should be done over the training set, and then propagated to the test set. This means that the values used to fill missing data in both the train and test sets (the mean/median or, for random sample imputation, the sampled observations) should be extracted from the train set only. This avoids overfitting, because no information from the test set leaks into the imputation.

In [None]:
# split the data into a training set (70% of the rows) and a testing set
# (30%), using a fixed seed so that the split is reproducible
# NOTE(review): the whole `data` frame (target column included) is passed as
# the features, and the target is `Survived` although the variables handled
# in this section (BsmtQual, FireplaceQu, GarageType) look like house-price
# fields -- confirm the target column name against the data loading cell

X_train, X_test, y_train, y_test = train_test_split(
    data,
    data.Survived,
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape


In [None]:
# bar plot of how many observations each BsmtQual label has in the train set,
# most frequent label first (missing values are not counted)

(
    X_train
    .groupby(['BsmtQual'])['BsmtQual']
    .count()
    .sort_values(ascending=False)
    .plot.bar()
)

In [None]:
# Make sure you understand every line of code.
# If unsure, run the lines separately in a notebook cell until you familiarise
# yourself with the output of each line.

def impute_na(df_train, df_test, variable):
    """Random-sample imputation of `variable`, learned from the train set only.

    Both dataframes are modified in place. Each one gains two columns:

    * ``<variable>_NA``     -- 1 where `variable` is missing, 0 otherwise
    * ``<variable>_random`` -- a copy of `variable` in which every missing
      value is replaced by a value drawn at random from the observed
      (non-missing) values of `variable` in `df_train`

    The original `variable` column is left untouched. Values for the test
    set are also drawn from the train set, so nothing is learned from the
    test data. The draws use ``random_state=0`` and are therefore
    reproducible.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; the source of the values used for imputation.
    df_test : pandas.DataFrame
        Testing data; imputed with values taken from `df_train`.
    variable : str
        Name of the column to impute; must be present in both dataframes.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If there are missing values to fill but `variable` has no observed
        value in `df_train` to draw from.
    """
    # donor pool: the observed values of the train set only
    donors = df_train[variable].dropna()

    frames = (df_train, df_test)
    masks = [frame[variable].isnull() for frame in frames]

    # fail early, before either dataframe has been modified
    if donors.empty and any(mask.any() for mask in masks):
        raise ValueError(
            "cannot impute '{}': it has no observed values in the "
            "training set to sample from".format(variable)
        )

    for frame, mask in zip(frames, masks):
        # add additional variable to indicate missingness
        frame[variable + '_NA'] = np.where(mask, 1, 0)

        # start from a copy of the original variable
        frame[variable + '_random'] = frame[variable]

        n_missing = int(mask.sum())
        if n_missing == 0:
            continue

        # extract the random sample to fill the na; sampling is without
        # replacement whenever the donor pool is large enough, and falls
        # back to sampling with replacement only when there are more
        # missing values than donors (which would otherwise raise)
        draws = donors.sample(
            n_missing,
            replace=n_missing > len(donors),
            random_state=0,
        )

        # pandas aligns on the index when assigning, so the sampled values
        # must carry the index of the rows they are going to fill
        draws.index = frame.index[mask.values]
        frame.loc[mask, variable + '_random'] = draws

In [None]:
# and let's replace the NA in each of the categorical variables
# (the first entry used to be the literal 'columns', which looks like a
# placeholder rather than a variable name; 'BsmtQual' is the variable
# whose labels are inspected earlier in this section)
for variable in ['BsmtQual', 'FireplaceQu', 'GarageType']:
    impute_na(X_train, X_test, variable)