# Getting Started: Mean Imputation

This notebook shows a simple method for imputing missing values and includes code for creating a submission file.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from sklearn.impute import SimpleImputer

In [None]:
# Directory that both competition CSV files are read from.
input_path = Path('/kaggle/input/tabular-playground-series-jun-2022/')

# Sample submission, indexed by its `row-col` key column.
submission = pd.read_csv(
    input_path.joinpath('sample_submission.csv'),
    index_col='row-col',
)

# Feature table, indexed by `row_id`.
data = pd.read_csv(
    input_path.joinpath('data.csv'),
    index_col='row_id',
)

## Use scikit-learn SimpleImputer for a simple benchmark

There are many ways to impute missing values in data. One of the easiest is to replace each missing value of a feature with the mean of that feature's observed values. `scikit-learn` provides a class for this: `SimpleImputer`.

In [None]:
# Mean imputation: every NaN is replaced by the mean of its own column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

# fit_transform returns a plain array; assigning through `data[:]` writes the
# imputed values back into the existing frame, so its row index and column
# labels are kept for the lookups that follow.
imputed_values = imp.fit_transform(data)
data[:] = imputed_values

## Use `row-col` from the sample submission to find the imputed values

In [None]:
# Each submission key has the form '<row_id>-<column name>'. Split on the first
# hyphen only, so a column name that itself contains '-' is kept whole.
key_parts = [key.split('-', 1) for key in submission.index]
row_ids = [int(row_part) for row_part, _ in key_parts]
col_names = [col_part for _, col_part in key_parts]

# Translate labels to integer positions once, instead of performing one
# label-based .loc read and one .loc write per submission row.
row_pos = data.index.get_indexer(row_ids)
col_pos = data.columns.get_indexer(col_names)

# get_indexer marks unknown labels with -1, which NumPy would silently treat as
# "last row/column"; fail loudly instead, as the per-row .loc lookup did.
unknown = (row_pos < 0) | (col_pos < 0)
if unknown.any():
    raise KeyError(
        f"submission keys not found in data: {list(submission.index[unknown][:5])}"
    )

# Paired integer-array indexing picks exactly one cell per submission key.
submission['value'] = data.to_numpy()[row_pos, col_pos]

submission.to_csv('mean_benchmark.csv')