In [1]:
#r "nuget: Microsoft.ML"
using Microsoft.ML;
using Microsoft.ML.Data;

This example comes from the ML.NET documentation: https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.featureselectioncatalog.selectfeaturesbasedoncount?view=ml-dotnet

In [2]:
/// <summary>
/// Row shape used to read the feature-selected data back into .NET objects
/// (it is the type argument passed to <c>CreateEnumerable</c> later in this notebook).
/// </summary>
/// <remarks>
/// The property names match the columns of <c>InputData</c>. Unlike <c>InputData</c>,
/// the vectors here carry no <c>[VectorType]</c> size, because the selection step can
/// drop slots (the rendered output shows 2-element vectors).
/// </remarks>
class TransformedData
{
    /// <summary>Numeric slots that survived feature selection.</summary>
    public float[] NumericVector { get; set; }

    /// <summary>String slots that survived feature selection.</summary>
    public string[] StringVector { get; set; }
}

/// <summary>
/// Row shape of the raw sample data loaded into ML.NET.
/// </summary>
/// <remarks>
/// Both columns are declared as fixed-size vectors of three slots via
/// <c>[VectorType(3)]</c>; every sample row in this notebook supplies exactly three values.
/// </remarks>
class InputData
{
    /// <summary>Three numeric slots; the sample rows use <c>float.NaN</c> for some of them.</summary>
    [VectorType(3)]
    public float[] NumericVector { get; set; }

    /// <summary>Three string slots; the sample rows use <c>""</c> or <c>null</c> for some of them.</summary>
    [VectorType(3)]
    public string[] StringVector { get; set; }
}

In [3]:
/// <summary>
/// Builds the four-row in-memory sample used throughout this notebook.
/// </summary>
/// <returns>
/// A list of <c>InputData</c> rows, each with a 3-slot numeric vector and a 3-slot
/// string vector. Some slots deliberately hold <c>float.NaN</c>, <c>""</c> or <c>null</c>.
/// </returns>
static IEnumerable<InputData> GetData()
{
    // Tiny factory so that every sample row reads as a single line below.
    InputData Row(float[] numbers, string[] labels)
    {
        return new InputData { NumericVector = numbers, StringVector = labels };
    }

    var rows = new List<InputData>(4);
    rows.Add(Row(new[] { 4f, float.NaN, 6f }, new string[] { "A", "WA", "Male" }));
    rows.Add(Row(new[] { 4f, 5f, 6f }, new string[] { "A", "", "Female" }));
    rows.Add(Row(new[] { 4f, 5f, 6f }, new string[] { "A", "NY", null }));
    rows.Add(Row(new[] { 4f, float.NaN, float.NaN }, new string[] { "A", null, "Male" }));
    return rows;
}

In [4]:
// Entry point for the ML.NET operations used in the following cells.
MLContext mlContext = new MLContext();

In [7]:
// Fetch the sample rows; the bare expression on the last line makes the notebook render them.
IEnumerable<InputData> rawData = GetData();
rawData

index,NumericVector,StringVector
0,"[ 4, NaN, 6 ]","[ A, WA, Male ]"
1,"[ 4, 5, 6 ]","[ A, , Female ]"
2,"[ 4, 5, 6 ]","[ A, NY, <null> ]"
3,"[ 4, NaN, NaN ]","[ A, <null>, Male ]"


In [8]:
// Wrap the in-memory rows as an IDataView so ML.NET transforms can consume them.
IDataView data = mlContext.Data.LoadFromEnumerable<InputData>(rawData);

We will use the SelectFeaturesBasedOnCount transform estimator to retain only those slots that have at least 'count' non-default values.

Multi-column example: this pipeline transforms two columns using the provided parameters.

In [9]:
// Apply count-based feature selection to both vector columns at once.
// With count: 3, a slot is kept only if at least three rows hold a non-default
// value in it; in this sample that removes the middle slot of each column.
var pipeline = mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(
    new[]
    {
        new InputOutputColumnPair("NumericVector"),
        new InputOutputColumnPair("StringVector"),
    },
    count: 3);

In [10]:
// Fit the selector on the sample data, then apply the fitted transform to the same data.
IDataView transformedData = pipeline
    .Fit(data)
    .Transform(data);

In [13]:
// Read the transformed rows back as TransformedData objects (a fresh object per row)
// and let the notebook render the resulting sequence.
mlContext.Data.CreateEnumerable<TransformedData>(
    data: transformedData,
    reuseRowObject: false)

index,NumericVector,StringVector
0,"[ 4, 6 ]","[ A, Male ]"
1,"[ 4, 6 ]","[ A, Female ]"
2,"[ 4, 6 ]","[ A, ]"
3,"[ 4, NaN ]","[ A, Male ]"
