In [1]:
#r "nuget: Microsoft.ML"
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

In [2]:
// Fetch the badges data set from the UCI archive only when no local copy
// exists, so re-running the notebook does not download it again.
if (File.Exists("badges.data") == false)
{
    using (var downloader = new WebClient())
    {
        downloader.DownloadFile(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/badges/badges.data",
            "badges.data");
    }
}

// Peek at the first five raw lines to see the record layout.
File.ReadLines("badges.data").Take(5)

index,value
0,
1,+ Naoki Abe
2,- Myriam Abramson
3,+ David W. Aha
4,+ Kamal M. Ali


> # Data Set Information:

> Part of the problem in using an automated program to discover the unknown target function is to decide how to encode names such that the program can be used. The data below are presented in the form of a +/- label followed by the person's name. It is up to the learning-system user to decide how to convert this data into something usable by the system (e.g., what attributes to use if your favorite learner requires feature-vector data).

In [49]:
/// <summary>
/// One record of the badges file split into its label and name parts.
/// </summary>
class ParsedRow
{
    /// <summary>Whether the record carries the '+' label.</summary>
    public bool IsPlus { get; set; }

    /// <summary>First name taken from the record.</summary>
    public string FirstName { get; set; }

    /// <summary>Middle initial taken from the record, without the trailing period.</summary>
    public string MiddleInitial { get; set; }

    /// <summary>Last name taken from the record.</summary>
    public string LastName { get; set; }
}

In [57]:
using System.Text.RegularExpressions;

/// <summary>
/// Tells whether a raw record starts with the '+' label.
/// </summary>
/// <param name="str">Raw record text; may be null or empty.</param>
/// <returns>
/// <c>true</c> only when the first character is '+'; <c>false</c> for null,
/// empty, or any other leading character.
/// </returns>
static bool IsPlus(string str)
{
    // Nothing to inspect: treat a missing record as "not plus".
    if (string.IsNullOrEmpty(str))
    {
        return false;
    }

    return str[0] == '+';
}

/// <summary>
/// Returns the text captured by the first capture group of
/// <paramref name="pattern"/> when it matches <paramref name="str"/>.
/// </summary>
/// <param name="str">Text to search; <c>null</c> is treated as "no match".</param>
/// <param name="pattern">Regular expression whose first capture group holds the wanted value.</param>
/// <returns>The value of capture group 1, or an empty string when the pattern does not match.</returns>
static string GetMatch(string str, string pattern)
{
    // Regex.Match throws on null input; report "nothing found" instead,
    // in line with IsPlus, which answers false for a null record.
    if (str is null)
    {
        return "";
    }

    var match = Regex.Match(str, pattern);

    // Decide on Match.Success rather than on the number of groups: the group
    // count depends on how many capture groups the pattern declares, so a
    // count test silently returns "" for a successful match against a pattern
    // with more than one group.
    return match.Success ? match.Groups[1].Value : "";
}

/// <summary>
/// Extracts the first name: the run of ASCII letters that follows the leading
/// one-character label and a space, and is itself followed by a space.
/// </summary>
/// <param name="str">Raw record text.</param>
/// <returns>The first name, or an empty string when the record does not fit that shape.</returns>
static string GetFirstName(string str)
{
    const string firstNamePattern = @"^. ([a-zA-Z]+) ";
    return GetMatch(str, firstNamePattern);
}
    
/// <summary>
/// Extracts the middle initial: a single ASCII letter that is preceded by a
/// space and followed by a period and a space.
/// </summary>
/// <param name="str">Raw record text.</param>
/// <returns>The initial without its period, or an empty string when none is present.</returns>
static string GetMiddle(string str)
{
    const string middleInitialPattern = @" ([a-zA-Z]{1})\. ";
    return GetMatch(str, middleInitialPattern);
}

/// <summary>
/// Extracts the last name: the run of ASCII letters that follows a space and
/// reaches the end of the record.
/// </summary>
/// <param name="str">Raw record text.</param>
/// <returns>The last name, or an empty string when the record does not end that way.</returns>
static string GetLastName(string str)
{
    const string lastNamePattern = @" ([a-zA-Z]+)$";
    return GetMatch(str, lastNamePattern);
}

In [58]:
// Spot check: a record whose label is '-' should not be reported as a plus.
IsPlus("- Ray Vernagus")

In [59]:
// Spot check: the first name is the word right after the label.
GetFirstName("+ Ray Vernagus")

Ray

In [60]:
// Spot check: the middle initial comes back without its trailing period.
GetMiddle("+ Ray W. Vernagus")

W

In [61]:
// Spot check: the last name is the final word of the record.
GetLastName("+ Ray Vernagus")

Vernagus

In [62]:
// Preview the parser on the first five records. Skip(1) drops the first line
// of the file, which the raw preview earlier in the notebook shows to be blank.
(from line in File.ReadLines("badges.data").Skip(1).Take(5)
 select new ParsedRow
 {
     IsPlus = IsPlus(line),
     FirstName = GetFirstName(line),
     MiddleInitial = GetMiddle(line),
     LastName = GetLastName(line)
 })

index,IsPlus,FirstName,MiddleInitial,LastName
0,True,Naoki,,Abe
1,False,Myriam,,Abramson
2,True,David,W,Aha
3,True,Kamal,M,Ali
4,False,Eric,,Allender


In [66]:
// Parse every record into a ParsedRow and materialise the result as a list.
// Skip(1) drops the first line of the file, which the raw preview earlier in
// the notebook shows to be blank.
var data = File.ReadLines("badges.data")
    .Skip(1)
    .Select(rawLine =>
    {
        var row = new ParsedRow();
        row.IsPlus = IsPlus(rawLine);
        row.FirstName = GetFirstName(rawLine);
        row.MiddleInitial = GetMiddle(rawLine);
        row.LastName = GetLastName(rawLine);
        return row;
    })
    .ToList();

In [68]:
// Break the first parsed row's first name into its individual characters.
data.First().FirstName.ToCharArray()

index,value
0,N
1,a
2,o
3,k
4,i
