In [23]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Tools for the HASY dataset.

Type `./hasy_tools.py --help` for the command line tools and `help(hasy_tools)`
in the interactive Python shell for the module options of hasy_tools.
"""

import logging
import csv
import os
import random
from PIL import Image, ImageDraw
import sys
from sklearn.model_selection import train_test_split

import numpy as np
import scipy.ndimage
import matplotlib.pyplot as plt

In [None]:
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO,
                    stream=sys.stdout)

__version__ = "v2.0"

In [None]:
def _load_csv(filepath, delimiter=',', quotechar="'"):
    """
    Load a CSV file.

    Parameters
    ----------
    filepath : str
        Path to a CSV file
    delimiter : str, optional
    quotechar : str, optional

    Returns
    -------
    list of dicts : Each line of the CSV file is one element of the list.
    """
    data = []
    csv_dir = os.path.dirname(filepath)
    with open(filepath, 'rb') as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=delimiter,
                                quotechar=quotechar)
        for row in reader:
            if 'path' in row:
                row['path'] = os.path.abspath(os.path.join(csv_dir,
                                                           row['path']))
            data.append(row)
    return data

In [None]:
def generate_index(csv_filepath):
    """
    Generate an index 0...k for the k labels.

    Parameters
    ----------
    csv_filepath : str
        Path to 'test.csv' or 'train.csv'

    Returns
    -------
    dict : Maps a symbol_id as in test.csv and
        train.csv to an integer in 0...k, where k is the total
        number of unique labels.
    """
    symbol_id2index = {}
    data = _load_csv(csv_filepath)
    i = 0
    for item in data:
        if item['symbol_id'] not in symbol_id2index:
            symbol_id2index[item['symbol_id']] = i
            i += 1
    return symbol_id2index

In [2]:
def load_images(csv_filepath, symbol_id2index, one_hot=True, flatten=False):
    """
    Load the images into a 4D uint8 numpy array [index, y, x, depth].

    Parameters
    ----------
    csv_filepath : str
        'test.csv' or 'train.csv'
    symbol_id2index : dict
        Dictionary generated by generate_index
    one_hot : bool, optional
        Make label vector as 1-hot encoding, otherwise index
    flatten : bool, optional
        Flatten feature vector

    Returns
    -------
    images, labels : Images is a 4D uint8 numpy array [index, y, x, depth]
                     and labels is a 2D uint8 numpy array [index][1-hot enc].
    """
    WIDTH, HEIGHT = 32, 32
    dataset_path = os.path.dirname(csv_filepath)  # Main directory of HASY
    data = _load_csv(csv_filepath)
    if flatten:
        images = np.zeros((len(data), WIDTH * HEIGHT))
    else:
        images = np.zeros((len(data), WIDTH, HEIGHT, 1))
    labels = []
    for i, data_item in enumerate(data):
        fname = os.path.join(dataset_path, data_item['path'])
        if flatten:
            img = scipy.ndimage.imread(fname, flatten=False, mode='L')
            images[i, :] = img.flatten()
        else:
            images[i, :, :, 0] = scipy.ndimage.imread(fname,
                                                      flatten=False,
                                                      mode='L')
        label = symbol_id2index[data_item['symbol_id']]
        labels.append(label)
    data = images, np.array(labels)
    if one_hot:
        data = (data[0], np.eye(len(symbol_id2index))[data[1]])
    return data

In [3]:
def _is_valid_png(filepath):
    """
    Check if the PNG image is valid.

    Parameters
    ----------
    filepath : str
        Path to a PNG image

    Returns
    -------
    bool : True if the PNG image is valid, otherwise False.
    """
    try:
        test = Image.open(filepath)
        test.close()
        return True
    except:
        return False

In [4]:
def _verify_all():
    """Verify all PNG files in the training and test directories."""
    for csv_data_path in ['classification-task/fold-1/test.csv',
                          'classification-task/fold-1/train.csv']:
        train_data = _load_csv(csv_data_path)
        for data_item in train_data:
            if not _is_valid_png(data_item['path']):
                logging.info("%s is invalid." % data_item['path'])
        logging.info("Checked %i items of %s." %
                     (len(train_data), csv_data_path))


def create_random_overview(img_src, x_images, y_images):
    """Create a random overview of images."""
    # Create canvas
    background = Image.new('RGB',
                           (35 * x_images, 35 * y_images),
                           (255, 255, 255))
    bg_w, bg_h = background.size
    # Paste image on canvas
    for x in range(x_images):
        for y in range(y_images):
            path = random.choice(img_src)['path']
            img = Image.open(path, 'r')
            img_w, img_h = img.size
            offset = (35 * x, 35 * y)
            background.paste(img, offset)
    # Draw lines
    draw = ImageDraw.Draw(background)
    for y in range(y_images):  # horizontal lines
        draw.line((0, 35 * y - 2, 35 * x_images, 35 * y - 2), fill=0)
    for x in range(x_images):  # vertical lines
        draw.line((35 * x - 2, 0, 35 * x - 2, 35 * y_images), fill=0)
    # Store
    background.save('hasy-overview.png')

In [5]:
def _get_colors(data, verbose=False):
    """
    Get how often each color is used in data.

    Parameters
    ----------
    data : dict
        with key 'path' pointing to an image
    verbose : bool, optional

    Returns
    -------
    color_count : dict
        Maps a grayscale value (0..255) to how often it was in `data`
    """
    color_count = {}
    for i in range(256):
        color_count[i] = 0
    for i, data_item in enumerate(data):
        if i % 1000 == 0 and i > 0 and verbose:
            print("%i of %i done" % (i, len(data)))
        fname = os.path.join('.', data_item['path'])
        img = scipy.ndimage.imread(fname, flatten=False, mode='L')
        for row in img:
            for pixel in row:
                color_count[pixel] += 1
    return color_count

In [6]:
def data_by_class(data):
    """
    Organize `data` by class.

    Parameters
    ----------
    data : list of dicts
        Each dict contains the key `symbol_id` which is the class label.

    Returns
    -------
    dbc : dict
        mapping class labels to lists of dicts
    """
    dbc = {}
    for item in data:
        if item['symbol_id'] in dbc:
            dbc[item['symbol_id']].append(item)
        else:
            dbc[item['symbol_id']] = [item]
    return dbc

In [7]:
def _get_color_statistics(csv_filepath, verbose=False):
    """
    Count how often white / black is in the image.

    Parameters
    ----------
    csv_filepath : str
        'test.csv' or 'train.csv'
    verbose : bool, optional
    """
    symbolid2latex = _get_symbolid2latex()
    data = _load_csv(csv_filepath)
    black_level, classes = [], []
    for symbol_id, elements in data_by_class(data).items():
        colors = _get_colors(elements)
        b = colors[0]
        w = colors[255]
        black_level.append(float(b) / (b + w))
        classes.append(symbol_id)
        if verbose:
            print("%s:\t%0.4f" % (symbol_id, black_level[-1]))
    print("Average black level: %0.4f" % np.average(black_level))
    print("Median black level: %0.4f" % np.median(black_level))
    print("Minimum black level: %0.4f (class: %s)" %
          (min(black_level),
           [symbolid2latex[c]
            for bl, c in zip(black_level, classes) if bl <= min(black_level)]))
    print("Maximum black level: %0.4f (class: %s)" %
          (max(black_level),
           [symbolid2latex[c]
            for bl, c in zip(black_level, classes) if bl >= max(black_level)]))

In [8]:
def _get_symbolid2latex(csv_filepath='symbols.csv'):
    """Return a dict mapping symbol_ids to LaTeX code."""
    symbol_data = _load_csv(csv_filepath)
    symbolid2latex = {}
    for row in symbol_data:
        symbolid2latex[row['symbol_id']] = row['latex']
    return symbolid2latex

In [9]:
def _analyze_class_distribution(csv_filepath,
                                max_data=1000,
                                bin_size=25):
    """Plot the distribution of training data over graphs."""
    symbol_id2index = generate_index(csv_filepath)
    index2symbol_id = {}
    for index, symbol_id in symbol_id2index.items():
        index2symbol_id[symbol_id] = index
    data, y = load_images(csv_filepath, symbol_id2index, one_hot=False)

    data = {}
    for el in y:
        if el in data:
            data[el] += 1
        else:
            data[el] = 1
    classes = data
    images = len(y)

    # Create plot
    print("Classes: %i" % len(classes))
    print("Images: %i" % images)

    class_counts = sorted([count for _, count in classes.items()])
    print("\tmin: %i" % min(class_counts))

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # plt.title('HASY training data distribution')
    plt.xlabel('Amount of available testing images')
    plt.ylabel('Number of classes')

    # Where we want the ticks, in pixel locations
    ticks = [int(el) for el in list(np.linspace(0, 200, 21))]
    # What those pixel locations correspond to in data coordinates.
    # Also set the float format here
    ax1.set_xticks(ticks)
    labels = ax1.get_xticklabels()
    plt.setp(labels, rotation=30)

    min_examples = 0
    ax1.hist(class_counts, bins=range(min_examples, max_data + 1, bin_size))
    # plt.show()
    filename = '{}.pdf'.format('data-dist')
    plt.savefig(filename)
    logging.info("Plot has been saved as {}".format(filename))

    symbolid2latex = _get_symbolid2latex()

    top10 = sorted(classes.items(), key=lambda n: n[1], reverse=True)[:10]
    top10_data = 0
    for index, count in top10:
        print("\t%s:\t%i" % (symbolid2latex[index2symbol_id[index]], count))
        top10_data += count
    total_data = sum([count for index, count in classes.items()])
    print("Top-10 has %i training data (%0.2f%% of total)" %
          (top10_data, float(top10_data) * 100.0 / total_data))
    print("%i classes have more than %i data items." %
          (sum([1 for _, count in classes.items() if count > max_data]),
           max_data))

In [10]:
def _analyze_pca(csv_filepath):
    """
    Analyze how much data can be compressed.

    Parameters
    ----------
    csv_filepath : str
        Path relative to dataset_path to a CSV file which points to images
    """
    from sklearn.decomposition import PCA
    import itertools as it

    symbol_id2index = generate_index(csv_filepath)
    data, y = load_images(csv_filepath, symbol_id2index, one_hot=False)
    data = data.reshape(data.shape[0], data.shape[1] * data.shape[2])
    pca = PCA()
    pca.fit(data)
    sum_ = 0.0
    done_values = [None, None, None]
    done_points = [False, False, False]
    chck_points = [0.9, 0.95, 0.99]
    for counter, el in enumerate(pca.explained_variance_ratio_):
        sum_ += el
        for check_point, done, i in zip(chck_points, done_points, it.count()):
            if not done and sum_ >= check_point:
                done_points[i] = counter
                done_values[i] = sum_
    for components, variance in zip(done_points, done_values):
        print("%i components explain %0.2f of the variance" %
              (components, variance))

In [11]:
def _get_euclidean_dist(e1, e2):
    """Calculate the euclidean distance between e1 and e2."""
    e1 = e1.flatten()
    e2 = e2.flatten()
    return sum([(el1 - el2)**2 for el1, el2 in zip(e1, e2)])**0.5

In [12]:
def _inner_class_distance(data):
    """Measure the eucliden distances of one class to the mean image."""
    distances = []
    mean_img = None
    for e1 in data:
        fname1 = os.path.join('.', e1['path'])
        img1 = scipy.ndimage.imread(fname1, flatten=False, mode='L')
        if mean_img is None:
            mean_img = img1.tolist()
        else:
            mean_img += img1
    mean_img = mean_img / float(len(data))
    # mean_img = thresholdize(mean_img, 'auto')
    scipy.misc.imshow(mean_img)
    for e1 in data:
        fname1 = os.path.join('.', e1['path'])
        img1 = scipy.ndimage.imread(fname1, flatten=False, mode='L')
        dist = _get_euclidean_dist(img1, mean_img)
        distances.append(dist)

    return (distances, mean_img)

In [13]:
def thresholdize(img, threshold=0.5):
    """Create a black-and-white image from a grayscale image."""
    img_new = []
    if threshold == 'auto':
        img_flat = sorted(img.flatten())
        threshold_ind = int(0.85 * len(img_flat))
        threshold = img_flat[threshold_ind]
    for row in img:
        bla = []
        for col in row:
            if col > threshold:
                bla.append(1)
            else:
                bla.append(0)
        img_new.append(bla)
    return np.array(img_new)

In [14]:
def _analyze_distances(csv_filepath):
    """Analyze the distance between elements of one class and class means."""
    symbolid2latex = _get_symbolid2latex()
    data = _load_csv(csv_filepath)
    data = data_by_class(data)
    mean_imgs = []
    for class_, data_class in data.items():
        latex = symbolid2latex[class_]
        d, mean_img = _inner_class_distance(data_class)
        # scipy.misc.imshow(mean_img)
        print("%s: min=%0.4f, avg=%0.4f, median=%0.4f max=%0.4f" %
              (latex, np.min(d), np.average(d), np.median(d), np.max(d)))
        distarr = sorted([(label, mean_c, _get_euclidean_dist(mean_c,
                                                              mean_img))
                          for label, mean_c in mean_imgs],
                         key=lambda n: n[2])
        for label, mean_c, d in distarr:
            print("\t%s: %0.4f" % (label, d))
        mean_imgs.append((latex, mean_img))

In [15]:
def _analyze_variance(csv_filepath):
    """Calculate the variance of each pixel."""
    symbol_id2index = generate_index(csv_filepath)
    data, y = load_images(csv_filepath, symbol_id2index, one_hot=False)
    # Calculate mean
    sum_ = np.zeros((32, 32))
    for el in data:
        el = np.squeeze(el)
        sum_ += el
    mean_ = sum_ / float(len(data))
    scipy.misc.imshow(mean_)

    # Calculate variance
    centered_ = np.zeros((32, 32))
    for el in data:
        el = np.squeeze(el)
        centered_ += (el - mean_)**2
    centered_ = (1. / len(data)) * centered_**0.5
    scipy.misc.imshow(centered_)
    for row in list(centered_):
        row = list(row)
        print(" ".join(["%0.1f" % nr for nr in row]))

In [16]:
def _analyze_correlation(csv_filepath):
    """
    Analyze and visualize the correlation of features.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    """
    import pandas as pd
    from matplotlib import pyplot as plt
    from matplotlib import cm as cm

    symbol_id2index = generate_index(csv_filepath)
    data, y = load_images(csv_filepath,
                          symbol_id2index,
                          one_hot=False,
                          flatten=True)
    df = pd.DataFrame(data=data)

    logging.info("Data loaded. Start correlation calculation. Takes 1.5h.")
    fig = plt.figure()
    ax1 = fig.add_subplot(111)

    # Where we want the ticks, in pixel locations
    ticks = np.linspace(0, 1024, 17)
    # What those pixel locations correspond to in data coordinates.
    # Also set the float format here
    ax1.set_xticks(ticks)
    ax1.set_yticks(ticks)
    labels = ax1.get_xticklabels()
    plt.setp(labels, rotation=30)

    cmap = cm.get_cmap('jet', 30)
    cax = ax1.imshow(df.corr(), interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    # Add colorbar, make sure to specify tick locations to match desired
    # ticklabels
    fig.colorbar(cax, ticks=[-0.15, 0, 0.15, 0.30, 0.45, 0.60, 0.75, 0.90, 1])
    filename = '{}.pdf'.format('feature-correlation')
    plt.savefig(filename)

In [17]:
def _create_stratified_split(csv_filepath, n_splits):
    """
    Create a stratified split for the classification task.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    n_splits : int
        Number of splits to make
    """
    from sklearn.cross_validation import StratifiedKFold
    data = _load_csv(csv_filepath)
    labels = [el['symbol_id'] for el in data]
    skf = StratifiedKFold(labels, n_folds=n_splits)
    i = 1
    kdirectory = 'classification-task'
    if not os.path.exists(kdirectory):
            os.makedirs(kdirectory)
    for train_index, test_index in skf:
        print("Create fold %i" % i)
        directory = "%s/fold-%i" % (kdirectory, i)
        if not os.path.exists(directory):
            os.makedirs(directory)
        else:
            print("Directory '%s' already exists. Please remove it." %
                  directory)
        i += 1
        train = [data[el] for el in train_index]
        test_ = [data[el] for el in test_index]
        for dataset, name in [(train, 'train'), (test_, 'test')]:
            with open("%s/%s.csv" % (directory, name), 'wb') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
                for el in dataset:
                    csv_writer.writerow(("../../%s" % el['path'],
                                         el['symbol_id'],
                                         el['latex'],
                                         el['user_id']))

In [18]:
def _create_pair(r1_data, r2_data):
    """Create a pair for the verification test."""
    symbol_index = random.choice(r1_data.keys())
    r1 = random.choice(r1_data[symbol_index])
    is_same = random.choice([True, False])
    if is_same:
        symbol_index2 = symbol_index
        r2 = random.choice(r1_data[symbol_index2])
    else:
        symbol_index2 = random.choice(r2_data.keys())
        while symbol_index2 == symbol_index:
            symbol_index2 = random.choice(r2_data.keys())
        r2 = random.choice(r2_data[symbol_index2])
    return (r1['path'], r2['path'], is_same)

In [19]:
def _create_verification_task(sample_size=32, test_size=0.05):
    """
    Create the datasets for the verification task.

    Parameters
    ----------
    sample_size : int
        Number of classes which will be taken completely
    test_size : float in (0, 1)
        Percentage of the remaining data to be taken to test
    """
    # Get the data
    data = _load_csv('hasy-data-labels.csv')
    for el in data:
        el['path'] = "../hasy-data/" + el['path'].split("hasy-data/")[1]
    data = sorted(data_by_class(data).items(),
                  key=lambda n: len(n[1]),
                  reverse=True)
    symbolid2latex = _get_symbolid2latex()

    # Get complete classes
    random.seed(1337)
    symbols = random.sample(range(len(data)), k=sample_size)
    symbols = sorted(symbols, reverse=True)
    test_data_excluded = []
    for symbol_index in symbols:
        # for class_label, items in data:
        class_label, items = data.pop(symbol_index)
        test_data_excluded += items
        print(symbolid2latex[class_label])

    # Get data from remaining classes
    data_n = []
    for class_label, items in data:
        data_n = data_n + items
    ys = [el['symbol_id'] for el in data_n]
    x_train, x_test, y_train, y_test = train_test_split(data_n,
                                                        ys,
                                                        test_size=test_size)

    # Write the training / test data
    print("Test data (excluded symbols) = %i" % len(test_data_excluded))
    print("Test data (included symbols) = %i" % len(x_test))
    print("Test data (total) = %i" % (len(x_test) + len(test_data_excluded)))
    kdirectory = 'verification-task'
    if not os.path.exists(kdirectory):
        os.makedirs(kdirectory)
    with open("%s/train.csv" % kdirectory, 'wb') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
        for el in x_train:
            csv_writer.writerow((el['path'],
                                 el['symbol_id'],
                                 el['latex'],
                                 el['user_id']))

    x_test_inc_class = data_by_class(x_test)
    x_text_exc_class = data_by_class(test_data_excluded)
    # V1: Both symbols belong to the training set (included symbols)
    with open("%s/test-v1.csv" % kdirectory, 'wb') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('path1', 'path2', 'is_same'))
        for i in range(100000):
            test_data_tuple = _create_pair(x_test_inc_class, x_test_inc_class)
            csv_writer.writerow(test_data_tuple)

    # V2: r1 belongs to a symbol in the training set, but r2 might not
    with open("%s/test-v2.csv" % kdirectory, 'wb') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('path1', 'path2', 'is_same'))
        for i in range(100000):
            test_data_tuple = _create_pair(x_test_inc_class, x_text_exc_class)
            csv_writer.writerow(test_data_tuple)

    # V3: r1 and r2 both don't belong to symbols in the training set
    with open("%s/test-v3.csv" % kdirectory, 'wb') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('path1', 'path2', 'is_same'))
        for i in range(100000):
            test_data_tuple = _create_pair(x_text_exc_class, x_text_exc_class)
            csv_writer.writerow(test_data_tuple)

In [20]:
def _count_users(csv_filepath):
    """
    Count the number of users who contributed to the dataset.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    """
    data = _load_csv(csv_filepath)
    user_ids = {}
    for el in data:
        if el['user_id'] not in user_ids:
            user_ids[el['user_id']] = [el['path']]
        else:
            user_ids[el['user_id']].append(el['path'])
    max_els = 0
    max_user = 0
    for user_id, elements in user_ids.items():
        if len(elements) > max_els:
            max_els = len(elements)
            max_user = user_id
    print("Dataset has %i users." % len(user_ids))
    print("User %s created most (%i elements, %0.2f%%)" %
          (max_user, max_els, float(max_els) / len(data) * 100.0))

In [21]:
def _get_parser():
    """Get parser object for hasy_tools.py."""
    import argparse
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    parser = ArgumentParser(description=__doc__,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dataset",
                        dest="dataset",
                        default='classification-task/fold-1/train.csv',
                        help="specify which data to use")
    parser.add_argument("--verify",
                        dest="verify",
                        action="store_true",
                        default=False,
                        help="verify PNG files")
    parser.add_argument("--overview",
                        dest="overview",
                        action="store_true",
                        default=False,
                        help="Get overview of data")
    parser.add_argument("--analyze_color",
                        dest="analyze_color",
                        action="store_true",
                        default=False,
                        help="Analyze the color distribution")
    parser.add_argument("--class_distribution",
                        dest="class_distribution",
                        action="store_true",
                        default=False,
                        help="Analyze the class distribution")
    parser.add_argument("--distances",
                        dest="distances",
                        action="store_true",
                        default=False,
                        help="Analyze the euclidean distance distribution")
    parser.add_argument("--pca",
                        dest="pca",
                        action="store_true",
                        default=False,
                        help=("Show how many principal components explain "
                              "90%% / 95%% / 99%% of the variance"))
    parser.add_argument("--variance",
                        dest="variance",
                        action="store_true",
                        default=False,
                        help="Analyze the variance of features")
    parser.add_argument("--correlation",
                        dest="correlation",
                        action="store_true",
                        default=False,
                        help="Analyze the correlation of features")
    parser.add_argument("--create-classification-task",
                        dest="create_folds",
                        action="store_true",
                        default=False,
                        help=argparse.SUPPRESS)
    parser.add_argument("--create-verification-task",
                        dest="create_verification_task",
                        action="store_true",
                        default=False,
                        help=argparse.SUPPRESS)
    parser.add_argument("--count-users",
                        dest="count_users",
                        action="store_true",
                        default=False,
                        help="Count how many different users have created "
                             "the dataset")
    return parser

In [22]:
if __name__ == "__main__":
    args = _get_parser().parse_args()
    if args.verify:
        _verify_all()
    if args.overview:
        img_src = _load_csv(args.dataset)
        create_random_overview(img_src, x_images=10, y_images=10)
    if args.analyze_color:
        _get_color_statistics(csv_filepath=args.dataset)
    if args.class_distribution:
        _analyze_class_distribution(csv_filepath=args.dataset,
                                    max_data=200,
                                    bin_size=5)
    if args.pca:
        _analyze_pca(csv_filepath=args.dataset)
    if args.distances:
        _analyze_distances(csv_filepath=args.dataset)
    if args.variance:
        _analyze_variance(csv_filepath=args.dataset)
    if args.correlation:
        _analyze_correlation(csv_filepath=args.dataset)
    if args.create_folds:
        _create_stratified_split(args.dataset, int(args.create_folds))
    if args.count_users:
        _count_users(csv_filepath=args.dataset)
    if args.create_verification_task:
        _create_verification_task()

usage: ipykernel_launcher.py [-h] [--dataset DATASET] [--verify] [--overview] [--analyze_color] [--class_distribution]
                             [--distances] [--pca] [--variance] [--correlation] [--count-users]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\sawer\AppData\Roaming\jupyter\runtime\kernel-b6adcc6b-0bc9-4600-910e-d1c20d3e6506.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
