# Reproduce Kevin's Fashion-MNIST dataset classification

In [1]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import os
from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator 
import re
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import gzip
import struct


# Root directory that holds the Fashion-MNIST label file and the QASM circuit folders.
# Defaults to the originally hard-coded scratch location; set the QML_DATA_PATH
# environment variable to run the notebook against a copy of the data stored elsewhere.
# Alternative source for the raw dataset (not used here):
#fashion_mnist = fetch_openml(name="Fashion-MNIST")
DATA_PATH = os.environ.get("QML_DATA_PATH", "/pscratch/sd/l/luckow/data/qml/")

In [2]:
# # Extract labels
# mnist_labels = fashion_mnist.target.astype(int)

# # Class names for Fashion MNIST
# class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# # Print the first few labels and their corresponding class names
# for i in range(5):
#     print(f"IDX: {i},  Label: {mnist_labels[i]} - Class: {class_names[mnist_labels[i]]}")


Print the first few labels with their class names

In [3]:

# Gzipped IDX file with the Fashion-MNIST training labels, located under DATA_PATH
labels_path = os.path.join(DATA_PATH, "train-labels-idx1-ubyte.gz")

# Human-readable class names; the position in the list is the numeric label
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]
def read_labels(filepath):
    """Read the labels stored in a gzip-compressed IDX1 (MNIST-style) label file.

    The file starts with two big-endian unsigned 32-bit integers (magic number
    and label count) followed by one unsigned byte per label.

    Args:
        filepath: Path to the gzip-compressed label file.

    Returns:
        tuple of int: The labels in file order.

    Raises:
        ValueError: If the magic number is not 2049 (the file is not an IDX1
            unsigned-byte label file) or the file holds fewer labels than its
            header announces.
        struct.error: If the file is too short to contain the 8-byte header.
    """
    # 0x00000801: unsigned-byte data, one dimension
    expected_magic = 2049

    with gzip.open(filepath, 'rb') as file:
        # Header: magic number and number of labels
        magic, num_labels = struct.unpack(">II", file.read(8))
        if magic != expected_magic:
            # Without this check a wrong file (e.g. an image IDX file) would be
            # silently decoded into meaningless labels.
            raise ValueError(
                f"{filepath}: unexpected magic number {magic}, expected {expected_magic}"
            )
        payload = file.read(num_labels)

    if len(payload) != num_labels:
        raise ValueError(
            f"{filepath}: header announces {num_labels} labels but only {len(payload)} are present"
        )

    # Iterating over bytes yields ints, i.e. the same values as unpacking 'B' fields
    return tuple(payload)

# Load all labels from the label file once; they are indexed by sample number
mnist_labels = read_labels(labels_path)

# Sanity check: show the first five labels together with their class names
for i in range(5):
    label = mnist_labels[i]
    print(f"Index: {i}, Label: {label} - Class: {class_names[label]}")


Index: 0, Label: 9 - Class: Ankle boot
Index: 1, Label: 0 - Class: T-shirt/top
Index: 2, Label: 0 - Class: T-shirt/top
Index: 3, Label: 3 - Class: Dress
Index: 4, Label: 0 - Class: T-shirt/top


In [11]:
def _sample_index(filename):
    """Return the integer encoded as ``_<digits>.qasm`` in ``filename``, or None if absent."""
    match = re.search(r'_(\d+)\.qasm', filename)
    return int(match.group(1)) if match else None


def _counts_to_vector(counts, vector_length):
    """Turn a ``{bitstring: count}`` mapping into a dense vector indexed by the bitstring's integer value."""
    feature_vector = np.zeros(vector_length)
    for state, count in counts.items():
        feature_vector[int(state, 2)] = count
    return feature_vector


def create_train_data(qasm_files_path, labels=None, shots=8192*2, num_qubits=11):
    """Simulate every ``.qasm`` circuit in a directory and build features and labels.

    Each circuit is loaded, measured on all qubits and run on ``AerSimulator``.
    The measurement counts become one fixed-length feature vector, and the label
    is looked up with the sample index encoded in the file name (``..._<index>.qasm``).

    Files whose name carries no index are reported and skipped *before* any
    feature vector is produced, so the returned data and labels always have the
    same length and stay aligned.

    Args:
        qasm_files_path: Directory containing the ``.qasm`` files.
        labels: Indexable sequence of labels addressed by the index from the
            file name. Defaults to the module-level ``mnist_labels``; pass a
            different sequence when the file indices refer to another label set.
        shots: Number of shots per circuit (default ``8192*2``).
        num_qubits: Feature vectors have length ``2**num_qubits`` (default 11).

    Returns:
        tuple: ``(train_data, train_labels)``, a list of 1-D NumPy arrays with
        the counts per basis state and a list with the matching labels.

    Raises:
        IndexError: If a measured bitstring does not fit into ``2**num_qubits``
            entries, or a file index is outside ``labels``.
    """
    if labels is None:
        labels = mnist_labels

    # Sorted so the sample order is reproducible; os.listdir order is arbitrary
    qasm_files = sorted(f for f in os.listdir(qasm_files_path) if f.endswith('.qasm'))

    train_data = []
    train_labels = []

    # Use the Aer simulator backend
    backend = AerSimulator()
    vector_length = 2 ** num_qubits

    for qasm_file in tqdm(qasm_files):
        # Resolve the label first: a file without a usable index must not
        # contribute a feature vector, otherwise data and labels get out of step.
        number = _sample_index(qasm_file)
        if number is None:
            print("No match found")
            continue
        label = labels[number]

        # Read the QASM file and build the circuit, measuring every qubit
        with open(os.path.join(qasm_files_path, qasm_file), 'r') as file:
            qasm_code = file.read()
        circuit = QuantumCircuit.from_qasm_str(qasm_code)
        circuit.measure_all()

        # Execute the circuit and collect the measurement counts
        counts = backend.run(circuit, shots=shots).result().get_counts()

        train_data.append(_counts_to_vector(counts, vector_length))
        train_labels.append(label)

    return train_data, train_labels

    
# Directories with the .qasm circuit files for the two splits
QASM_TRAIN_CIRCUIT_PATH = os.path.join(DATA_PATH, "train_general_3")
QASM_TEST_CIRCUIT_PATH = os.path.join(DATA_PATH, "test_general_3")

# Simulate both directories (this is the long-running step of the notebook)
train_data, train_labels = create_train_data(QASM_TRAIN_CIRCUIT_PATH)
# NOTE(review): the labels for the test circuits are looked up in mnist_labels, which
# was read from train-labels-idx1-ubyte.gz; confirm that the indices in the test
# file names really refer to that label file.
test_data, test_labels = create_train_data(QASM_TEST_CIRCUIT_PATH)

# Merge both splits into one pool of samples and labels
all_data = [*train_data, *test_data]
all_labels = [*train_labels, *test_labels]


  0%|          | 3/60000 [00:00<1:37:26, 10.26it/s]

100%|██████████| 60000/60000 [35:48<00:00, 27.92it/s]  
100%|██████████| 10000/10000 [06:04<00:00, 27.41it/s]


In [14]:
# Persist the simulated features and their labels as .npy files in the working directory
np.save(file='all_data.npy', arr=all_data)
np.save(file='all_labels.npy', arr=all_labels)

# Train different Classifiers
## Load the data

In [4]:
import numpy as np

# Load the cached feature matrix and label vector from disk
all_data = np.load('all_data.npy')
all_labels = np.load('all_labels.npy')

# Print shapes to verify: expect (n_samples, n_features) and (n_samples,)
print(all_data.shape)
print(all_labels.shape)

# np.load already returns ndarrays; np.asarray reuses them, whereas np.array
# would allocate a second full copy of the feature matrix.
data = np.asarray(all_data)
labels = np.asarray(all_labels)

# Hold out 10% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=42)

(70000, 2048)
(70000,)


## Random Forest Classifier

In [7]:
# Fit a seeded 100-tree random forest on the training split
classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = classifier.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_true=y_test, y_pred=y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

              precision    recall  f1-score   support

           0       0.23      0.19      0.21       706
           1       0.26      0.33      0.29       737
           2       0.18      0.16      0.17       680
           3       0.22      0.21      0.21       701
           4       0.20      0.21      0.21       739
           5       0.21      0.20      0.20       674
           6       0.18      0.12      0.14       680
           7       0.37      0.52      0.43       685
           8       0.34      0.26      0.29       691
           9       0.32      0.38      0.35       707

    accuracy                           0.26      7000
   macro avg       0.25      0.26      0.25      7000
weighted avg       0.25      0.26      0.25      7000

Accuracy: 0.25842857142857145


## SVM

In [15]:
# Define the polynomial kernel SVM (degree 2) used as the base binary classifier
svm = SVC(kernel='poly', degree=2, gamma=1, coef0=0)

# Use one-vs-rest strategy
ovr_svm = OneVsRestClassifier(svm)

# Regularization constants to try. The 'estimator__' prefix addresses the C
# parameter of the SVC wrapped inside OneVsRestClassifier. This definition was
# commented out before, so the GridSearchCV call below failed with a NameError.
param_grid = {'estimator__C': [0.1, 1, 10, 100]}

# Perform grid search with 5-fold cross-validation, selecting by accuracy
grid_search = GridSearchCV(ovr_svm, param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Best regularization constant: {grid_search.best_params_}')
print(f'Accuracy on test data: {accuracy}')


Single Classifier without Grid Search

In [5]:
# Base binary learner: degree-2 polynomial-kernel SVM with fixed hyperparameters
svm = SVC(coef0=0, gamma=1, degree=2, kernel='poly')
# Commented-out alternative kept for reference:
# classifier = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

# Wrap the SVM in a one-vs-rest multiclass strategy
ovr_svm = OneVsRestClassifier(estimator=svm)

# Train on the training split (left as the last expression so the fitted model is displayed)
ovr_svm.fit(X_train, y_train)





Evaluate Classifier

In [None]:
# Predict on the test set with the one-vs-rest SVM (`ovr_svm`) trained in this section.
# The name `classifier` is not assigned anywhere in the SVM section, so using it here
# would evaluate whichever model another cell last stored under that name.
y_pred = ovr_svm.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

## Neural Network

In [10]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with four hidden layers (256, 128, 64, 32), Adam solver,
# at most 300 iterations and a fixed seed, fitted directly on the training split
classifier = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64, 32),
    solver='adam',
    max_iter=300,
    random_state=42,
).fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = classifier.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_true=y_test, y_pred=y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')


              precision    recall  f1-score   support

           0       0.20      0.16      0.18       706
           1       0.26      0.31      0.28       737
           2       0.17      0.19      0.18       680
           3       0.20      0.20      0.20       701
           4       0.18      0.14      0.16       739
           5       0.20      0.20      0.20       674
           6       0.18      0.20      0.19       680
           7       0.37      0.42      0.40       685
           8       0.31      0.31      0.31       691
           9       0.35      0.32      0.33       707

    accuracy                           0.24      7000
   macro avg       0.24      0.24      0.24      7000
weighted avg       0.24      0.24      0.24      7000

Accuracy: 0.24442857142857144
