In [1]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from qiskit import BasicAer
#from qiskit.aqua import QuantumInstance
#from qiskit.aqua.algorithms import QSVM
#from qiskit.aqua.components.multiclass_extensions import (ErrorCorrectingCode, AllPairs, OneAgainstRest)
from qiskit.circuit.library import ZZFeatureMap, TwoLocal
from qiskit.utils import algorithm_globals
import scipy.io
import numpy as np
import timeit

In [2]:
import numpy as np
from scipy.sparse import csr_matrix

def chisqr_featselection(F, L, m):
    """
    Select top m features using chi-squared test.
    Parameters:
        F: numpy array or sparse matrix, shape=(n_samples, n_features)
            Feature matrix.
        L: numpy array, shape=(n_samples,)
            Labels for each example in F.
        m: int
            Number of features to select.
    Returns:
        topFeatures: numpy array, shape=(n_features,)
            Binary vector indicating which features are selected (1) or not (0).
    """
    # convert F to a sparse matrix
    F = csr_matrix(F)

    # total number of examples
    n = len(L)

    # number of examples with the category
    p = np.sum(L > 0)

    # number of examples without the category
    q = n - p

    # compute the chi-square score for each feature
    chi2 = np.zeros(F.shape[1])
    for i in range(F.shape[1]):
        # number of examples with the feature and the category
        a = np.sum(F[L > 0][:, i])
        
        # number of examples with the feature but not the category
        b = np.sum(F[L == 0][:, i])
        
        # number of examples without the feature but with the category
        c = p - a
        
        # number of examples without the feature and without the category
        d = q - b
        
        # chi-square score
        #chi2[i] = (n * (a*d-b*c) * (a*d + b*c - (n * (a + b) * (c + d)))**2) / ((a + b) * (c + d) * (a + c) * (b + d))
        chi2[i] = (n * (a*d - b*c)) / ((a + b) * (a + c) * (b + d) * (c + d))

     #get the indices of the top m features
    topFeatures = np.zeros(F.shape[1])
    topFeatures[np.argsort(chi2)[::-1][:m]] = 1

    return topFeatures


In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
newsgroups = fetch_20newsgroups(subset='all')
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Convert text to features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
trainingSet = vectorizer.fit_transform(newsgroups_train.data)
testSet = vectorizer.transform(newsgroups_test.data)
fea = vectorizer.fit_transform(newsgroups.data)
gnd = np.array(newsgroups.target)
# Convert topic labels to integers
trainTopic = newsgroups_train.target
testTopic = newsgroups_test.target


em_nb_list = []
em_svm_list = []
em_cknn_list = []
em_dt_list = []
em_list = []

all_topics = np.zeros((20,))
for i in range(20):
    all_topics[i] = np.sum(trainTopic == i)
topics = np.where(all_topics > 0)[0]

times_qdetect = []
times_dt = []
times_nb = []
times_knn = []
times_svm = []

for topic in topics:
    trainTopic_topic = (trainTopic == topic)
    # Chi-square feature selection method
    top_features = chi2(trainingSet, trainTopic_topic)[1]
    top_features_idx = np.argsort(top_features)[-100:]

    top_features = top_features[top_features_idx]

    featsIdx = np.nonzero(top_features)[0]




In [10]:
trainTopic.shape ,testTopic.shape , trainingSet.shape , testSet.shape ,fea.shape ,gnd.shape ,featsIdx.shape ,topic.shape ,topics.shape

((11314,),
 (7532,),
 (11314, 5000),
 (7532, 5000),
 (18846, 5000),
 (18846,),
 (100,),
 (),
 (20,))

In [5]:
def eval_measures(testLabel, yfit):
    idx = (testLabel==1)
    p = len(testLabel[idx])
    n = len(testLabel[~idx])
    N = p+n

    tp = sum(testLabel[idx]==yfit[idx])
    #tp = sum(testLabel[idx]==yfit[idx][0])
    

    tn = sum(testLabel[~idx]==yfit[~idx])
    fp = n-tn
    fn = p-tp

    accuracy = (tp+tn)/N
    precision = tp/(tp+fp)
    recall = tp / (tp + fn + 1e-10)

    #recall = tp/(tp+fn)
    f_measure = 2*((precision*recall)/(precision + recall))
    print (accuracy, precision, recall, f_measure)
    evaluation = [accuracy, precision, recall, f_measure]
    return evaluation


In [6]:
import numpy as np
import scipy.sparse.linalg

def qdetect(trF, trL, teF, type):
    # trF: training feature matrix
    # trL: training labels
    # teF: test feature matrix
    # type: type of representation
    
    trSetDim = trF.shape
    
    X = trF
    
    N = trSetDim[0]    # number of documents in the training set
    k = trSetDim[1]    # number of features
    r = trL       # labels
    
    # compute probability p's (in category)
    R=sum(r)       # total number of documents in category
    p=np.zeros(k)
    for j in range(k):
        p[j]=0
        #p[j]=(sum(np.sign(X[0:N,j])*(r[0:N]))+0.5)/ (R+1)
        p[j] = (np.all(np.sign(X[0:N, j].toarray().flatten()) * r[0:N]) + 0.5) / (R + 1)
    if(sum(p)>0):
        p=np.sqrt(p/sum(p))
    
    # compute probability q's (not in category)
    q=np.zeros(k)
    for j in range(k):
        q[j]=0
        #q[j]=(sum(np.sign(X[0:N,j])*(1-r[0:N]))+0.5)/(sum(1-r[0:N])+1)
        q[j] = (np.all(np.sign(X[0:N, j].toarray().flatten()) * (1 - r[0:N])) + 0.5) / (np.sum(1 - r[0:N]) + 1)
    if(sum(q)>0):
        q=np.sqrt(q/sum(q))
    print(p.shape)
    print(q.shape)
    print(trF.shape)
    print(trL.shape)
    print(teF.shape)
    if k > 2 and np.isreal(q).all() and np.isreal(p).all() and np.isnan(p).sum() == 0 and np.isnan(q).sum() == 0:
        if type == 1:
         # eigendecomposition step where it allows obtaining diagonal matrix containing eigenvalues on the main diagonal, and another matrix whose columns are the corresponding eigenvectors      
            #[UB, sB] = scipy.sparse.linalg.eigs(np.matmul(p.T, p) - np.matmul(q.T, q), k - 2)
            [UB, sB] = scipy.sparse.linalg.eigs(p.T @ p - q.T @ q, k - 2)
            
            if sB[0] > 0:
                eigB1 = UB[:, 0]
                eigB0 = UB[:, 0:k-2]
            else:
                eigB1 = UB[:, 0:k-2]
                eigB0 = UB[:, 0]
    print(eigB1.shape)           
         # Prediction step
    predictions = np.zeros((teF.shape[0], 1))
    s1 = np.zeros((teF.shape[0], 1))
    s0 = np.zeros((teF.shape[0], 1))
    

    for j in range(teF.shape[0]):
        #s1[j] = np.trace(np.matmul(np.matmul(teF[j,:], np.matmul(eigB1, eigB1.T)), teF[j,:].T))
        s1[j, 0] = np.trace(np.matmul(np.matmul(teF[j, :], eigB1), np.matmul(eigB1.T, teF[j, :].T)))
         
        if s1[j, 0] > 0.5:
            predictions[j, 0] = 1
            
            P = predictions
    else:
        P = np.full((teF.shape[0], 1), np.nan)
    return P



In [7]:
from sklearn.model_selection import StratifiedKFold
from time import time

In [8]:
for i, topic in enumerate(topics):
    
    
    # Chi-square feature selection method
    
    topFeatures = chisqr_featselection(trainingSet, trainTopic, 100)
    featsIdx = np.where(topFeatures > 0)[0]

    # Naive Bayes with feature selection
    nb = MultinomialNB()
    #nb.fit(trainingSet[:, featsIdx], trainTopic == topic)
    nb.fit(trainingSet[:, featsIdx], (trainTopic == topic).ravel())

    label = nb.predict(testSet[:, featsIdx])
    em_ns = eval_measures(testTopic == topic, label)
    em_nb_list.append(em_ns)
    import timeit
    #times_nb[i] = timeit.timeit(lambda: nb.predict(testSet[:, featsIdx]), number=1)
    times_nb.append(timeit.timeit(lambda: nb.predict(testSet[:, featsIdx]), number=1))

    # SVM with feature selection
    svm = SVC(kernel='linear')
    #svm.fit(trainingSet, trainTopic )
    svm.fit(trainingSet[:, featsIdx], trainTopic)
    label = svm.predict(testSet[:, featsIdx])
    em_ss = eval_measures(testTopic == topic, label)
    em_svm_list.append(em_ss)
    times_svm.append(timeit.timeit(lambda: svm.predict(testSet[:, featsIdx]), number=1))
    
    
   

    # KNN with feature selection
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(trainingSet[:, featsIdx], trainTopic == topic)
    label = knn.predict(testSet[:, featsIdx])
    em_ks = eval_measures(testTopic == topic, label)
    em_cknn_list.append(em_ks)
    times_knn.append(timeit.timeit(lambda: knn.predict(testSet[:, featsIdx]), number=1))
    

   
   
   
    

    
    #Decision Tree with feature selection
    dt = DecisionTreeClassifier()
    dt.fit(trainingSet[:, featsIdx], trainTopic == topic)
    label = dt.predict(testSet[:, featsIdx])
    em_ds = eval_measures(testTopic == topic, label)
    em_dt_list.append(em_ds)
    times_dt.append(timeit.timeit(lambda: dt.predict(testSet[:, featsIdx]), number=1))
    
    
    
    
    
    
  
    
    #Binary Selection
    
    C = StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(trainingSet[:, featsIdx], trainTopic==i)
    trS = trainingSet[:, featsIdx]
    teS = testSet[:, featsIdx]
    if len(featsIdx) > teS.shape[1]:
        featsIdx = featsIdx[:teS.shape[1]]
        teS = teS[:, featsIdx]
    #teS = teS[:, featsIdx]
    #teS = teS[:, featsIdx] if teS.shape[1] > len(featsIdx) else teS[:, featsIdx[:teS.shape[1]]]

    

    # Classification with Quantum Detection
    #p = qdetect(trS, trainTopic, teS, 1)
    p = qdetect(trS, (trainTopic==topic), teS, 1)
    
    #np.where(trainTopic==topic, 1, -1).any()

    #p = qdetect(trS, np.where(trainTopic==topic, 1, -1).astype(bool), teS, 1)

    em = eval_measures(testTopic==topic, p)
    em_list.append(em)
    times_qdetect.append(timeit.timeit())
    
    
   # print(sum(times_qdetect))
   
    print(sum(times_nb))
    print(sum(times_svm))
    print(sum(times_knn))
    print(sum(times_dt))
    print(sum(times_qdetect))
    
    
    
   

  precision = tp/(tp+fp)


0.9576473712161444 nan 0.0 nan
0.0001327668613913967 0.0001386193512614361 0.0031347962382435317 0.00026549847338377453


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  precision = tp/(tp+fp)


0.9576473712161444 nan 0.0 nan


  precision = tp/(tp+fp)


0.9576473712161444 nan 0.0 nan
(100,)
(100,)
(11314, 100)
(11314,)
(7532, 100)


IndexError: tuple index out of range