<a href="https://colab.research.google.com/github/stanlee321/unitary-matrix-text-classification/blob/master/QuantumFunctions_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unitary Text Classification

## 2. Implementation

In [None]:
# Some imports

from scipy.linalg import fractional_matrix_power as MatrixPow
from scipy.stats import unitary_group
import numpy as np
from scipy import linalg
import itertools


In [None]:
## Set up custom functions

In [None]:
########################################
def calculate_epsilon_simple(count_dict,
                            word,
                            unk_e: float = 0.000001,
                            lf_e: float = 0.0001,
                            hf_e: float = 0.001,
                            lf_range: int = 0,
                            hf_range: int = 1250):
    """
    Pick the epsilon value and frequency class for a word.

    Args:
        count_dict: mapping from word to its occurrence count.
        word: the token to look up.
        unk_e: epsilon returned when ``word`` is missing from ``count_dict``.
        lf_e: epsilon returned when the count lies in ``[lf_range, hf_range]``.
        hf_e: epsilon returned for every other known word.
        lf_range: inclusive lower bound of the low-frequency count range.
        hf_range: inclusive upper bound of the low-frequency count range.

    Returns:
        A tuple ``(epsilon, kind)`` where ``kind`` is ``"unk"``, ``"lf"`` or ``"hf"``.

    Note:
        A mapping that never raises ``KeyError`` (e.g. ``collections.Counter``,
        which yields 0 for missing keys) can never produce ``"unk"``.
    """
    try:
        count = count_dict[word]
    except KeyError:
        # Only a genuinely missing word is "unknown". Any other failure
        # (wrong container type, unhashable key, interrupt) must surface
        # instead of being silently classified as "unk".
        return unk_e, "unk"

    if lf_range <= count <= hf_range:
        return lf_e, "lf"
    # Every count outside [lf_range, hf_range] -- including counts below
    # lf_range -- is treated as high frequency.
    return hf_e, "hf"


In [None]:

def create_unique_words_with_epsilon(
    vocav_frec,     # word -> count mapping
    unique_words,   # iterable of words to classify
    lf_e: float=0.0001,
    hf_e: float=0.001,
    lf_range: int=0,
    hf_range: int=1250):
    """
    Attach an epsilon value and a frequency class to every word.

    Each word is passed to ``calculate_epsilon_simple`` together with the
    given thresholds; the unknown-word epsilon keeps that function's default.
    Progress is reported through ``tqdm``.

    Returns:
        A list of ``(word, epsilon, kind)`` tuples in the order of ``unique_words``.
    """

    def _classify(token):
        eps, kind = calculate_epsilon_simple(
            vocav_frec,
            token,
            lf_e=lf_e,
            hf_e=hf_e,
            lf_range=lf_range,
            hf_range=hf_range,
        )
        return (token, eps, kind)

    return [_classify(token) for token in tqdm(unique_words)]

In [None]:

def RandomUnitaryBaseElement(n,epsilon):
    """
    Sample a random n x n unitary matrix and raise it to the power ``epsilon``.

    Input:
        n: matrix size
        epsilon: exponent of the fractional matrix power; an exponent of
                 zero yields the identity matrix
    """
    sampled_unitary = unitary_group.rvs(n)
    return MatrixPow(sampled_unitary, epsilon)

def QuasiDiagonalBasisElement(n,epsilon, dtheta,i):
    """
    Diagonal phase matrix times a ``RandomUnitaryBaseElement(n, epsilon)``.

    The diagonal holds ones everywhere except for a single entry
    ``exp(1j*dtheta)`` which ends up at index ``i % n``.
    """
    phase_entries = np.append(np.ones(n-1), np.exp(1j*dtheta))
    # The phase starts in the last slot; rolling by 1 moves it to slot 0,
    # rolling by i then moves it to slot i (modulo n).
    phase_entries = np.roll(phase_entries, 1)
    phase_entries = np.roll(phase_entries, i)
    phase_matrix = np.diag(phase_entries)
    return phase_matrix.dot(RandomUnitaryBaseElement(n,epsilon))

In [None]:

def flatten_list(L):
    """Concatenate the items of every sub-iterable of ``L`` into one flat list (one level deep)."""
    return [item for group in L for item in group]

def Grid_dU_Diagonal(n,epsilon):
    """
    Build n diagonal n x n phase matrices, one per diagonal position.

    Matrix number k is the identity except for entry (k, k), which is
    ``exp(1j*epsilon*epsilon)``: the seed axis already carries ``epsilon``
    and is multiplied by ``epsilon`` once more inside the exponential.
    """
    seed_axis = np.array([epsilon]+[0]*(n-1))

    grid = []
    for shift in range(n):
        rolled = np.roll(seed_axis, shift)
        grid.append(np.diag(np.exp(1j*epsilon*rolled)))
    return grid


def CanonicalCosetBase(X_01):
    """
    Build the (m+1) x (m+1) canonical coset matrix of the length-m row vector X_01.

    Block layout (X is X_01 as a 1 x m row, X^H its conjugate transpose):

        [[ sqrt(1 - X X^H),   X                ],
         [ -X^H,              sqrtm(I - X^H X) ]]
    """
    m = len(X_01)
    row = np.array([X_01])      # shape (1, m)
    col = row.conj().T          # conjugate transpose, shape (m, 1)

    corner = np.sqrt( 1-np.real(row@col) )
    lower_right = linalg.sqrtm( np.eye(m) - col@row )

    top = [corner, row]
    bottom = [-col, lower_right]
    return np.block( [top, bottom] )

def CanonicalCoset(X,n):
    """
    Embed the canonical coset matrix of the row vector X into an n x n matrix.

    When ``len(X) == n-1`` the coset matrix is already n x n and is returned
    directly. Otherwise it is placed in the lower-right corner, an identity
    block of size ``n-len(X)-1`` fills the upper-left corner, and the two
    off-diagonal blocks are zero.
    """
    m = len(X)

    if m==n-1:
        return CanonicalCosetBase(X)

    pad = n-m-1
    top_left = np.eye(pad)
    # Integer zero block of shape (pad, m+1); its transpose fills the lower-left.
    off_diagonal = np.zeros((pad, m+1), dtype=int)

    return np.block([ [top_left, off_diagonal],
                      [off_diagonal.T, CanonicalCosetBase(X) ] ])


def Grid_dU_CanonicalCoset( n, m ,epsilon ):
    """
    Canonical coset grid elements of vector length m, embedded in n x n matrices.

    For each of the m axis directions two matrices are produced, in this
    order: one from the real displacement ``epsilon*X`` and one from the
    imaginary displacement ``1j*epsilon*X``, where X is the axis vector that
    holds ``epsilon`` in a single slot. The result is a flat list of 2*m
    matrices.
    """
    seed_axis = np.array([epsilon]+[0]*(m-1))

    grid = []
    for shift in range(m):
        direction = np.roll(seed_axis, shift)
        grid.append(CanonicalCoset( epsilon*direction , n ))
        grid.append(CanonicalCoset( 1j*epsilon*direction , n ))
    return grid


def Grid_dU(n, epsilon ):
    """
    Collect all grid matrices for n x n unitaries.

    The canonical coset elements come first, for vector lengths n-1 down
    to 1, followed by the n diagonal elements from ``Grid_dU_Diagonal``.
    """
    elements = []
    for m in range( n-1, 0, -1 ):
        elements.extend(Grid_dU_CanonicalCoset( n, m ,epsilon ))

    return elements + Grid_dU_Diagonal(n,epsilon)

In [None]:
def Text_DiagonalBaseDict(n, w_e_pairs_list, dtheta, epsilon_grid):
    """
    Map every word of ``w_e_pairs_list`` to an n x n matrix.

    Words of kind ``"lf"`` take matrices from the ``Grid_dU(n, epsilon_grid)``
    pool, consumed from its right end, for as long as the pool lasts. Every
    other word (and every ``"lf"`` word once the pool is empty) gets a
    ``QuasiDiagonalBasisElement`` built from the word's own epsilon and its
    position in the list.

    Returns:
        dict mapping word -> matrix.
    """

    # Pool of grid elements still available for low-frequency words
    grid_pool = collections.deque( Grid_dU(n, epsilon_grid) )

    basis = {}

    for i, (w, epsilon, kind) in enumerate(tqdm(w_e_pairs_list)):
        if kind == "lf" and len(grid_pool) > 0:
            basis[w] = grid_pool.pop()
        else:
            basis[w] = QuasiDiagonalBasisElement(n,epsilon, dtheta,i)

    return basis

In [None]:
########################
#########################

In [None]:
def HouseholderLeftDecomposition(W):
    """
    Reduce W by successive reflections applied from the left.

    For each column k (all but the last) a reflection Q_k is built from the
    current column and its diagonal entry's phase, then applied to the
    working matrix.

    Returns:
        list ``[Q_0, ..., Q_{n-2}, R]`` where ``R = Q_{n-2} ... Q_0 W`` is
        the matrix left after all reflections.
    """
    reduced = W.copy()
    size = reduced.shape[0]
    identity = np.identity(size)

    reflections = []

    for k in range(size-1):
        column = reduced[:,k]
        pivot_phase = np.angle(column[k])
        # Row vector (1, size): the column shifted along axis k by the pivot's unit phase
        v = np.array([column + np.exp(1j*pivot_phase)*identity[k] ])
        outer = v.conj().T.dot(v)
        reflection = (identity - 2*outer/np.real(np.trace(outer))).T
        reduced = reflection.dot(reduced)
        reflections.append(reflection)

    reflections.append(reduced)

    return reflections

def CanonicalCosetLeftDecomposition(W):
    """
    Householder factors of W, each right-multiplied by a sign matrix.

    Factor i (for i < n-1) is multiplied by the identity with -1 at (i, i);
    the last factor (i == n-1) is multiplied by minus the identity with +1
    at (i, i).
    """
    n = W.shape[0]

    def _signature(index):
        if index < n-1:
            signs = np.identity(n)
            signs[index,index] = -1
        elif index == n-1:
            signs = -np.identity(n)
            signs[index,index] = 1
        else:
            signs = None
        return signs

    householder_factors = HouseholderLeftDecomposition(W)

    return [ factor@_signature(k) for k, factor in enumerate(householder_factors) ]



# The trick is to extract the independent parameters from each Householder matrix

def CanonicalCosetPosition(C,i):
    """
    Extract the parameters of factor number i from the matrix C.

    For i below the last row index: the part of row i right of the diagonal.
    For i equal to the last row index: the diagonal of C.
    For any larger i: None.
    """
    last = C.shape[0] - 1
    if i == last:
        return np.diag(C)
    if i < last:
        return C[i,i+1:]
    return None


def CanonicalCosetLeftDecompositionParametersComplex(W):
    """
    Return the parameters of every canonical coset factor of W.

    The result is a list with one entry per factor, each obtained with
    ``CanonicalCosetPosition(factor, index)``.
    """
    positions = []
    for k, factor in enumerate(CanonicalCosetLeftDecomposition(W)):
        positions.append(CanonicalCosetPosition(factor, k))
    return positions

def CanonicalCosetLeftDecompositionParametersReal(W):
    """
    Flatten the canonical coset decomposition of W into one real vector.

    Layout of the result:
        1. for each of the first n-1 factors, the entries of row k right of
           the diagonal, each written as the pair (real part, imaginary part);
        2. the phase angles of the diagonal of the last factor.
    """
    n = W.shape[0]

    factors = CanonicalCosetLeftDecomposition(W)

    # Only the first n-1 factors contribute off-diagonal positions.
    upper_rows = [ factor[k,k+1:] for k, factor in enumerate(factors) if k<n-1 ]
    complex_positions = np.concatenate(upper_rows)

    # Interleave as re_0, im_0, re_1, im_1, ...
    real_pairs = [ (np.real(z),np.imag(z)) for z in complex_positions ]
    interleaved = np.array(real_pairs).flatten()

    diagonal_phases = np.real(np.angle( np.diag(factors[n-1]) ))

    return np.concatenate( [interleaved, diagonal_phases] )


## Build the composite matrix for a token sequence:
#  the per-token matrices are multiplied in sequence
#  to obtain a single composite matrix

def create_composite_matrix(text_to_matrix_dict:dict , input_tokens:list):
    """
    Multiply the matrices of a token sequence, left to right.

    For tokens ``[t0, t1, ..., tk]`` the result is
    ``M[t0] . M[t1] . ... . M[tk]``, where ``M`` is ``text_to_matrix_dict``.
    Each token contributes exactly one factor.

    Args:
        text_to_matrix_dict: mapping token -> matrix (objects with ``.dot``).
        input_tokens: non-empty sequence of tokens.

    Returns:
        The product matrix. For a single token this is the stored matrix
        object itself, not a copy.

    Raises:
        IndexError: if ``input_tokens`` is empty.
        KeyError: if a token has no entry in ``text_to_matrix_dict``.
    """
    if len(input_tokens) == 0:
        raise IndexError("input_tokens must contain at least one token")

    M = text_to_matrix_dict[ input_tokens[0] ]

    # Start at the second token: the first one already seeded the product,
    # so looping over the full list would multiply it in twice.
    for a in input_tokens[1:]:
        M = M.dot(text_to_matrix_dict[a])

    return M

def create_embeding_matrix(words_to_unitary_dict:dict, text_inputs:list ):
    """
    Turn a list of tokens into its real-valued embedding vector.
        e.g. : text_inputs = ["this", "is", "one", "example"]

    The tokens are first combined into one composite matrix, which is then
    flattened with ``CanonicalCosetLeftDecompositionParametersReal``.
    """
    composite = create_composite_matrix(words_to_unitary_dict, input_tokens = text_inputs)
    return CanonicalCosetLeftDecompositionParametersReal(composite)

## Text Classification Setup

In [None]:
### Setup text 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
#from torchtext.datasets import AG_NEWS
# import datasets
from torchtext.datasets import IMDB

from torch.autograd import Function
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import transforms, datasets

torch.manual_seed(1)

<torch._C.Generator at 0x7fe3785c1290>

In [None]:
import collections
from collections import Counter

import nltk
from nltk.corpus import wordnet as wn

from tqdm import tqdm
import pandas as pd
import time

## WordCloud
from wordcloud import WordCloud

import matplotlib.pyplot as plt

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torchtext.vocab import Vocab

# Tokeniser obtained from torchtext's `get_tokenizer` with the 'basic_english' option.
tokenizer = get_tokenizer('basic_english')

#train_iter, val_iter = AG_NEWS()

# Load the two IMDB splits.
# NOTE(review): the second split is named `val_iter` here but `test_iter` in the
# dataset-setup cell further down -- presumably it is the IMDB test split; confirm.
train_iter, val_iter = IMDB()

#next(train_iter)

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 30.0MB/s]


In [None]:

# Accumulators filled while scanning the two IMDB splits.
labels = []          # every label seen: first split, then second split
counter = Counter()  # token -> number of occurrences over both splits
sizes = []           # len() of each raw text, recorded for the first split only

for (label, line) in train_iter:
    counter.update(tokenizer(line))
    labels.append(label)
    sizes.append(len(line))
    
# Second split: token counts and labels are updated, `sizes` is not.
# NOTE: `label` and `line` stay bound at module scope after these loops finish.
for (label, line) in val_iter:
    counter.update(tokenizer(line))
    labels.append(label)



In [None]:
# Load the dataset again, because the iterators were already used by the counting loops above
train_iter, val_iter = IMDB()

def yield_tokens(data_iter):
    """Yield the token list of each text in ``data_iter``; the label is ignored."""
    for _, text in data_iter:
        yield tokenizer(text)

# Build the vocabulary from both splits, with "<unk>" registered as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter + val_iter), specials=["<unk>"])

# Use the index of "<unk>" as the vocabulary's default index.
vocab.set_default_index(vocab["<unk>"])


In [None]:
VOCAB_SIZE = len(vocab)  # number of vocabulary entries
print(VOCAB_SIZE)
UNIQUE_WORDS = vocab.get_itos()  # vocabulary entries as returned by `get_itos()`
# Plain dict copy of the token counts: unlike a Counter, a missing token raises KeyError.
FREC_DICT =  dict(counter)

147157


In [None]:

# Assign an epsilon and a frequency class to every vocabulary entry.
# Counts in [0, 825] are classed "lf" (epsilon 0.0001), larger counts "hf" (epsilon 0.001).
w_e_pairs_simple = create_unique_words_with_epsilon(
        FREC_DICT,
        UNIQUE_WORDS,
        lf_e=0.0001,
        hf_e=0.001,
        lf_range=0,
        hf_range=825
    )

# Peek at the first ten (word, epsilon, kind) triples.
print(w_e_pairs_simple[:10])

100%|██████████| 147157/147157 [00:00<00:00, 587251.41it/s]

[('<unk>', 1e-06, 'unk'), ('the', 0.001, 'hf'), ('.', 0.001, 'hf'), (',', 0.001, 'hf'), ('and', 0.001, 'hf'), ('a', 0.001, 'hf'), ('of', 0.001, 'hf'), ('to', 0.001, 'hf'), ("'", 0.001, 'hf'), ('is', 0.001, 'hf')]





In [None]:
# Third element of every (word, epsilon, kind) triple, i.e. its frequency class.
labels_tokens = [sample[2] for sample in w_e_pairs_simple ]

# Not the last expression of the cell, so this slice is evaluated but not displayed.
labels_tokens[:10]

# Number of vocabulary entries per frequency class.
c = Counter(labels_tokens)
c

Counter({'hf': 1299, 'lf': 145857, 'unk': 1})

In [None]:
N = 5              # matrices are N x N
theta = 0.0001     # passed as `dtheta` to Text_DiagonalBaseDict

epsilon_grid = 0.001   # epsilon used to build the Grid_dU pool

# Seed NumPy's global RNG before sampling: the random unitaries are drawn with
# scipy's `unitary_group.rvs` without an explicit random_state, so without a seed
# the token -> matrix map (and every feature derived from it) changes on each run.
# torch.manual_seed alone does not cover NumPy/SciPy sampling.
np.random.seed(1)

# Token to Unitary(N) map
AA_dict = Text_DiagonalBaseDict( N, w_e_pairs_simple, theta, epsilon_grid )


100%|██████████| 147157/147157 [07:15<00:00, 337.68it/s]


In [None]:
from sklearn.preprocessing import StandardScaler
import seaborn as sns

tqdm.pandas()

In [None]:
def text_pipeline(x):
    """Tokenise the raw text ``x`` with the notebook's ``tokenizer``."""
    return tokenizer(x)


def label_pipeline(x):
    """Encode a dataset label: ``"pos"`` becomes 1, anything else becomes 0."""
    if x == "pos":
        return 1
    return 0


def convert_text_to_features(AA_dict, text, label=None):
    """
    Tokenise ``text`` and compute its embedding vector.

    Args:
        AA_dict: mapping token -> matrix used to build the embedding.
        text: raw text string.
        label: optional class label stored under the ``"Class"`` key.
            It is an explicit argument so the function no longer depends on
            a stray module-level ``label`` variable being defined.

    Returns:
        dict with ``"Text"`` (the embedding cast with ``np.float32``) and
        ``"Class"`` (the given ``label``, ``None`` when omitted).
    """

    text_tokens = text_pipeline(text)

    embedding = create_embeding_matrix( words_to_unitary_dict = AA_dict, text_inputs = text_tokens  )

    sample = {"Text": np.float32(embedding), "Class": label}

    return sample


def create_features_label_df(input_dataframe, AA_dict:dict ):
    """
    Return a frame with the ``Text`` and ``Class`` columns of the input
    followed by one ``feature_<k>`` column per embedding component.

    The input frame is copied, not modified. The embedding of each text is
    computed with ``convert_text_to_features`` under a tqdm progress bar.
    """
    frame = input_dataframe.copy()

    def _embed(text):
        return convert_text_to_features(AA_dict, text)["Text"]

    # Embedded representation of every text (one array per row)
    frame["features"] = frame["Text"].progress_apply(_embed)

    # Spread each array over its own set of columns, named feature_0, feature_1, ...
    feature_columns = frame["features"].apply(pd.Series).add_prefix('feature_')

    return pd.concat([frame[["Text","Class"]], feature_columns], axis=1)

Set up the dataset

In [None]:
# Load the dataset again
train_iter, test_iter = IMDB()

# Materialise both splits as lists so they can be measured with len() and split.
train_dataset = list(train_iter)
test_dataset = list(test_iter)

# 95% of the training examples are kept for training, the rest is held out for validation.
num_train = int(len(train_dataset) * 0.95)

# NOTE(review): random_split is called without an explicit generator, so the split
# presumably follows torch's global RNG (seeded earlier with torch.manual_seed) -- confirm.
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])


# Each example is a (label, text) pair; store it as a record with named fields.
train_pairs = [{"Text": line, "Class": label} for (label, line) in split_train_]
valid_pairs = [{"Text": line, "Class": label} for (label, line) in split_valid_]
test_pairs  = [{"Text": line, "Class": label} for (label, line) in test_dataset]


# Create one pandas DataFrame per split (columns: Text, Class)
text_labels_df_train = pd.DataFrame(train_pairs)
text_labels_df_valid = pd.DataFrame(valid_pairs)
text_labels_df_test = pd.DataFrame(test_pairs)

In [None]:
text_labels_df_train

Unnamed: 0,Text,Class
0,An extra is called upon to play a general in a...,pos
1,almost every review of this movie I'd seen was...,pos
2,I did not have too much interest in watching T...,pos
3,Ulises is a literature teacher that arrives to...,neg
4,I found The FBI Story considerably entertainin...,pos
...,...,...
23745,Peter M. Cohen has a winner satire on the mati...,pos
23746,"Henry, a veterinarian (Paul Rudd), and his bos...",neg
23747,Though the pieces are uneven this collection o...,pos
23748,A very good start. I was a bit surprised to fi...,pos


Using `AA_dict` (the dictionary that maps each word to its random unitary matrix), we create the feature vectors.

In [None]:

# One feature frame per split, all built with the same token -> matrix dictionary.
train_fea_df = create_features_label_df(text_labels_df_train, AA_dict = AA_dict)
valid_fea_df = create_features_label_df(text_labels_df_valid, AA_dict = AA_dict)
test_fea_df  =  create_features_label_df(text_labels_df_test, AA_dict = AA_dict)

100%|██████████| 23750/23750 [00:23<00:00, 1017.29it/s]
100%|██████████| 1250/1250 [00:01<00:00, 998.78it/s] 
100%|██████████| 25000/25000 [00:24<00:00, 1022.40it/s]


In [None]:

# Column layout of the feature frames: 0 = Text, 1 = Class, 2.. = feature_<k>.

# Train pairs
X_train_raw = train_fea_df.iloc[:, 2:]   # feature columns only
y_train = train_fea_df.iloc[:, 1]        # Class column

# Valid pairs
X_valid_raw = valid_fea_df.iloc[:, 2:]
y_valid = valid_fea_df.iloc[:, 1]

# Test pairs
X_test_raw = test_fea_df.iloc[:, 2:]
y_test = test_fea_df.iloc[:, 1]


In [None]:
# Standardize Input
scaler = StandardScaler()

In [None]:
# The scaler is fitted on the training features only; validation and test
# features are transformed with the statistics learned from the training set.
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# MODEL

This is a simple linear model: a single fully connected layer that maps the N×N input features directly to the class scores (there is no hidden layer).

In [None]:
class TextClassificationModelQM(nn.Module):
    """Single fully connected layer mapping an embedding vector to class scores."""

    def __init__(self,  embed_dim, num_class):
        super().__init__()
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw the weights uniformly from [-0.5, 0.5] and set the bias to zero."""
        bound = 0.5
        with torch.no_grad():
            self.fc.weight.uniform_(-bound, bound)
            self.fc.bias.zero_()

    def forward(self, embedding):
        """Return the raw class scores for a batch of embedding vectors."""
        scores = self.fc(embedding)
        return scores

In [None]:

def train(dataloader):
    """
    Run one training pass over ``dataloader``.

    Relies on notebook-level globals: ``model``, ``optimizer`` and
    ``criterion``, plus ``epoch`` (read only when a progress line is printed).

    Args:
        dataloader: iterable of ``(features, label)`` batches.

    Returns:
        The sum over all batches of ``batch_loss * batch_size``. Divide it by
        the number of samples to obtain the mean loss per sample.
    """
    print("Training...")
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    running_loss = 0.0

    for idx, (text, label) in enumerate(dataloader):

        optimizer.zero_grad()
        predited_label = model(text)
        loss = criterion(predited_label, label)
        loss.backward()

        # Clip the total gradient norm to 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        optimizer.step()

        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            # Accuracy over the batches seen since the previous progress line.
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

        # Weight the batch loss by the batch size (the last batch may be smaller).
        running_loss += loss.item() *text.shape[0]

    return running_loss
    
def evaluate(dataloader):
    """
    Return the accuracy of the notebook-level ``model`` on ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        dataloader: iterable of ``(features, label)`` batches.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for text, label in dataloader:
            predited_label = model(text)
            # Only the accuracy is reported, so no loss is computed here.
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


In [None]:
# Create model
# Number of classes = number of distinct labels collected while scanning the dataset.
num_class = len(list(set(labels)))
print(num_class)

# The input width N*N is the size passed as `embed_dim`;
# NOTE(review): presumably equal to the number of feature_<k> columns -- confirm.
model = TextClassificationModelQM(N*N, num_class).to(device)

2


In [None]:
# Hyperparameters
EPOCHS = 10         # number of passes over the training loader
LR = 0.001           # initial learning rate handed to the optimizer
BATCH_SIZE = 32     # batch size used by the train, validation and test loaders


In [None]:
# Dataset wrapper used to create the data loaders

class CustomTextDatasetNEW(Dataset):
    """
    Pairs a feature container with a label container, sample by sample.

    ``X_data`` is indexed positionally. ``y_data`` may be a pandas object
    (anything exposing ``.iloc``) or a plain sequence; in both cases the
    label at the same *position* as the features is returned.
    """

    def __init__(self, X_data, y_data):

        self.X_data = X_data
        self.y_data = y_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        # Plain `series[index]` on a pandas Series is a lookup by index *label*,
        # which only coincides with the position for a default 0..n-1 index.
        # Use positional access when it is available so that features and label
        # always come from the same row.
        labels = self.y_data
        if hasattr(labels, "iloc"):
            raw_label = labels.iloc[index]
        else:
            raw_label = labels[index]

        target = label_pipeline(raw_label)

        return self.X_data[index],  torch.tensor(target, dtype=torch.int64).to(device)

In [None]:

# Define the dataset objects: the scaled features are converted to float tensors
# and moved to `device` up front; the labels are passed through unchanged.

TD_train = CustomTextDatasetNEW(
    torch.FloatTensor(X_train).to(device),y_train)

TD_valid = CustomTextDatasetNEW(
    torch.FloatTensor(X_valid).to(device),y_valid)

TD_test = CustomTextDatasetNEW(
    torch.FloatTensor(X_test).to(device), y_test)



# Only the training loader shuffles; validation and test keep their order.
train_dataloader = DataLoader(TD_train, batch_size=BATCH_SIZE,
                              shuffle=True, )
valid_dataloader = DataLoader(TD_valid, batch_size=BATCH_SIZE,
                              shuffle=False,)
test_dataloader = DataLoader(TD_test, batch_size=BATCH_SIZE,
                             shuffle=False, )


Train

In [None]:

criterion = torch.nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=LR)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Halve the learning rate every 3 epochs (step_size is an integer epoch count).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 3, gamma = 0.5)

total_accu = None   # validation accuracy of the most recent epoch


losses = []         # mean training loss per sample, one entry per epoch

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    running_loss = train(train_dataloader)

    accu_val = evaluate(valid_dataloader)

    # The learning-rate schedule advances unconditionally once per epoch.
    scheduler.step()

    total_accu = accu_val

    # train() returns the sum of batch_loss * batch_size, i.e. a per-sample sum,
    # so the mean is obtained by dividing by the number of samples. Dividing by
    # the number of batches would inflate the value by roughly the batch size.
    epoch_loss = running_loss / len(train_dataloader.dataset)
    losses.append(epoch_loss)

    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))

    print('Epoch-{0} loss: {1}'.format(epoch, losses[-1]))

    print('-' * 59)

Training...
| epoch   1 |   500/  743 batches | accuracy    0.726
-----------------------------------------------------------
| end of epoch   1 | time:  1.21s | valid accuracy    1.000 
Epoch-1 lr: 0.001
Epoch-1 loss: 16.168740903424702
-----------------------------------------------------------
Training...
| epoch   2 |   500/  743 batches | accuracy    1.000
-----------------------------------------------------------
| end of epoch   2 | time:  0.98s | valid accuracy    1.000 
Epoch-2 lr: 0.001
Epoch-2 loss: 3.4194983155397676
-----------------------------------------------------------
Training...
| epoch   3 |   500/  743 batches | accuracy    1.000
-----------------------------------------------------------
| end of epoch   3 | time:  0.96s | valid accuracy    1.000 
Epoch-3 lr: 0.0005
Epoch-3 loss: 0.942288370619993
-----------------------------------------------------------
Training...
| epoch   4 |   500/  743 batches | accuracy    1.000
----------------------------------------