## non-negative matrix factorization (NMF) based topic modeling
This notebook presents the NMF approach

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random

# step 1: import
import torch
from torch import nn 
from torch import optim
from torch.optim.lr_scheduler import StepLR

from utils import *
from estimators import *

### Gensim
#import gensim
#import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
### load NMF utility functions
from nmf_util import *
### load coherence score
#import gensim.downloader as api
from coherence_score import *

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report

In [15]:
a=np.array([[1,2,3,-2],[1,-11,3,4]])
b=(a>0)*a
c=np.linalg.norm(b,2)
c

5.626891130072679

In [2]:
from gensim.models import KeyedVectors
glove_file = './glove.6B/glove.6B.100d.txt'
tmp_file = "./test_word2vec.txt"

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

# 加载转化后的文件
model_glove = KeyedVectors.load_word2vec_format(tmp_file)

  


### Load Data from Json

In [3]:
### json load the dataset
with open('../cleaned_data/Spam_Ham.json', 'r') as jf:
    cleaned_data = json.load(jf)

In [4]:
### split data into 'sentence' and 'label'
sentences = [it['sentence'] for it in cleaned_data]
labels = [it['label'] for it in cleaned_data]

In [5]:
set(labels)

{'ham', 'spam'}

### Load pre-trained GloVe embeddings

In [6]:
#model_glove = api.load("./glove.6B/glove.6B.100d.txt")   ## load pretrained glove embeddings


### Use Count Vectors as features

In [7]:
## convert the corpora to Count vectors
count = CountVectorizer(max_df=.95, min_df=10, max_features=5000)
x_count = count.fit_transform(sentences)
## convert to matrix --- feature-document matrix
count_mat = x_count.toarray().T 

In [8]:
## features
features = count.get_feature_names()
len(list(features)),len(list(labels))

(869, 5572)

# 1. gassian_method L2 loss


In [9]:
## NMF methods for topic modeling
k = 100   ## the number of topics -- tune it for better result
W0,H0,err0=gaussian_method(count_mat, k, max_iter=4)  ## will return factor matrices: W, H and root mean squared error


In [10]:
np.square(count_mat - W0@H0).sum()

22905.426368242854

## coherence score

In [11]:
dic0 = top_keywords(W0, features, num=20)

In [12]:
## compute the coherence score for each topic
coherence_vec = []
for i in range(W0.shape[1]):  
    coherence_vec.append(coherence(dic0[i], model_glove))

In [13]:
np.mean(coherence_vec)   ## the mean coherence score of all topics

0.35705584

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report


indices = list(range(len(labels)))   ## indices of documents

## split data into train and test
ind_train, ind_test, y_train, y_test = train_test_split(
    indices, labels, test_size=0.2, random_state=2021, stratify=labels)

## train/test datasets

#H0 = H0.detach().numpy()
print(H0.shape)
x_train, x_test = H0[:, ind_train],H0[:, ind_test]

## encode labels to integers
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(y_train)
Y_test = Encoder.fit_transform(y_test)


# Classifier - Algorithm - SVM -- linear kernel
# fit the training dataset on the classifier
SVM = svm.SVC(C=1., kernel='linear', degree=3, gamma='auto', random_state=82, class_weight='balanced')
SVM.fit(x_train.T, Y_train)# predict the labels on validation dataset
predictions_SVM = SVM.predict(x_test.T) # make predictions
print(classification_report(Y_test, predictions_SVM, digits=3))

(100, 5572)
              precision    recall  f1-score   support

           0      0.981     0.978     0.980       966
           1      0.862     0.879     0.870       149

    accuracy                          0.965      1115
   macro avg      0.922     0.929     0.925      1115
weighted avg      0.965     0.965     0.965      1115



# 2. SGD without MI

In [15]:
A = torch.FloatTensor(count_mat)
#A. type
print(A.shape)
A

torch.Size([869, 5572])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [16]:
class SeparableCritic(nn.Module):
    """Separable critic. where the output value is g(x) h(y). """

    def __init__(self, dim1, dim2, hidden_dim, embed_dim, layers, activation, **extra_kwargs):
        super(SeparableCritic, self).__init__()
        self._g = mlp(dim1, hidden_dim, embed_dim, layers, activation)
        self._h = mlp(dim2, hidden_dim, embed_dim, layers, activation)

    def forward(self, x, y):
        scores = torch.matmul(self._h(y), self._g(x).t())
        return scores



In [17]:
#step 2: create model Class
class GaussianNMF(torch.nn.Module):
    """
    centralized model training with stochastic gradient descent
    class for Non-Negetive Matrix Multiplication using Gaussian Method
    
    """
    def __init__(self, A, k):
        """ initialization """
        super(GaussianNMF, self).__init__()
        self.rows = A.size(0)
        self.cols = A.size(1)
        self.A = A
        
        self.W = torch.nn.Parameter(torch.rand(self.rows, k), requires_grad=True) 
        self.H = torch.nn.Parameter(torch.rand(k, self.cols), requires_grad=True)
        #print(self.H)
        self.num_topics = k
        
  
    def forward(self):
        return self.W.matmul(self.H) 
  
    def batch_gd_train(self, epochs, batch_size, lr):
        """
        train with full batch gradient descent, i.e., all data in a batch for each iteration
        :params[in]: epochs,
        
        :params[out]: W, H
        """
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)
        for i in range(epochs):
            pred = self.forward()
            loss = (self.A - pred).pow(2).sum()
            ## backward
            optimizer.zero_grad()   ## zero all gradients
            loss.backward()   ## find derivatives
            optimizer.step()  
            self.W.data[self.W.data < 0] = 0. 
            self.H.data[self.H.data < 0] = 0. 
            print('loss at Epoch ',i, ' ',loss.item())
        ## return
        return self.W, self.H
        
    ## split an iterable of items into batches 
    def chunks(self, ls, batch_size): 
        """ 
        Yield successive n-sized chunks from ls, an iterable. 
        :params[in]: ls, an iterable of items 
        :params[in]: batch_size, an integer, batch size 
        returns a generator 
        """ 
        for i in range(0, len(ls), batch_size): 
            yield ls[i:i + batch_size]
        
    '''def estimate_mutual_information(estimator, x, y, critic_fn,
                                baseline_fn=None, alpha_logit=None, **kwargs):
        
        if estimator == 'smile':
            mi = smile_lower_bound(scores, **kwargs)
    '''

    def sgd_train(self, epochs, batch_size, lr):
        """
        train with stochastic gradient descent
        :params[in]: epochs,
        
        :params[out]: W, H
        
        ** x = mini_data.T
        y = mini_datah.T
        """
        
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)                           
        scheduler= StepLR(optimizer, step_size=10, gamma=0.8)
        data_index = list(range(self.cols))   ## all column indices
        for i in range(epochs):
            mini_batches = self.chunks(data_index, batch_size)
            for it in mini_batches:
                mini_data = A[:, it]
                
                mini_datah= self.H[:, it]
               
                ## data in a minibatch
                pred = self.forward()[:, it]  ## prediction
                loss = (mini_data - pred).pow(2).sum()
                ## backward
                optimizer.zero_grad()   ## zero all gradients
                loss.backward()   ## find derivatives
                optimizer.step()  
                self.W.data[self.W.data < 0] = 0. 
                self.H.data[self.H.data < 0] = 0.
               
            ## shuffle indices
            scheduler.step()
            np.random.shuffle(data_index) 
            
            ## current loss
            cur_loss = (self.A - self.W@self.H).pow(2).sum()
            print('loss at Epoch ',i, ' ',cur_loss.item())
        ## return             
        return self.W, self.H

 

In [18]:
nmf_method1 = GaussianNMF(A, 100) ## matrix factorization
W1, H1 = nmf_method1.sgd_train(epochs=40, batch_size=1024, lr=2.e-2)

loss at Epoch  0   30499714.0
loss at Epoch  1   391396.78125
loss at Epoch  2   53144.7578125
loss at Epoch  3   41797.0703125
loss at Epoch  4   40642.203125
loss at Epoch  5   40160.7734375
loss at Epoch  6   39861.109375
loss at Epoch  7   39676.34375
loss at Epoch  8   39336.4140625
loss at Epoch  9   39010.14453125
loss at Epoch  10   39182.234375
loss at Epoch  11   37402.390625
loss at Epoch  12   36640.6953125
loss at Epoch  13   35908.578125
loss at Epoch  14   35099.38671875
loss at Epoch  15   34248.765625
loss at Epoch  16   33346.484375
loss at Epoch  17   32452.044921875
loss at Epoch  18   31507.541015625
loss at Epoch  19   30594.625
loss at Epoch  20   29881.5546875
loss at Epoch  21   29211.921875
loss at Epoch  22   28565.31640625
loss at Epoch  23   27974.58984375
loss at Epoch  24   27439.55859375
loss at Epoch  25   26937.6484375
loss at Epoch  26   26472.546875
loss at Epoch  27   26042.58984375
loss at Epoch  28   25652.1484375
loss at Epoch  29   25293.7460937

In [19]:
list(nmf_method1.parameters())

[Parameter containing:
 tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0024, 0.0000],
         [0.0000, 0.0060, 0.0000,  ..., 0.0000, 0.0321, 0.0160],
         [0.0000, 0.0000, 0.0065,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0029, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0143, 0.0034, 0.0000,  ..., 0.0000, 0.0000, 0.0263],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
        requires_grad=True),
 Parameter containing:
 tensor([[0.0000e+00, 0.0000e+00, 2.7522e-01,  ..., 0.0000e+00, 2.6952e-01,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 4.1900e-04,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 2.9258e-02,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         ...,
         [0.0000e+00, 0.0000e+00, 7.5245e-02,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.5017e-04, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          5.2186e-05],
         [0

In [20]:
(A-W1@H1).pow(2).sum()

tensor(23033.5703, grad_fn=<SumBackward0>)

In [21]:
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report



indices = list(range(len(labels)))   ## indices of documents

## split data into train and test
ind_train, ind_test, y_train, y_test = train_test_split(
    indices, labels, test_size=0.2, random_state=2021, stratify=labels)
## train/test datasets
H1 = H1.detach().numpy()
x_train, x_test = H1[:, ind_train],H1[:, ind_test]
## encode labels to integers
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(y_train)
Y_test = Encoder.fit_transform(y_test)


# Classifier - Algorithm - SVM -- linear kernel
# fit the training dataset on the classifier
SVM = svm.SVC(C=1., kernel='linear', degree=3, gamma='auto', random_state=82, class_weight='balanced')
SVM.fit(x_train.T, Y_train)# predict the labels on validation dataset
predictions_SVM = SVM.predict(x_test.T) # make predictions
print(classification_report(Y_test, predictions_SVM, digits=3))

              precision    recall  f1-score   support

           0      0.987     0.962     0.974       966
           1      0.787     0.919     0.848       149

    accuracy                          0.956      1115
   macro avg      0.887     0.941     0.911      1115
weighted avg      0.961     0.956     0.957      1115



## coherence score

In [22]:
dic1 = top_keywords(W1, features, num=20)

In [23]:
## compute the coherence score for each topic
coherence_vec = []
for i in range(W1.shape[1]):  
    coherence_vec.append(coherence(dic1[i], model_glove))
    
np.mean(coherence_vec)   ## the mean coherence score of all topics

0.34759513

# 3. SGD with MI

In [24]:
A = torch.FloatTensor(count_mat)
#A. type
print(A.shape)
#A=A.type(torch.FloatTensor)
A

torch.Size([869, 5572])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [25]:
class GaussianNMF_MI(GaussianNMF):
    """
    Centralized training NMF model
    class for Non-Negetive Matrix Factorization using Gaussian Method 
    with mutual information regularizer
    
    """
    def __init__(self, A, k, critic_config):
        """ initialization
        
        :params[in]: A, k
        :params[in]: critic_config, a dictionary, 
        
        
        """
        super(GaussianNMF_MI, self).__init__(A, k)  ## instantiate super class
        self.critic_config = critic_config
        ### critic function for computing mutual information
        self.critic_config['dim1'] = self.rows
        self.critic_config['dim2'] = k
        self.critic = SeparableCritic(**self.critic_config)
  

    def sgd_train(self, epochs, batch_size, lr, xi, step_size=10, gamma=0.9, W_init = None):
        """
        train with stochastic gradient descent
        :params[in]: epochs,
        
        :params[out]: W, H
        
        ** 
        x = mini_data.T
        y = mini_datah.T
        """
        if W_init is not None:
            self.W.data = W_init.data
        optimizer = torch.optim.SGD(self.parameters(), lr)
        scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
        data_index = list(range(self.cols))   ## all column indices
        for i in range(epochs):
            mini_batches = self.chunks(data_index, batch_size)
            for it in mini_batches:
                mini_data = A[:, it]
                mini_datah= self.H[:, it]

                # calculate mi 
                mi = estimate_mutual_information('smile', mini_data.T, mini_datah.T, 
                                                 self.critic)
                ## data in a minibatch
                pred = self.forward()[:, it]  ## prediction
                #guassian
                loss = (mini_data - pred).pow(2).sum() - xi * mi
                #possian
                #loss = (pred-mini_data*torch.log(pred)).sum()-xi*mi
              
                ## backward
                #lr = scheduler.get_lr()
                optimizer.zero_grad()   ## zero all gradients
                loss.backward()## find derivatives
                #load_state_dict(state_dict)
                optimizer.step()  
                self.W.data[self.W.data < 0] = 0. 
                self.H.data[self.H.data < 0] = 0. 
            print()
            ## renew learning rate
            #print('Epoch前:', i,'LR:', lr)
            scheduler.step()
            #print('Epoch:', i,'LR:', lr)
            ## shuffle indices
            np.random.shuffle(data_index) 
            ## current loss
            cur_loss = (self.A - self.W@self.H).pow(2).sum()
            print('loss at Epoch ',i, ' ',cur_loss.item())
            #print(lr)
        ## return
        return self.W, self.H

In [26]:
A = torch.FloatTensor(count_mat)
#A. type
print(A.shape)
#A=A.type(torch.FloatTensor)
A

torch.Size([869, 5572])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [27]:

#NMF-MI method 
k=100

#A = torch.randn(count_mat.shape[0],count_mat.shape[1])
critic_params = {
    'layers': 0,
    'embed_dim': 32,
    'hidden_dim': 256,
    'activation': 'relu',
}
##init nmf especially the W, H
nmf1 = GaussianNMF_MI(A, k, critic_params)
##put W in every client

W, H = nmf1.sgd_train(epochs=50, batch_size=1024, lr=4.e-2, xi = 1.e-1, gamma=0.95)#, W_init = torch.rand([2153, 20])) 
#def server_train(self, epoch, client_num, batch_sieze, lr, xi)
#W, H = nmf1.server_train(epoch=4, client_num = 3, batch_size=128, lr=2.e-4, xi=0 )
                 


loss at Epoch  0   99597952.0

loss at Epoch  1   1095882.75

loss at Epoch  2   108404.4375

loss at Epoch  3   39850.5625

loss at Epoch  4   39259.51953125

loss at Epoch  5   37959.390625

loss at Epoch  6   36357.328125

loss at Epoch  7   35605.75

loss at Epoch  8   34466.6640625

loss at Epoch  9   33219.51953125

loss at Epoch  10   31581.86328125

loss at Epoch  11   30005.857421875

loss at Epoch  12   28654.986328125

loss at Epoch  13   27444.9765625

loss at Epoch  14   26631.48828125

loss at Epoch  15   25632.765625

loss at Epoch  16   24948.7109375

loss at Epoch  17   24186.6328125

loss at Epoch  18   23456.22265625

loss at Epoch  19   22825.390625

loss at Epoch  20   22377.529296875

loss at Epoch  21   21971.837890625

loss at Epoch  22   21767.09765625

loss at Epoch  23   21420.6328125

loss at Epoch  24   21167.439453125

loss at Epoch  25   20991.4296875

loss at Epoch  26   20794.26171875

loss at Epoch  27   20631.2109375

loss at Epoch  28   20530.578125

In [35]:
nmf1.critic.parameters()

<generator object Module.parameters at 0x7faf9e0f2750>

In [39]:
nmf1.W

Parameter containing:
tensor([[0.0000, 0.0084, 0.0208,  ..., 0.0000, 0.0000, 0.0078],
        [0.0564, 0.0000, 0.0134,  ..., 0.0000, 0.0000, 0.1506],
        [0.0000, 0.0256, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0191, 0.0000, 0.0067,  ..., 0.0000, 0.0000, 0.0000],
        [0.0308, 0.0640, 0.0000,  ..., 0.0000, 0.0011, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0491, 0.0000, 0.0000]],
       requires_grad=True)

In [38]:
list(nmf1.named_parameters())

[('W',
  Parameter containing:
  tensor([[0.0000, 0.0084, 0.0208,  ..., 0.0000, 0.0000, 0.0078],
          [0.0564, 0.0000, 0.0134,  ..., 0.0000, 0.0000, 0.1506],
          [0.0000, 0.0256, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0191, 0.0000, 0.0067,  ..., 0.0000, 0.0000, 0.0000],
          [0.0308, 0.0640, 0.0000,  ..., 0.0000, 0.0011, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0491, 0.0000, 0.0000]],
         requires_grad=True)),
 ('H',
  Parameter containing:
  tensor([[0.0000e+00, 1.8006e-04, 4.9687e-03,  ..., 0.0000e+00, 0.0000e+00,
           0.0000e+00],
          [0.0000e+00, 0.0000e+00, 3.9967e-03,  ..., 0.0000e+00, 0.0000e+00,
           0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 5.7767e-10, 0.0000e+00,
           0.0000e+00],
          ...,
          [0.0000e+00, 5.0677e-03, 0.0000e+00,  ..., 6.8839e-10, 0.0000e+00,
           0.0000e+00],
          [0.0000e+00, 0.0000e+00, 1.5829e-01,  ..., 5.6974e-10, 0.0000e+00,

In [42]:
nmf1.critic.parameters()

<generator object Module.parameters at 0x7fafa4d45250>

In [61]:
class GaussianNMF_MI_separate(GaussianNMF):
    """
    Centralized separately training NMF model and mutual information parameters
    We train mutual information parameters and other NMF parameters separately...
    train mutual information parameters first and NMF parameters later in each iteration
    class for Non-Negetive Matrix Factorization using Gaussian Method 
    with mutual information regularizer
    
    
    """
    def __init__(self, A, k, critic_config):
        """ initialization
        
        :params[in]: A, k
        :params[in]: critic_config, a dictionary, 
        
        
        """
        super(GaussianNMF_MI_separate, self).__init__(A, k)  ## instantiate super class
        self.critic_config = critic_config
        ### critic function for computing mutual information
        self.critic_config['dim1'] = self.rows
        self.critic_config['dim2'] = k
        self.critic = SeparableCritic(**self.critic_config)
  

    def sgd_train(self, epochs, batch_size, lr, mi_lr, xi, step_size=10, gamma=0.9, W_init = None):
        """
        train with stochastic gradient descent
        :params[in]: epochs,
        
        :params[out]: W, H
        
        ** 
        x = mini_data.T
        y = mini_datah.T
        """
        if W_init is not None:
            self.W.data = W_init.data
        optimizer = torch.optim.SGD([self.W, self.H], lr=lr)
        mi_optimizer = torch.optim.SGD(nmf1.critic.parameters(), lr=mi_lr)
        scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
        mi_scheduler = StepLR(mi_optimizer, step_size=step_size, gamma=gamma)
        data_index = list(range(self.cols))   ## all column indices
        for i in range(epochs):
            mini_batches = self.chunks(data_index, batch_size)
            for it in mini_batches:
                mini_data = A[:, it]
                mini_datah= self.H[:, it]

                # calculate mi 
                #mi = estimate_mutual_information('smile', mini_data.T, mini_datah.T, 
                #                                 self.critic)
                mi_loss = -estimate_mutual_information('smile', mini_data.T, mini_datah.T.detach(), 
                                                 self.critic)
                mi_optimizer.zero_grad()   ## zero all gradients
                mi_loss.backward()         ## find derivatives
                mi_optimizer.step()  
                
                ## data in a minibatch
                pred = self.forward()[:, it]  ## prediction
                ## compute mutual information
                mi = estimate_mutual_information('smile', mini_data.T, mini_datah.T, 
                                                 self.critic)
                #guassian
                loss = (mini_data - pred).pow(2).sum() - xi * mi
                ## backward
                #lr = scheduler.get_lr()
                optimizer.zero_grad()   ## zero all gradients
                loss.backward()## find derivatives
                #load_state_dict(state_dict)
                optimizer.step()  
                self.W.data[self.W.data < 0] = 0. 
                self.H.data[self.H.data < 0] = 0. 
            print()
            ## renew learning rate
            #print('Epoch前:', i,'LR:', lr)
            scheduler.step()
            mi_scheduler.step()
            #print('Epoch:', i,'LR:', lr)
            ## shuffle indices
            np.random.shuffle(data_index) 
            ## current loss
            cur_loss = (self.A - self.W@self.H).pow(2).sum()
            print('loss at Epoch ',i, ' ',cur_loss.item())
            #print(lr)
        ## return
        return self.W, self.H

In [63]:
#NMF-MI method 
k=100

#A = torch.randn(count_mat.shape[0],count_mat.shape[1])
critic_params = {
    'layers': 0,
    'embed_dim': 32,
    'hidden_dim': 256,
    'activation': 'relu',
}
##init nmf especially the W, H
nmf1 = GaussianNMF_MI_separate(A, k, critic_params)
##put W in every client

W, H = nmf1.sgd_train(epochs=50, batch_size=256, lr=4.e-2,mi_lr=4.e-2, xi = 1.e-1, gamma=0.95)#, W_init = torch.rand([2153, 20])) 



loss at Epoch  0   4574194.0

loss at Epoch  1   46369.63671875

loss at Epoch  2   40113.1015625

loss at Epoch  3   39845.421875

loss at Epoch  4   39091.54296875

loss at Epoch  5   36612.8984375

loss at Epoch  6   35432.2265625

loss at Epoch  7   33708.671875

loss at Epoch  8   31520.47265625

loss at Epoch  9   30095.58203125

loss at Epoch  10   28205.390625

loss at Epoch  11   26855.47265625

loss at Epoch  12   25717.77734375

loss at Epoch  13   24545.767578125

loss at Epoch  14   23799.6328125

loss at Epoch  15   23101.685546875

loss at Epoch  16   22593.44140625

loss at Epoch  17   22212.4453125

loss at Epoch  18   21809.0

loss at Epoch  19   21442.146484375

loss at Epoch  20   21061.83203125

loss at Epoch  21   20784.375

loss at Epoch  22   20593.79296875

loss at Epoch  23   20551.4296875

loss at Epoch  24   20956.1875

loss at Epoch  25   20512.0625

loss at Epoch  26   20091.421875

loss at Epoch  27   20069.775390625

loss at Epoch  28   19856.390625

lo

In [64]:
nmf1.H

Parameter containing:
tensor([[3.2984e-03, 0.0000e+00, 0.0000e+00,  ..., 4.9399e-09, 0.0000e+00,
         0.0000e+00],
        [4.3528e-03, 0.0000e+00, 0.0000e+00,  ..., 5.4490e-09, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 6.5436e-10, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 3.6703e-01,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.1457e-02, 1.3531e-02, 0.0000e+00,  ..., 0.0000e+00, 7.0634e-04,
         0.0000e+00]], requires_grad=True)

###### SVM Classifier

In [65]:
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report

In [66]:
indices = list(range(len(labels)))   ## indices of documents

In [67]:
## split data into train and test
ind_train, ind_test, y_train, y_test = train_test_split(
    indices, labels, test_size=0.2, random_state=2021, stratify=labels)

In [70]:

## train/test datasets

H = H.detach().numpy()
print(H.shape)
x_train, x_test = H[:, ind_train],H[:, ind_test]

(100, 5572)


In [71]:
## encode labels to integers
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(y_train)
Y_test = Encoder.fit_transform(y_test)


# Classifier - Algorithm - SVM -- linear kernel
# fit the training dataset on the classifier
SVM = svm.SVC(C=1., kernel='linear', degree=3, gamma='auto', random_state=82, class_weight='balanced')
SVM.fit(x_train.T, Y_train)# predict the labels on validation dataset
predictions_SVM = SVM.predict(x_test.T) # make predictions
print(classification_report(Y_test, predictions_SVM, digits=3))

              precision    recall  f1-score   support

           0      0.986     0.977     0.982       966
           1      0.861     0.913     0.886       149

    accuracy                          0.969      1115
   macro avg      0.924     0.945     0.934      1115
weighted avg      0.970     0.969     0.969      1115



## coherence score 

In [72]:
dic = top_keywords(W, features, num=20)

In [73]:
## compute the coherence score for each topic
coherence_vec = []
for i in range(W.shape[1]):  
    coherence_vec.append(coherence(dic[i], model_glove))

np.mean(coherence_vec)   ## the mean coherence score of all topics

0.34049606

# Federated learning SGD

In [74]:
A = torch.FloatTensor(count_mat)
#A. type
print(A.shape)
#A=A.type(torch.FloatTensor)


torch.Size([869, 5572])


In [75]:
def evaluation(H, labels):
    indices = list(range(len(labels)))   ## indices of documents
    
    ## split data into train and test
    ind_train, ind_test, y_train, y_test = train_test_split(
        indices, labels, test_size=0.2, random_state=2021, stratify=labels)
    H_new = H.detach().numpy()
    x_train, x_test = H_new[:, ind_train],H_new[:, ind_test]
    
    ## encode labels to integers
    Encoder = LabelEncoder()
    Y_train = Encoder.fit_transform(y_train)
    Y_test = Encoder.fit_transform(y_test)


    # Classifier - Algorithm - SVM -- linear kernel
    # fit the training dataset on the classifier
    SVM = svm.SVC(C=1., kernel='linear', degree=3, gamma='auto', random_state=82, class_weight='balanced')
    SVM.fit(x_train.T, Y_train)# predict the labels on validation dataset
    predictions_SVM = SVM.predict(x_test.T) # make predictions
    print(classification_report(Y_test, predictions_SVM, digits=3))

In [32]:
help(random_sample)

NameError: name 'random_sample' is not defined

In [66]:
import random

class Fed_NMF(nn.Module):
    """
    Federaated NMF 
    """
    def __init__(self, A, k, K):
        """
        initialization 
        :params[in], A, full matrix of all texts data
        :params[in], k, number of topics
        :params[in], K, number of clients
        
        """
        super(Fed_NMF, self).__init__()
        self.A = A
        self.rows,self.cols = A.size()
        self.k = k  ## topic number
        self.K = K  ## number of clients

    def split_clients(self):
        """
        split the full matrix into m clients by column iid
        
        :params[in]: K, the number of clients
        
        :params[out]: B, tuple of tensors
        """
        data_index = list(range(self.cols))   ## all column indices
        #np.random.shuffle(data_index) 
        ## split into chunks after shuffle
        B = torch.chunk(self.A[:,data_index], self.K, dim = 1)
        return B
    
    def split_clients_non_iid(self):
        """
        produce non-iid datasets for clients
        """
        
        client_dis=dirichlet([1,1], self.K)
        spam=[]
        ham=[]
        num=500
        for i in range(len(labels)):
            if labels[i]=='ham':
                ham.append(i)
            elif labels[i]=='spam':
                spam.append(i)
        client_sample=[]
        for i in client_dis: 
            spam_sample=np.random.choice(spam, size=int(i[0]*num),replace=False)
            ham_sample=np.random.choice(ham, size=num-int(i[0]*num))
            client_sample.append(np.concatenate((spam_sample,ham_sample)))
            n=torch.from_numpy(np.array(client_sample))
            B=torch.chunk(n,self.K)
        return B


    def split_into_chunks(self, ls, batch_size):
        """
        split a list of number into chunks up to a certain batch_size
        
        :params[in]: ls, a list of numbers
        :params[in]: batch_size, an interger     
        
        :params[out]: a generator
        """
        np.random.shuffle(ls) 
        for i in range(0, len(ls), batch_size): 
            yield ls[i:i + batch_size]

    
    def server_train(self, labels, iters, C, epoch, batch_size, lr, xi):
        """
        federated learning
        
        :params[in], iters, the number of interations for fedrated learning
        :params[in], C, the fraction of clients for each iteration
        :params[in], epoch, the number of epochs for local SGD
        :params[in], batch_size, the batch_size for local SGD
        :params[in], lr, the learning rate for local SGD
        :params[in], xi, the mutual information for local SGD
        
        :params[out]
        :
        """
        m = int(max(C*self.K,1))  ## number of clients for each iteration
        ## split whole dataset into K clients
        # torch.manual_seed(4)
        B = self.split_clients()
        ## column number for all clients -- using list comprehension
        num_cols = [it.size(1) for it in B]
        #Cols = num_cols.sum()
        ## initialization of W tensor
        W = torch.rand(self.rows, self.k)
        ## nmf model for all clients using dictionary comprehension
        nmf_models = {i: GaussianNMF_MI(B[i], self.k, critic_params) for i in range(self.K)}
        random.seed(18)
        for i in range(iters):     ## each iteration                  
            set_clients = random.sample(list(range(self.K)), m)  ## selection of client's id 
            temp = [num_cols[it1] for it1 in set_clients]            
            col_sum = sum(temp)
            tmp_W = torch.zeros(self.rows, self.k)
            ## for each selected client
            for j in set_clients:
                W1, _ = nmf_models[j].sgd_train(epoch, batch_size, lr, xi, W)
                tmp_W += W1.detach().data * num_cols[j]/col_sum
            W = tmp_W
            H_list = [nmf_models[it2].H.detach().data for it2 in range(self.K) ]
            ## after convergence
            H = torch.cat(H_list , 1)
            evaluation(H,labels)
        return W, H

    def server_train_epoch(self, labels, iters, C, epoch, batch_size, lr, xi):
        """
        federated learning by training over all clients in each iteration
        
        :params[in], iters, the number of interations for fedrated learning
        :params[in], list, each data's label
        :params[in], C, the fraction of clients for each iteration
        :params[in], epoch, the number of epochs for local SGD
        :params[in], batch_size, the batch_size for local SGD
        :params[in], lr, the learning rate for local SGD
        :params[in], xi, the mutual information for local 
        SGD
        
        :params[out]
        :
        """
        m = int(max(C*self.K,1))  ## batch_size for clients
        ## split whole dataset into K clients
        #torch.manual_seed(4)
        B = self.split_clients()
        ## column number for all clients -- using list comprehension
        num_cols = [it.size(1) for it in B]
        #Cols = num_cols.sum()
        ## initialization of W tensor
        W = torch.rand(self.rows, self.k)
        ## nmf model for all clients using dictionary comprehension
        nmf_models = {i: GaussianNMF_MI(B[i], self.k, critic_params) for i in range(self.K)}
        #random.seed(18)
        for i in range(iters):     ## each iteration -- training over all clients 
            ## split clients into batches
            batches = self.split_into_chunks(list(range(self.K)), m)  
            ### for loop over all batches of clients
            for set_clients in batches:   ## selection of client's id 
                temp = [num_cols[it1] for it1 in set_clients]            
                col_sum = sum(temp)
                tmp_W = torch.zeros(self.rows, self.k)
                ## for each selected client
                for j in set_clients:
                    W1, _ = nmf_models[j].sgd_train(epoch, batch_size, lr, xi, W)
                    tmp_W += W1.detach().data * num_cols[j]/col_sum
                W = tmp_W     ##update W
            ## list of H matrices for each iteration
            H_list = [nmf_models[it2].H.detach().data for it2 in range(self.K) ]
            ## merge into H
            H = torch.cat(H_list , 1)
            ### evaluate its performance on classification
            evaluation(H, labels)
        return W, H

In [67]:
nmf1 = Fed_NMF(A, 100,30)

In [68]:
dim1,dim2=A.shape[0],100
mi_params = dict(estimator='smile',critic='separable', baseline='unnormalized')

data_params = {
    'dim': dim1,
    'batch_size': 64,
    'cubic': None
}

critic_params = {
     
    'dim1': dim1,
    'dim2': dim2,
    'layers': 2,
    'embed_dim': 32,
    'hidden_dim': 256,
    'activation': 'relu',
}

critic = SeparableCritic(**critic_params)#.cuda()

#A = torch.randn(count_mat.shape[0],count_mat.shape[1])
critic_params = {
    'layers': 2,
    'embed_dim': 32,
    'hidden_dim': 256,
    'activation': 'relu',
}


W, H = nmf1.server_train(labels, iters =20, C=0.2, epoch=40, batch_size=128, lr=2.e-4, xi=0)

loss at Epoch  0   247790.21875
loss at Epoch  1   165534.25
loss at Epoch  2   124303.796875
loss at Epoch  3   100022.9375
loss at Epoch  4   84234.3125
loss at Epoch  5   73235.3125
loss at Epoch  6   65229.8671875
loss at Epoch  7   59166.96875
loss at Epoch  8   54453.28125
loss at Epoch  9   50715.2734375
loss at Epoch  10   47698.8515625
loss at Epoch  11   45220.2578125
loss at Epoch  12   43164.421875
loss at Epoch  13   41438.2578125
loss at Epoch  14   39979.16015625
loss at Epoch  15   38734.69140625
loss at Epoch  16   37667.62109375
loss at Epoch  17   36747.921875
loss at Epoch  18   35950.7265625
loss at Epoch  19   35256.6171875
loss at Epoch  20   34650.26171875
loss at Epoch  21   34118.9375
loss at Epoch  22   33651.75390625
loss at Epoch  23   33240.078125
loss at Epoch  24   32876.4609375
loss at Epoch  25   32554.703125
loss at Epoch  26   32269.51171875
loss at Epoch  27   32016.369140625
loss at Epoch  28   31791.20703125
loss at Epoch  29   31590.763671875
los

loss at Epoch  30   1509.7874755859375
loss at Epoch  31   1509.72998046875
loss at Epoch  32   1509.69140625
loss at Epoch  33   1509.59912109375
loss at Epoch  34   1509.535888671875
loss at Epoch  35   1509.5040283203125
loss at Epoch  36   1509.4638671875
loss at Epoch  37   1509.439453125
loss at Epoch  38   1509.397705078125
loss at Epoch  39   1509.388916015625
              precision    recall  f1-score   support

           0      0.852     0.526     0.650       966
           1      0.118     0.409     0.183       149

    accuracy                          0.510      1115
   macro avg      0.485     0.468     0.417      1115
weighted avg      0.754     0.510     0.588      1115

loss at Epoch  0   5751.79736328125
loss at Epoch  1   3778.0849609375
loss at Epoch  2   2913.8662109375
loss at Epoch  3   2463.036376953125
loss at Epoch  4   2198.453125
loss at Epoch  5   2029.40185546875
loss at Epoch  6   1914.748291015625
loss at Epoch  7   1831.211669921875
loss at Epoch  8  

loss at Epoch  5   1342.682861328125
loss at Epoch  6   1341.9210205078125
loss at Epoch  7   1341.800048828125
loss at Epoch  8   1341.875244140625
loss at Epoch  9   1341.440673828125
loss at Epoch  10   1341.4962158203125
loss at Epoch  11   1341.1614990234375
loss at Epoch  12   1341.20703125
loss at Epoch  13   1341.1004638671875
loss at Epoch  14   1340.92431640625
loss at Epoch  15   1340.83447265625
loss at Epoch  16   1340.7724609375
loss at Epoch  17   1340.69677734375
loss at Epoch  18   1340.63427734375
loss at Epoch  19   1340.540771484375
loss at Epoch  20   1340.541259765625
loss at Epoch  21   1340.430908203125
loss at Epoch  22   1340.4591064453125
loss at Epoch  23   1340.3974609375
loss at Epoch  24   1340.4072265625
loss at Epoch  25   1340.35107421875
loss at Epoch  26   1340.3802490234375
loss at Epoch  27   1340.3360595703125
loss at Epoch  28   1340.347412109375
loss at Epoch  29   1340.303466796875
loss at Epoch  30   1340.286865234375
loss at Epoch  31   1340.

loss at Epoch  22   1374.9580078125
loss at Epoch  23   1374.891357421875
loss at Epoch  24   1374.8909912109375
loss at Epoch  25   1374.862060546875
loss at Epoch  26   1374.89892578125
loss at Epoch  27   1374.896728515625
loss at Epoch  28   1374.89599609375
loss at Epoch  29   1374.889404296875
loss at Epoch  30   1374.89990234375
loss at Epoch  31   1374.8919677734375
loss at Epoch  32   1374.92333984375
loss at Epoch  33   1374.916015625
loss at Epoch  34   1374.9283447265625
loss at Epoch  35   1374.9345703125
loss at Epoch  36   1374.9315185546875
loss at Epoch  37   1374.929931640625
loss at Epoch  38   1374.931884765625
loss at Epoch  39   1374.9302978515625
loss at Epoch  0   1378.8218994140625
loss at Epoch  1   1377.76220703125
loss at Epoch  2   1377.685546875
loss at Epoch  3   1377.1903076171875
loss at Epoch  4   1377.5152587890625
loss at Epoch  5   1376.528076171875
loss at Epoch  6   1377.19091796875
loss at Epoch  7   1376.811767578125
loss at Epoch  8   1376.9052

loss at Epoch  39   1195.094970703125
loss at Epoch  0   1324.45361328125
loss at Epoch  1   1324.326171875
loss at Epoch  2   1321.9622802734375
loss at Epoch  3   1322.855224609375
loss at Epoch  4   1322.09814453125
loss at Epoch  5   1322.15185546875
loss at Epoch  6   1322.0806884765625
loss at Epoch  7   1322.862060546875
loss at Epoch  8   1322.436279296875
loss at Epoch  9   1322.774658203125
loss at Epoch  10   1322.583740234375
loss at Epoch  11   1322.627197265625
loss at Epoch  12   1322.819091796875
loss at Epoch  13   1322.925537109375
loss at Epoch  14   1322.974365234375
loss at Epoch  15   1322.9691162109375
loss at Epoch  16   1323.03662109375
loss at Epoch  17   1323.093994140625
loss at Epoch  18   1323.010498046875
loss at Epoch  19   1323.133056640625
loss at Epoch  20   1323.074462890625
loss at Epoch  21   1323.113037109375
loss at Epoch  22   1323.22216796875
loss at Epoch  23   1323.1884765625
loss at Epoch  24   1323.1943359375
loss at Epoch  25   1323.199218

loss at Epoch  18   1389.97412109375
loss at Epoch  19   1390.035888671875
loss at Epoch  20   1390.09033203125
loss at Epoch  21   1390.1414794921875
loss at Epoch  22   1390.18505859375
loss at Epoch  23   1390.225341796875
loss at Epoch  24   1390.261474609375
loss at Epoch  25   1390.29345703125
loss at Epoch  26   1390.3221435546875
loss at Epoch  27   1390.3482666015625
loss at Epoch  28   1390.37109375
loss at Epoch  29   1390.392578125
loss at Epoch  30   1390.41162109375
loss at Epoch  31   1390.42822265625
loss at Epoch  32   1390.443359375
loss at Epoch  33   1390.4561767578125
loss at Epoch  34   1390.468505859375
loss at Epoch  35   1390.47998046875
loss at Epoch  36   1390.4896240234375
loss at Epoch  37   1390.49853515625
loss at Epoch  38   1390.5064697265625
loss at Epoch  39   1390.513427734375
loss at Epoch  0   1125.4716796875
loss at Epoch  1   1121.175537109375
loss at Epoch  2   1120.14794921875
loss at Epoch  3   1118.952880859375
loss at Epoch  4   1120.3359375

loss at Epoch  38   1239.998046875
loss at Epoch  39   1240.012451171875
loss at Epoch  0   1530.10107421875
loss at Epoch  1   1530.424072265625
loss at Epoch  2   1527.6334228515625
loss at Epoch  3   1527.702880859375
loss at Epoch  4   1528.17333984375
loss at Epoch  5   1528.070556640625
loss at Epoch  6   1528.88720703125
loss at Epoch  7   1528.28076171875
loss at Epoch  8   1528.5244140625
loss at Epoch  9   1528.5760498046875
loss at Epoch  10   1528.5343017578125
loss at Epoch  11   1528.632080078125
loss at Epoch  12   1528.4581298828125
loss at Epoch  13   1528.60888671875
loss at Epoch  14   1528.75537109375
loss at Epoch  15   1528.6649169921875
loss at Epoch  16   1528.955078125
loss at Epoch  17   1528.89599609375
loss at Epoch  18   1528.96337890625
loss at Epoch  19   1529.052490234375
loss at Epoch  20   1529.0682373046875
loss at Epoch  21   1529.05908203125
loss at Epoch  22   1529.11767578125
loss at Epoch  23   1529.207275390625
loss at Epoch  24   1529.154174804

loss at Epoch  14   1323.0325927734375
loss at Epoch  15   1323.0885009765625
loss at Epoch  16   1323.0634765625
loss at Epoch  17   1323.3468017578125
loss at Epoch  18   1323.2900390625
loss at Epoch  19   1323.3841552734375
loss at Epoch  20   1323.31640625
loss at Epoch  21   1323.3133544921875
loss at Epoch  22   1323.330810546875
loss at Epoch  23   1323.3446044921875
loss at Epoch  24   1323.42333984375
loss at Epoch  25   1323.4549560546875
loss at Epoch  26   1323.4725341796875
loss at Epoch  27   1323.494384765625
loss at Epoch  28   1323.479248046875
loss at Epoch  29   1323.499267578125
loss at Epoch  30   1323.51904296875
loss at Epoch  31   1323.525146484375
loss at Epoch  32   1323.5701904296875
loss at Epoch  33   1323.56005859375
loss at Epoch  34   1323.556640625
loss at Epoch  35   1323.557373046875
loss at Epoch  36   1323.565673828125
loss at Epoch  37   1323.5732421875
loss at Epoch  38   1323.5791015625
loss at Epoch  39   1323.5888671875
loss at Epoch  0   1333

loss at Epoch  31   1324.376708984375
loss at Epoch  32   1324.382080078125
loss at Epoch  33   1324.3685302734375
loss at Epoch  34   1324.3817138671875
loss at Epoch  35   1324.3978271484375
loss at Epoch  36   1324.3951416015625
loss at Epoch  37   1324.416748046875
loss at Epoch  38   1324.426025390625
loss at Epoch  39   1324.424560546875
loss at Epoch  0   2018.9921875
loss at Epoch  1   2017.4461669921875
loss at Epoch  2   2016.4072265625
loss at Epoch  3   2018.447021484375
loss at Epoch  4   2018.0479736328125
loss at Epoch  5   2016.0411376953125
loss at Epoch  6   2017.129150390625
loss at Epoch  7   2016.931640625
loss at Epoch  8   2017.299560546875
loss at Epoch  9   2017.47021484375
loss at Epoch  10   2017.201171875
loss at Epoch  11   2017.54833984375
loss at Epoch  12   2017.866455078125
loss at Epoch  13   2018.0216064453125
loss at Epoch  14   2018.029052734375
loss at Epoch  15   2018.227783203125
loss at Epoch  16   2017.9190673828125
loss at Epoch  17   2018.058

loss at Epoch  9   1323.1685791015625
loss at Epoch  10   1322.568603515625
loss at Epoch  11   1322.4305419921875
loss at Epoch  12   1323.0906982421875
loss at Epoch  13   1323.039794921875
loss at Epoch  14   1322.8214111328125
loss at Epoch  15   1322.924072265625
loss at Epoch  16   1323.0494384765625
loss at Epoch  17   1323.136474609375
loss at Epoch  18   1323.16943359375
loss at Epoch  19   1323.226806640625
loss at Epoch  20   1323.2757568359375
loss at Epoch  21   1323.318603515625
loss at Epoch  22   1323.3900146484375
loss at Epoch  23   1323.3394775390625
loss at Epoch  24   1323.3619384765625
loss at Epoch  25   1323.3720703125
loss at Epoch  26   1323.442138671875
loss at Epoch  27   1323.4281005859375
loss at Epoch  28   1323.472900390625
loss at Epoch  29   1323.482177734375
loss at Epoch  30   1323.52734375
loss at Epoch  31   1323.533447265625
loss at Epoch  32   1323.523193359375
loss at Epoch  33   1323.531494140625
loss at Epoch  34   1323.522216796875
loss at Ep

loss at Epoch  25   1240.2442626953125
loss at Epoch  26   1240.25732421875
loss at Epoch  27   1240.264892578125
loss at Epoch  28   1240.26904296875
loss at Epoch  29   1240.322265625
loss at Epoch  30   1240.313232421875
loss at Epoch  31   1240.3134765625
loss at Epoch  32   1240.334716796875
loss at Epoch  33   1240.348388671875
loss at Epoch  34   1240.3837890625
loss at Epoch  35   1240.396240234375
loss at Epoch  36   1240.398681640625
loss at Epoch  37   1240.3909912109375
loss at Epoch  38   1240.386962890625
loss at Epoch  39   1240.397216796875
loss at Epoch  0   1122.164794921875
loss at Epoch  1   1120.7186279296875
loss at Epoch  2   1119.9471435546875
loss at Epoch  3   1119.02783203125
loss at Epoch  4   1120.007080078125
loss at Epoch  5   1120.03955078125
loss at Epoch  6   1119.4881591796875
loss at Epoch  7   1119.2994384765625
loss at Epoch  8   1120.015380859375
loss at Epoch  9   1119.956787109375
loss at Epoch  10   1120.237548828125
loss at Epoch  11   1120.75

loss at Epoch  8   1376.700927734375
loss at Epoch  9   1376.744384765625
loss at Epoch  10   1376.924560546875
loss at Epoch  11   1376.990478515625
loss at Epoch  12   1376.963623046875
loss at Epoch  13   1376.95654296875
loss at Epoch  14   1376.871337890625
loss at Epoch  15   1377.1357421875
loss at Epoch  16   1377.1171875
loss at Epoch  17   1377.139892578125
loss at Epoch  18   1377.22021484375
loss at Epoch  19   1377.3555908203125
loss at Epoch  20   1377.31689453125
loss at Epoch  21   1377.4189453125
loss at Epoch  22   1377.3798828125
loss at Epoch  23   1377.4600830078125
loss at Epoch  24   1377.448486328125
loss at Epoch  25   1377.4638671875
loss at Epoch  26   1377.4931640625
loss at Epoch  27   1377.492919921875
loss at Epoch  28   1377.552978515625
loss at Epoch  29   1377.5458984375
loss at Epoch  30   1377.5511474609375
loss at Epoch  31   1377.568359375
loss at Epoch  32   1377.56396484375
loss at Epoch  33   1377.582275390625
loss at Epoch  34   1377.5961914062

loss at Epoch  35   1321.385986328125
loss at Epoch  36   1321.39306640625
loss at Epoch  37   1321.41064453125
loss at Epoch  38   1321.39794921875
loss at Epoch  39   1321.402587890625
              precision    recall  f1-score   support

           0      0.852     0.524     0.649       966
           1      0.117     0.409     0.182       149

    accuracy                          0.509      1115
   macro avg      0.484     0.467     0.415      1115
weighted avg      0.754     0.509     0.586      1115

loss at Epoch  0   1237.36376953125
loss at Epoch  1   1237.210205078125
loss at Epoch  2   1237.517578125
loss at Epoch  3   1238.279541015625
loss at Epoch  4   1239.6951904296875
loss at Epoch  5   1239.01513671875
loss at Epoch  6   1238.542236328125
loss at Epoch  7   1238.42578125
loss at Epoch  8   1238.7469482421875
loss at Epoch  9   1239.2109375
loss at Epoch  10   1239.6737060546875
loss at Epoch  11   1239.43701171875
loss at Epoch  12   1239.5655517578125
loss at Epoch

loss at Epoch  13   1263.45703125
loss at Epoch  14   1263.4072265625
loss at Epoch  15   1263.5177001953125
loss at Epoch  16   1263.494384765625
loss at Epoch  17   1263.663818359375
loss at Epoch  18   1263.7333984375
loss at Epoch  19   1263.81396484375
loss at Epoch  20   1263.68701171875
loss at Epoch  21   1263.744140625
loss at Epoch  22   1263.768310546875
loss at Epoch  23   1263.775146484375
loss at Epoch  24   1263.7509765625
loss at Epoch  25   1263.806884765625
loss at Epoch  26   1263.822998046875
loss at Epoch  27   1263.8238525390625
loss at Epoch  28   1263.85205078125
loss at Epoch  29   1263.844482421875
loss at Epoch  30   1263.873291015625
loss at Epoch  31   1263.8917236328125
loss at Epoch  32   1263.8931884765625
loss at Epoch  33   1263.901123046875
loss at Epoch  34   1263.886962890625
loss at Epoch  35   1263.8994140625
loss at Epoch  36   1263.921630859375
loss at Epoch  37   1263.9337158203125
loss at Epoch  38   1263.9288330078125
loss at Epoch  39   1263

loss at Epoch  30   1263.670166015625
loss at Epoch  31   1263.6904296875
loss at Epoch  32   1263.690673828125
loss at Epoch  33   1263.6806640625
loss at Epoch  34   1263.70556640625
loss at Epoch  35   1263.7158203125
loss at Epoch  36   1263.708740234375
loss at Epoch  37   1263.70751953125
loss at Epoch  38   1263.71533203125
loss at Epoch  39   1263.731201171875
loss at Epoch  0   1195.9232177734375
loss at Epoch  1   1193.703857421875
loss at Epoch  2   1194.5068359375
loss at Epoch  3   1193.9451904296875
loss at Epoch  4   1193.6380615234375
loss at Epoch  5   1193.744384765625
loss at Epoch  6   1193.6885986328125
loss at Epoch  7   1194.039794921875
loss at Epoch  8   1194.0751953125
loss at Epoch  9   1194.1494140625
loss at Epoch  10   1194.192138671875
loss at Epoch  11   1194.0772705078125
loss at Epoch  12   1194.27783203125
loss at Epoch  13   1194.440673828125
loss at Epoch  14   1194.364501953125
loss at Epoch  15   1194.530029296875
loss at Epoch  16   1194.52709960

loss at Epoch  6   1528.5433349609375
loss at Epoch  7   1528.6744384765625
loss at Epoch  8   1528.682861328125
loss at Epoch  9   1529.043212890625
loss at Epoch  10   1529.15576171875
loss at Epoch  11   1528.964599609375
loss at Epoch  12   1529.159912109375
loss at Epoch  13   1529.2017822265625
loss at Epoch  14   1529.29150390625
loss at Epoch  15   1529.214599609375
loss at Epoch  16   1529.297119140625
loss at Epoch  17   1529.25244140625
loss at Epoch  18   1529.3323974609375
loss at Epoch  19   1529.4063720703125
loss at Epoch  20   1529.5218505859375
loss at Epoch  21   1529.5440673828125
loss at Epoch  22   1529.4239501953125
loss at Epoch  23   1529.50830078125
loss at Epoch  24   1529.5469970703125
loss at Epoch  25   1529.622314453125
loss at Epoch  26   1529.54833984375
loss at Epoch  27   1529.6011962890625
loss at Epoch  28   1529.636962890625
loss at Epoch  29   1529.67724609375
loss at Epoch  30   1529.6546630859375
loss at Epoch  31   1529.630615234375
loss at Epo

loss at Epoch  21   1221.966796875
loss at Epoch  22   1222.0390625
loss at Epoch  23   1222.0487060546875
loss at Epoch  24   1222.1219482421875
loss at Epoch  25   1222.1180419921875
loss at Epoch  26   1222.1593017578125
loss at Epoch  27   1222.170166015625
loss at Epoch  28   1222.177978515625
loss at Epoch  29   1222.294677734375
loss at Epoch  30   1222.277587890625
loss at Epoch  31   1222.227294921875
loss at Epoch  32   1222.250244140625
loss at Epoch  33   1222.249267578125
loss at Epoch  34   1222.2548828125
loss at Epoch  35   1222.2724609375
loss at Epoch  36   1222.2762451171875
loss at Epoch  37   1222.287841796875
loss at Epoch  38   1222.2900390625
loss at Epoch  39   1222.290283203125
loss at Epoch  0   1121.92724609375
loss at Epoch  1   1120.1265869140625
loss at Epoch  2   1120.749267578125
loss at Epoch  3   1119.953125
loss at Epoch  4   1120.1558837890625
loss at Epoch  5   1119.928955078125
loss at Epoch  6   1120.4423828125
loss at Epoch  7   1119.95861816406

loss at Epoch  36   1214.07470703125
loss at Epoch  37   1214.080810546875
loss at Epoch  38   1214.0849609375
loss at Epoch  39   1214.0906982421875
loss at Epoch  0   1211.79150390625
loss at Epoch  1   1209.495361328125
loss at Epoch  2   1210.7392578125
loss at Epoch  3   1209.020263671875
loss at Epoch  4   1209.685546875
loss at Epoch  5   1210.3973388671875
loss at Epoch  6   1210.346923828125
loss at Epoch  7   1210.1676025390625
loss at Epoch  8   1210.1181640625
loss at Epoch  9   1210.21484375
loss at Epoch  10   1210.258544921875
loss at Epoch  11   1210.0498046875
loss at Epoch  12   1210.1649169921875
loss at Epoch  13   1210.152099609375
loss at Epoch  14   1210.32177734375
loss at Epoch  15   1210.3763427734375
loss at Epoch  16   1210.3892822265625
loss at Epoch  17   1210.566162109375
loss at Epoch  18   1210.499755859375
loss at Epoch  19   1210.50634765625
loss at Epoch  20   1210.5673828125
loss at Epoch  21   1210.6705322265625
loss at Epoch  22   1210.7412109375


loss at Epoch  9   1343.1851806640625
loss at Epoch  10   1343.1494140625
loss at Epoch  11   1343.27734375
loss at Epoch  12   1343.36328125
loss at Epoch  13   1343.49658203125
loss at Epoch  14   1343.598388671875
loss at Epoch  15   1343.6280517578125
loss at Epoch  16   1343.68701171875
loss at Epoch  17   1343.576171875
loss at Epoch  18   1343.74072265625
loss at Epoch  19   1343.703857421875
loss at Epoch  20   1343.7745361328125
loss at Epoch  21   1343.8009033203125
loss at Epoch  22   1343.801025390625
loss at Epoch  23   1343.819580078125
loss at Epoch  24   1343.8184814453125
loss at Epoch  25   1343.850341796875
loss at Epoch  26   1343.878173828125
loss at Epoch  27   1343.8720703125
loss at Epoch  28   1343.873779296875
loss at Epoch  29   1343.896728515625
loss at Epoch  30   1343.9058837890625
loss at Epoch  31   1343.914794921875
loss at Epoch  32   1343.9091796875
loss at Epoch  33   1343.939453125
loss at Epoch  34   1343.956298828125
loss at Epoch  35   1343.94531

loss at Epoch  29   1321.398193359375
loss at Epoch  30   1321.44482421875
loss at Epoch  31   1321.44677734375
loss at Epoch  32   1321.4410400390625
loss at Epoch  33   1321.4464111328125
loss at Epoch  34   1321.473388671875
loss at Epoch  35   1321.47412109375
loss at Epoch  36   1321.475830078125
loss at Epoch  37   1321.473876953125
loss at Epoch  38   1321.4761962890625
loss at Epoch  39   1321.482177734375
loss at Epoch  0   1344.953125
loss at Epoch  1   1343.60498046875
loss at Epoch  2   1342.9952392578125
loss at Epoch  3   1342.1107177734375
loss at Epoch  4   1343.5574951171875
loss at Epoch  5   1342.6488037109375
loss at Epoch  6   1343.0526123046875
loss at Epoch  7   1342.7884521484375
loss at Epoch  8   1343.21240234375
loss at Epoch  9   1343.165283203125
loss at Epoch  10   1343.21875
loss at Epoch  11   1343.40869140625
loss at Epoch  12   1343.6839599609375
loss at Epoch  13   1343.60498046875
loss at Epoch  14   1343.705322265625
loss at Epoch  15   1343.6657714

loss at Epoch  1   1309.031494140625
loss at Epoch  2   1307.8341064453125
loss at Epoch  3   1307.2545166015625
loss at Epoch  4   1307.508544921875
loss at Epoch  5   1308.569091796875
loss at Epoch  6   1308.060546875
loss at Epoch  7   1307.825927734375
loss at Epoch  8   1307.8369140625
loss at Epoch  9   1307.594482421875
loss at Epoch  10   1307.743896484375
loss at Epoch  11   1307.749267578125
loss at Epoch  12   1307.662353515625
loss at Epoch  13   1308.0562744140625
loss at Epoch  14   1307.81298828125
loss at Epoch  15   1308.013671875
loss at Epoch  16   1308.013916015625
loss at Epoch  17   1308.071044921875
loss at Epoch  18   1308.1865234375
loss at Epoch  19   1308.13671875
loss at Epoch  20   1308.1595458984375
loss at Epoch  21   1308.206787109375
loss at Epoch  22   1308.143798828125
loss at Epoch  23   1308.239013671875
loss at Epoch  24   1308.2933349609375
loss at Epoch  25   1308.313720703125
loss at Epoch  26   1308.3232421875
loss at Epoch  27   1308.31909179

loss at Epoch  15   1211.45556640625
loss at Epoch  16   1211.6282958984375
loss at Epoch  17   1211.697509765625
loss at Epoch  18   1211.63037109375
loss at Epoch  19   1211.734619140625
loss at Epoch  20   1211.79736328125
loss at Epoch  21   1211.875244140625
loss at Epoch  22   1211.845947265625
loss at Epoch  23   1211.87548828125
loss at Epoch  24   1211.877685546875
loss at Epoch  25   1211.8681640625
loss at Epoch  26   1211.859375
loss at Epoch  27   1211.878173828125
loss at Epoch  28   1211.8941650390625
loss at Epoch  29   1211.9095458984375
loss at Epoch  30   1211.9114990234375
loss at Epoch  31   1211.953125
loss at Epoch  32   1212.006591796875
loss at Epoch  33   1211.981689453125
loss at Epoch  34   1211.9716796875
loss at Epoch  35   1212.0107421875
loss at Epoch  36   1212.0040283203125
loss at Epoch  37   1211.9957275390625
loss at Epoch  38   1211.9993896484375
loss at Epoch  39   1212.0068359375
loss at Epoch  0   1242.062255859375
loss at Epoch  1   1242.268188

loss at Epoch  39   1323.360595703125
              precision    recall  f1-score   support

           0      0.852     0.525     0.650       966
           1      0.117     0.409     0.182       149

    accuracy                          0.509      1115
   macro avg      0.485     0.467     0.416      1115
weighted avg      0.754     0.509     0.587      1115



In [73]:
H.shape

torch.Size([100, 5572])

In [69]:
import random
help(random.sample)

Help on method sample in module random:

sample(population, k) method of random.Random instance
    Chooses k unique random elements from a population sequence or set.
    
    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).
    
    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.
    
    To choose a sample in a range of integers, use range as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(range(10000000), 60)



coherence

In [70]:
dic0 = top_keywords(W, features, num=20)


## compute the coherence score for each topic
coherence_vec = []
for i in range(W.shape[1]):  
    coherence_vec.append(coherence(dic0[i], model_glove))

np.mean(coherence_vec)   ## the mean coherence score of all topics


NameError: name 'model_glove' is not defined

# SVM classifier

In [74]:

    indices = list(range(len(labels)))   ## indices of documents
    
    ## split data into train and test
    ind_train, ind_test, y_train, y_test = train_test_split(
        indices, labels, test_size=0.2, random_state=2021, stratify=labels)
    H_new = H.detach().numpy()
    x_train, x_test = H_new[:, ind_train],H_new[:, ind_test]
    
    ## encode labels to integers
    Encoder = LabelEncoder()
    Y_train = Encoder.fit_transform(y_train)
    Y_test = Encoder.fit_transform(y_test)


    # Classifier - Algorithm - SVM -- linear kernel
    # fit the training dataset on the classifier
    SVM = svm.SVC(C=1., kernel='linear', degree=3, gamma='auto', random_state=82, class_weight='balanced')
    SVM.fit(x_train.T, Y_train)# predict the labels on validation dataset
    predictions_SVM = SVM.predict(x_test.T) # make predictions
    print(classification_report(Y_test, predictions_SVM, digits=3))

    

              precision    recall  f1-score   support

           0      0.852     0.525     0.650       966
           1      0.117     0.409     0.182       149

    accuracy                          0.509      1115
   macro avg      0.485     0.467     0.416      1115
weighted avg      0.754     0.509     0.587      1115



In [56]:
from numpy.random import dirichlet
k=10
client_dis=dirichlet([1,1],k)
spam=[]
ham=[]
num=500
for i in range(len(labels)):
    if labels[i]=='ham':
        ham.append(i)
    elif labels[i]=='spam':
        spam.append(i)
client_sample=[]
for i in client_dis: 
    spam_sample=np.random.choice(spam, size=int(i[0]*num),replace=False)
    ham_sample=np.random.choice(ham, size=num-int(i[0]*num))
    client_sample.append(np.concatenate((spam_sample,ham_sample)))
n=torch.from_numpy(np.array(client_sample))
B=torch.chunk(n,k)
(len(B),B[0].shape)
    
     

(10, torch.Size([1, 500]))

In [197]:
help(np.random.choice)

Help on built-in function choice:

choice(...) method of numpy.random.mtrand.RandomState instance
    choice(a, size=None, replace=True, p=None)
    
    Generates a random sample from a given 1-D array
    
    .. versionadded:: 1.7.0
    
    .. note::
        New code should use the ``choice`` method of a ``default_rng()``
        instance instead; please see the :ref:`random-quick-start`.
    
    Parameters
    ----------
    a : 1-D array-like or int
        If an ndarray, a random sample is generated from its elements.
        If an int, the random sample is generated as if a were np.arange(a)
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    replace : boolean, optional
        Whether the sample is with or without replacement
    p : 1-D array-like, optional
        The probabilities associated with each

In [223]:
a=np.array([[ 0, 1, 2, 0, 2, 4],
        [ 3, 4, 5, 6, 8, 10],
        [ 3, 4, 5, 6, 8, 10], 
        [ 6, 7, 8, 12, 14, 16]])
n=torch.from_numpy(a)

In [225]:
n=torch.from_numpy(a)
torch.chunk(n,2)

(tensor([[ 0,  1,  2,  0,  2,  4],
         [ 3,  4,  5,  6,  8, 10]]),
 tensor([[ 3,  4,  5,  6,  8, 10],
         [ 6,  7,  8, 12, 14, 16]]))

In [46]:
client_dis=dirichlet([1,1], 100)

NameError: name 'dirichlet' is not defined