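## Non-negative matrix factorization (NMF) based topic modeling
This notebook applies NMF topic modeling to count and TF-IDF features, scores the resulting topics with a GloVe-based coherence measure, and trains an SVM classifier on the NMF factors.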

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# Gensim
import gensim
import gensim.corpora as corpora
#from gensim.utils import simple_preprocess
#from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
### load NMF utility functions
from nmf_util import *
### load coherence score
import gensim.downloader as api
from coherence_score import *

### Load data from JSON

In [3]:
### Load the cleaned dataset (a JSON list of records) into memory.
## NOTE(review): the original comment here named 20news-18828.json, and the saved
## outputs further down (18828 documents) look like that dataset rather than
## this file -- confirm which dataset is intended.
## The encoding is passed explicitly so decoding does not depend on the
## platform's default locale encoding.
with open('../cleaned_data/yahoo_answers_csv.json', 'r', encoding='utf-8') as jf:
    cleaned_data = json.load(jf)

In [10]:
## Keep the last 60000 records as a smaller sample and write it to disk.
yahoo_sample = cleaned_data[-60000:]
with open('../cleaned_data/yahoo_sample.json', mode='w') as sample_file:
    json.dump(yahoo_sample, sample_file)

In [5]:
### Pull the text ('sentence') and the class ('label') out of every record,
### keeping both lists in the same order as cleaned_data.
sentences = [record['sentence'] for record in cleaned_data]
labels = [record['label'] for record in cleaned_data]


In [7]:
## distinct label values present in the dataset
set(labels)

{'1', '10', '2', '3', '4', '5', '6', '7', '8', '9'}

### Load pre-trained GloVe embeddings

In [8]:
## Load the "glove-twitter-100" vectors through gensim's downloader API.
## model_glove is passed to coherence() further down to score each topic.
model_glove = api.load("glove-twitter-100")   ## load pretrained glove embeddings

### Use Count Vectors as features

In [4]:
## Build bag-of-words count features for the corpus.
count = CountVectorizer(
    max_df=.95,
    min_df=10,
    max_features=5000,
)
x_count = count.fit_transform(sentences)
## Densify and transpose: rows are features, columns are documents.
count_mat = x_count.toarray().transpose()

In [5]:
## Vocabulary terms of the count vectorizer.
## get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
## so prefer get_feature_names_out() and fall back only on older releases.
## The array is converted to a plain list so `features` keeps the type the
## old call returned.
if hasattr(count, 'get_feature_names_out'):
    features = count.get_feature_names_out().tolist()
else:
    features = count.get_feature_names()

In [6]:
## (number of features, number of documents) -- count_mat is the transposed dense count matrix
count_mat.shape

(5000, 18828)

In [7]:
## Factorise the count matrix with NMF.
## gaussian_method returns the factor matrices W and H plus the root mean squared error.
## NOTE(review): no random seed is set in this notebook; if gaussian_method
## initialises its factors randomly the topics will vary between runs -- confirm.
k, max_iter = 10, 10     ## number of topics (tune for better results), maximum number of iterations
W, H, err = gaussian_method(count_mat, k, max_iter)
#res1=poisson_method(count_mat.T, k, max_iter=5)
#res2=exponential_method(count_mat.T, k, max_iter=5)

In [9]:
## shape of the factor matrix W returned by gaussian_method above
W.shape

(5000, 10)

In [10]:
### Extract the top keywords of each topic from W (20 keywords per topic).
### The result is indexed by topic number in later cells (dic0[i]).
dic0 = top_keywords(W, features, num=20)

In [11]:
## show the keyword list of every topic
dic0

{0: ['maxaxaxaxaxaxaxaxaxaxaxaxaxaxax',
  '14',
  'part',
  'end',
  '12',
  '11',
  '13',
  'keywords',
  '10',
  'ha',
  'one',
  'make',
  'key',
  'article',
  'use',
  'writes',
  'may',
  'wire',
  're',
  'would'],
 1: ['wa',
  'one',
  'people',
  'would',
  'said',
  'say',
  'know',
  'like',
  'dont',
  'time',
  'armenian',
  'even',
  'ha',
  'didnt',
  'could',
  'see',
  'go',
  'get',
  'well',
  'way'],
 2: ['system',
  'image',
  'file',
  'available',
  'use',
  'window',
  'program',
  'also',
  'software',
  'version',
  'data',
  'ftp',
  'server',
  'get',
  'user',
  'graphic',
  'application',
  'display',
  'support',
  'format'],
 3: ['db',
  'byte',
  'bit',
  'push',
  'one',
  'pop',
  'si',
  'inc',
  'al',
  'loop',
  'offset',
  'call',
  'higher',
  'lower',
  'particle',
  'gas',
  'data',
  'west',
  'east',
  'left'],
 4: ['jpeg',
  'image',
  'file',
  'gif',
  'format',
  'color',
  'quality',
  'viewer',
  'free',
  'version',
  'see',
  'display

### Evaluate the coherence score of each topic

In [13]:
## Score every count-vector topic: one coherence value per column of W,
## computed from that topic's keyword list and the GloVe embeddings.
coherence_vec = [coherence(dic0[topic], model_glove) for topic in range(W.shape[1])]

In [14]:
## average the per-topic scores into a single number for the count-vector model
np.mean(coherence_vec)   ## the mean coherence score of all topics

0.55294484

### Use TFIDF vectors as features

In [15]:
## Build TF-IDF features for the corpus.
tfidf_vect = TfidfVectorizer(
    max_df=.95,
    min_df=10,
    max_features=5000,
)
x_tfidf = tfidf_vect.fit_transform(sentences)
## Convert to a dense matrix and transpose: rows are features, columns are documents.
tfidf_mat = x_tfidf.toarray().transpose()
#print(tfidf_vect.get_feature_names())

In [16]:
## Factorise the TF-IDF matrix with NMF.
k = 10          ## number of topics
max_iter = 10   ## maximum number of iterations
## err is rebound here, replacing the error value from the count-vector run.
W_tfidf, H_tfidf, err = gaussian_method(tfidf_mat, k, max_iter)
#tfidf_res1=poisson_method(tfidf_mat, k, max_iter=5)
#tfidf_res2=exponential_method(tfidf_mat, k, max_iter=5)

In [17]:
## Vocabulary terms of the TF-IDF vectorizer.
## get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
## so prefer get_feature_names_out() and fall back only on older releases.
## The array is converted to a plain list so the type matches the old call.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    features_tfidf = tfidf_vect.get_feature_names_out().tolist()
else:
    features_tfidf = tfidf_vect.get_feature_names()
## Shape of the TF-IDF factor matrix; kept as the last expression so the
## notebook actually displays it (it was silently discarded before).
W_tfidf.shape


In [18]:
## top 20 keywords per topic for the TF-IDF factorisation; indexed by topic number below (dic_tfidf[i])
dic_tfidf = top_keywords(W_tfidf, features_tfidf, num=20)

In [19]:
## compute the coherence score for each TF-IDF topic
## The topic count is taken from W_tfidf (the matrix dic_tfidf was built from),
## not from W of the count-vector run, so the loop stays correct if the two
## factorisations ever use a different number of topics.
coherence_vec = []
for i in range(W_tfidf.shape[1]):
    coherence_vec.append(coherence(dic_tfidf[i], model_glove))

In [20]:
## average the per-topic scores into a single number for the TF-IDF model
np.mean(coherence_vec)  ## the mean coherence score of all topics

0.47842345

### SVM classifier:

In [39]:
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import classification_report

In [30]:
## One integer position per document, in the same order as `labels`.
indices = [position for position, _ in enumerate(labels)]

In [31]:
## Stratified 80/20 split. Document indices are split instead of the features
## themselves so the same partition can be used to select columns afterwards.
ind_train, ind_test, y_train, y_test = train_test_split(
    indices,
    labels,
    test_size=0.2,
    random_state=2021,
    stratify=labels,
)

In [36]:
## Train/test features: pick the columns of H (from the count-vector
## factorisation) that belong to each split.
x_train = H[:, ind_train]
x_test = H[:, ind_test]

In [38]:
## sanity check: the number of columns in each feature matrix should equal the number of labels in that split
x_train.shape,x_test.shape,len(y_train), len(y_test)

((10, 15062), (10, 3766), 15062, 3766)

In [41]:
## Encode labels as integers.
## The encoder is fitted on the training labels only; the test labels are then
## mapped with the same fitted encoder. Re-fitting on the test set (as before)
## could assign different integers to the same class whenever the two splits
## do not contain exactly the same set of labels.
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(y_train)
Y_test = Encoder.transform(y_test)

In [42]:
## integer-encoded training labels
Y_train

array([ 4, 13, 11, ...,  9,  0,  1], dtype=int64)

In [48]:
# Classifier - SVM with a linear kernel.
# The former degree=3 / gamma='auto' arguments were dropped: scikit-learn
# ignores both for kernel='linear', so they only suggested tuning that never
# took effect.
SVM = svm.SVC(C=1., kernel='linear', random_state=82)#, class_weight='balanced')
# Fit on the training split; x_train is (topics, documents), so it is
# transposed to put one document per row.
SVM.fit(x_train.T, Y_train)
# Predict the labels of the held-out test split.
predictions_SVM = SVM.predict(x_test.T)
# zero_division=0 reports 0 for classes that are never predicted -- the same
# numbers as the default -- without emitting the undefined-metric warnings.
print(classification_report(Y_test, predictions_SVM, digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.222     0.013     0.024       160
           1      0.733     0.056     0.105       195
           2      0.337     0.178     0.233       197
           3      0.247     0.184     0.211       196
           4      0.000     0.000     0.000       192
           5      0.444     0.061     0.108       196
           6      0.102     0.840     0.182       194
           7      0.080     0.056     0.065       198
           8      0.103     0.543     0.173       199
           9      0.243     0.085     0.126       199
          10      0.667     0.060     0.110       200
          11      0.267     0.061     0.099       198
          12      0.083     0.005     0.010       196
          13      0.048     0.010     0.017       198
          14      0.339     0.193     0.246       197
          15      0.536     0.590     0.562       200
          16      0.360     0.170     0.231       182
          17      0.302    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
### Evaluate the fit on the training split, for comparison with the test report.
predictions_train = SVM.predict(x_train.T) # predict training examples
# zero_division=0 reports 0 for classes that are never predicted -- the same
# numbers as the default -- without emitting the undefined-metric warnings.
print(classification_report(Y_train, predictions_train, digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.182     0.006     0.012       639
           1      0.719     0.059     0.109       778
           2      0.362     0.221     0.274       788
           3      0.196     0.155     0.173       786
           4      0.000     0.000     0.000       769
           5      0.412     0.060     0.105       784
           6      0.099     0.815     0.176       778
           7      0.088     0.052     0.065       792
           8      0.094     0.491     0.158       795
           9      0.185     0.060     0.091       795
          10      0.505     0.063     0.111       799
          11      0.250     0.061     0.097       793
          12      0.220     0.017     0.031       785
          13      0.093     0.019     0.031       792
          14      0.353     0.180     0.238       790
          15      0.516     0.555     0.535       797
          16      0.358     0.165     0.226       728
          17      0.383    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
