In [40]:
from nltk.corpus import reuters 
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
documents = reuters.fileids()
 
train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                            documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"),
                           documents))
# train_docs_id = train_docs_id[:1000]
# test_docs_id = test_docs_id[:1000]
 
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
# train_docs = train_docs[:1000]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
# test_docs = test_docs[:1000]

In [42]:
print(len(train_docs))

print(len(test_docs))

7769
3019


In [43]:
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id)
                                  for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id)
                             for doc_id in test_docs_id])
vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

In [58]:
print(mlb.classes_)
print(train_labels[0])
print(reuters.categories(train_docs_id[0]))
#print(train_labels[23])
classNameIndexMap = {}
for index,className in enumerate(mlb.classes_):
    classNameIndexMap[className] = index
print(classNameIndexMap)

['acq' 'alum' 'barley' 'bop' 'carcass' 'castor-oil' 'cocoa' 'coconut'
 'coconut-oil' 'coffee' 'copper' 'copra-cake' 'corn' 'cotton' 'cotton-oil'
 'cpi' 'cpu' 'crude' 'dfl' 'dlr' 'dmk' 'earn' 'fuel' 'gas' 'gnp' 'gold'
 'grain' 'groundnut' 'groundnut-oil' 'heat' 'hog' 'housing' 'income'
 'instal-debt' 'interest' 'ipi' 'iron-steel' 'jet' 'jobs' 'l-cattle'
 'lead' 'lei' 'lin-oil' 'livestock' 'lumber' 'meal-feed' 'money-fx'
 'money-supply' 'naphtha' 'nat-gas' 'nickel' 'nkr' 'nzdlr' 'oat' 'oilseed'
 'orange' 'palladium' 'palm-oil' 'palmkernel' 'pet-chem' 'platinum'
 'potato' 'propane' 'rand' 'rape-oil' 'rapeseed' 'reserves' 'retail'
 'rice' 'rubber' 'rye' 'ship' 'silver' 'sorghum' 'soy-meal' 'soy-oil'
 'soybean' 'strategic-metal' 'sugar' 'sun-meal' 'sun-oil' 'sunseed' 'tea'
 'tin' 'trade' 'veg-oil' 'wheat' 'wpi' 'yen' 'zinc']
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0

In [38]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [39]:
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(vectorised_train_documents, train_labels)

# predict
predictions = classifier.predict(vectorised_test_documents)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels,predictions)

In [None]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
classifier = LabelPowerset(GaussianNB())

# train
classifier.fit(vectorised_train_documents, train_labels)

# predict
predictions = classifier.predict(vectorised_test_documents)

accuracy_score(test_labels,predictions)

In [None]:
categories = reuters.categories()
print(len(categories))

In [None]:
# Test
t = reuters.fileids('earn')
f = list(filter(lambda doc: doc.startswith("train"),
                            t))
print(len(f))

In [78]:
import Utility
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.sparse import vstack
labelDocsMap = Utility.get_label_file_mapping()
sortedLabelCounts = Utility.get_sorted_categories_with_counts()

In [79]:
def semiSupFit(X_l, y_l, X_u):
    n_ul_docs = X_u.shape[0] # number of unlabeled samples
    n_l_docs = X_l.shape[0] # number of labeled samples
    alpha = 1e-2
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_l, y_l) # use labeled data only to initialize classifier parameters

    theta_wt_cj = clf.feature_log_prob_ # log CP of word given class [n_classes, n_words]
    theta_cj = clf.class_log_prior_
    prev_theta_wt_cj = np.zeros(theta_wt_cj.shape) - np.inf # log CP of word given class [n_classes, n_words]
    prev_theta_cj = np.zeros(theta_cj.shape) - np.inf
    diff_wt_cj = np.sum(theta_wt_cj-prev_theta_wt_cj)
    diff_cj = np.sum(theta_cj-prev_theta_cj)
    totalDiff = diff_wt_cj+diff_cj
    print("diff_wt_cj:",diff_wt_cj,"diff_cj:",diff_cj,'totalDiff:',totalDiff)

    iter_count = 0 # count EM iteration
    max_iter=30
    while (totalDiff!=0 and iter_count<max_iter):
        iter_count += 1
        print("EM iteration #%d" % iter_count) # debug
        # E-step: Estimate class membership of unlabeled documents
        y_u = clf.predict(X_u)
        # M-step: Re-estimate classifier parameters
        X = vstack([X_l, X_u])
        y = np.concatenate((y_l, y_u), axis=0)
        clf.fit(X, y)
        
        theta_wt_cj = clf.feature_log_prob_ # log CP of word given class [n_classes, n_words]
        theta_cj = clf.class_log_prior_

        diff_wt_cj = np.sum(theta_wt_cj-prev_theta_wt_cj)
        diff_cj = np.sum(theta_cj-prev_theta_cj)
        totalDiff = diff_wt_cj+diff_cj
        print("diff_wt_cj:",diff_wt_cj,"diff_cj:",diff_cj,'totalDiff:',totalDiff)
        prev_theta_wt_cj = theta_wt_cj
        prev_theta_cj = theta_cj

    return clf

In [85]:
def get40otherClassDocuments(currentClassName, sortedLabelCountsList, allSelectedDocuments, labelDocsMap):
    addedDocs = 0
    classIndex = 0
    dataSet = []
    while classIndex < len(sortedLabelCountsList) and addedDocs<40:
        classInfo = sortedLabelCountsList[classIndex]
        className = classInfo[0]
        if className == currentClassName:
            classIndex += 1
            continue
        classDocuments = labelDocsMap[className]
        doc_index = 0
        while doc_index<len(classDocuments):
            doc_id = classDocuments[doc_index]
            if doc_id not in allSelectedDocuments:
                allSelectedDocuments.add(doc_id)
                dataSet.append(doc_id)
                addedDocs += 1
                break
            doc_index += 1
        classIndex += 1
    return dataSet,allSelectedDocuments

def get10currentClassDocuments(currentClassName, allSelectedDocuments, labelDocsMap):
    dataSet = []
    classDocuments = labelDocsMap[currentClassName]
    doc_index = 0
    addedDocs = 0
    while addedDocs<10:
        doc_id = classDocuments[doc_index]
        if doc_id not in allSelectedDocuments:
            allSelectedDocuments.add(doc_id)
            dataSet.append(doc_id)
            addedDocs += 1
        doc_index += 1
    return dataSet,allSelectedDocuments

def getAllOtherDocuments(currentSetDocuments, train_docs_id):
    currentSetDocumentsSet = set(currentSetDocuments)
    allOtherDocuments = []
    for doc_id in train_docs_id:
        if doc_id not in currentSetDocumentsSet:
            allOtherDocuments.append(doc_id)
    return allOtherDocuments

sortedLabelCountsList = []
for className in sortedLabelCounts:
    sortedLabelCountsList.append([className, sortedLabelCounts[className]])
print(sortedLabelCountsList)
#for classIndex, classInfo in enumerate(sortedLabelCountsList):
classNames = []
NBaccuracy = []
EMaccuracy = []
classIndex = 0
while classIndex<10:
    classInfo = sortedLabelCountsList[classIndex]
    className = classInfo[0]
    #print(item)
    allSelectedDocuments = set()
    dataSet10,allSelectedDocuments = get10currentClassDocuments(className, allSelectedDocuments, labelDocsMap)
    dataSet40,allSelectedDocuments = get40otherClassDocuments(className, sortedLabelCountsList, allSelectedDocuments, labelDocsMap)
    dataSet50 = dataSet10+dataSet40
    dataSet50_docs = [reuters.raw(doc_id) for doc_id in dataSet50]
    vectorizer = TfidfVectorizer()
    vectorised_train_documents = vectorizer.fit_transform(dataSet50_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)
    #Y10 = [1]*10
    #y40 = [0]*40
    X = vectorised_train_documents
    Y = [1]*10 + [0]*40
    cv_clf = MultinomialNB(alpha=1e-2)
    cv_clf.fit(X, Y)
    predicted = cv_clf.predict(vectorised_test_documents)
    #print('predicted:',predicted)
    columnIndex = classNameIndexMap[className]
    test_data_target = test_labels[:,columnIndex]
    nbaccuracy = accuracy_score(test_data_target, predicted) * 100
    print('className:',className,'accuracy:',nbaccuracy)
    
    
    allOtherDocuments = getAllOtherDocuments(dataSet50_docs, train_docs_id)
    allOtherDocuments_docs = [reuters.raw(doc_id) for doc_id in allOtherDocuments]
    X_u = vectorizer.transform(allOtherDocuments_docs)
    
    print('Xshape:',X.shape,'X_u shape:',X_u.shape)
    
    cv_clf = semiSupFit(X, Y, X_u)
    predicted = cv_clf.predict(vectorised_test_documents)
    #print('predicted:',predicted)
    #columnIndex = classNameIndexMap[className]
    #test_data_target = test_labels[:,columnIndex]
    emaccuracy = accuracy_score(test_data_target, predicted) * 100
    print('className:',className,'accuracy:',emaccuracy)
    
    NBaccuracy.append(nbaccuracy)
    EMaccuracy.append(emaccuracy)
    classNames.append(className)
    classIndex += 1
    #classifier = BinaryRelevance(GaussianNB())
    #classifier.fit(vectorised_train_documents, train_labels)
    
    #print('dataSet50',dataSet50)
    #print('dataSet40',dataSet40)
    #print('dataSet50',len(dataSet50))
    #print('dataSet50',len(dataSet50))
    #break
    #exit()
#'''
#firstLabel = sortedLabelCounts.get(0)#[0]
#print('firstLabel:',firstLabel)

[['earn', 2877], ['acq', 1650], ['money-fx', 538], ['grain', 433], ['crude', 389], ['trade', 368], ['interest', 347], ['wheat', 212], ['ship', 197], ['corn', 181], ['money-supply', 140], ['dlr', 131], ['sugar', 126], ['oilseed', 124], ['coffee', 111], ['gnp', 101], ['gold', 94], ['veg-oil', 87], ['soybean', 78], ['bop', 75], ['livestock', 75], ['nat-gas', 75], ['cpi', 69], ['cocoa', 55], ['reserves', 55], ['carcass', 50], ['copper', 47], ['jobs', 46], ['yen', 45], ['ipi', 41], ['iron-steel', 40], ['cotton', 39], ['barley', 37], ['gas', 37], ['rubber', 37], ['alum', 35], ['rice', 35], ['meal-feed', 30], ['palm-oil', 30], ['sorghum', 24], ['retail', 23], ['silver', 21], ['zinc', 21], ['pet-chem', 20], ['wpi', 19], ['rapeseed', 18], ['tin', 18], ['hog', 16], ['housing', 16], ['orange', 16], ['strategic-metal', 16], ['lead', 15], ['heat', 14], ['soy-oil', 14], ['fuel', 13], ['soy-meal', 13], ['lei', 12], ['sunseed', 11], ['dmk', 10], ['lumber', 10], ['income', 9], ['tea', 9], ['nickel', 8]

className: grain accuracy: 93.9715137462736
Xshape: (50, 2174) X_u shape: (7769, 2174)
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #1
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #2
diff_wt_cj: 229.0862254094389 diff_cj: -0.8812586108941405 totalDiff: 228.20496679854475
EM iteration #3
diff_wt_cj: -64.10516267589085 diff_cj: -0.22820001940333157 totalDiff: -64.33336269529418
EM iteration #4
diff_wt_cj: -19.435446530697067 diff_cj: -0.04204181505447657 totalDiff: -19.477488345751546
EM iteration #5
diff_wt_cj: 3.349388278602402 diff_cj: -0.03276403366901626 totalDiff: 3.3166242449333856
EM iteration #6
diff_wt_cj: -107.87202525465742 diff_cj: 0.03276403366901626 totalDiff: -107.8392612209884
EM iteration #7
diff_wt_cj: -127.017928543718 diff_cj: 0.07242494817078704 totalDiff: -126.94550359554721
EM iteration #8
diff_wt_cj: -40.91427529082322 diff_cj: 0.02946428658846223 totalDiff: -40.884811004234756
EM iteration #9
diff_wt_cj: -10.617036559593501 diff_cj: 0.009

className: wheat accuracy: 96.588274263001
Xshape: (50, 2254) X_u shape: (7769, 2254)
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #1
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #2
diff_wt_cj: -168.20379782454495 diff_cj: -0.21574296249603808 totalDiff: -168.419540787041
EM iteration #3
diff_wt_cj: -35.72649403313904 diff_cj: 0.0 totalDiff: -35.72649403313904
EM iteration #4
diff_wt_cj: -13.09059603304676 diff_cj: 0.0 totalDiff: -13.09059603304676
EM iteration #5
diff_wt_cj: 0.4632196609808088 diff_cj: 0.0 totalDiff: 0.4632196609808088
EM iteration #6
diff_wt_cj: -4.813181477288108 diff_cj: -0.004342623492617825 totalDiff: -4.817524100780726
EM iteration #7
diff_wt_cj: 0.0 diff_cj: 0.0 totalDiff: 0.0
className: wheat accuracy: 97.41636303411725
className: ship accuracy: 96.9195097714475
Xshape: (50, 2298) X_u shape: (7769, 2298)
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #1
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #2
diff_wt_cj: 23

In [87]:
print(classNames)
print(NBaccuracy)
print(EMaccuracy)
#dataSet50_docs = [reuters.raw(doc_id) for doc_id in dataSet50]

['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'wheat', 'ship', 'corn']
[95.13083802583637, 90.99039417025506, 96.29016230539914, 93.9715137462736, 88.04239814508115, 91.98410069559458, 95.85955614441868, 96.588274263001, 96.9195097714475, 96.45578005962238]
[95.26333222921497, 96.588274263001, 94.00463729711825, 95.82643259357403, 95.66081483935078, 63.431599867505795, 94.86584961907917, 97.41636303411725, 97.4494865849619, 97.35011593242795]


In [86]:
#print(dataSet50_docs)

In [31]:
#print([1]*10 + [0]*40)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
