In [70]:
import sklearn.datasets as skd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

In [71]:
train_data = skd.load_files('dataset/20Newsgroups/20news-bydate/20news-bydate-train', encoding='ISO-8859-1')

test_data = skd.load_files('dataset/20Newsgroups/20news-bydate/20news-bydate-test', encoding='ISO-8859-1')

In [72]:
# count vectorization
count_vk = CountVectorizer()
X_train_tf = count_vk.fit_transform(train_data.data)
X_test_tf = count_vk.transform(test_data.data)

In [73]:
tf = TfidfTransformer()
X_train_tfidf = tf.fit_transform(X_train_tf)
print('Shape of X_train_tfidf', X_train_tfidf.shape)

Shape of X_train_tfidf (11314, 130107)


In [84]:
split_ratio = 0.1 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(X_train_tfidf, train_data.target, train_size=split_ratio, stratify=train_data.target)
print(X_l.shape, X_u.shape)

(1131, 130107) (10183, 130107)




In [85]:
def semiSupFit(X_l, y_l, X_u):
    n_ul_docs = X_u.shape[0] # number of unlabeled samples
    n_l_docs = X_l.shape[0] # number of labeled samples
    alpha = 1e-2
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_l, y_l) # use labeled data only to initialize classifier parameters

    theta_wt_cj = clf.feature_log_prob_ # log CP of word given class [n_classes, n_words]
    theta_cj = clf.class_log_prior_
    prev_theta_wt_cj = np.zeros(theta_wt_cj.shape) - np.inf # log CP of word given class [n_classes, n_words]
    prev_theta_cj = np.zeros(theta_cj.shape) - np.inf
    diff_wt_cj = np.sum(theta_wt_cj-prev_theta_wt_cj)
    diff_cj = np.sum(theta_cj-prev_theta_cj)
    totalDiff = diff_wt_cj+diff_cj
    print("diff_wt_cj:",diff_wt_cj,"diff_cj:",diff_cj,'totalDiff:',totalDiff)

    iter_count = 0 # count EM iteration
    max_iter=30
    while (totalDiff!=0 and iter_count<max_iter):
        iter_count += 1
        print("EM iteration #%d" % iter_count) # debug
        # E-step: Estimate class membership of unlabeled documents
        y_u = clf.predict(X_u)
        # M-step: Re-estimate classifier parameters
        X = vstack([X_l, X_u])
        y = np.concatenate((y_l, y_u), axis=0)
        clf.fit(X, y)
        
        theta_wt_cj = clf.feature_log_prob_ # log CP of word given class [n_classes, n_words]
        theta_cj = clf.class_log_prior_

        diff_wt_cj = np.sum(theta_wt_cj-prev_theta_wt_cj)
        diff_cj = np.sum(theta_cj-prev_theta_cj)
        totalDiff = diff_wt_cj+diff_cj
        print("diff_wt_cj:",diff_wt_cj,"diff_cj:",diff_cj,'totalDiff:',totalDiff)
        prev_theta_wt_cj = theta_wt_cj
        prev_theta_cj = theta_cj

    return clf

In [86]:
# Train Naive Bayes classifier (imported) 
# using labeled data set only
cv_clf = MultinomialNB(alpha=1e-2)
cv_clf.fit(X_l, y_l)
predicted = cv_clf.predict(X_test_tf)
accuracy = accuracy_score(test_data.target, predicted) * 100
print("MultinomialNB Accuracy is {0:0.3f} %".format(accuracy))

# Training based on EM algorithm 
# using labeled data set only
cv_clf = semiSupFit(X_l, y_l, X_u)
predicted = cv_clf.predict(X_test_tf)
accuracy = accuracy_score(test_data.target, predicted) * 100
print("EM algo Accuracy is {0:0.3f} %".format(accuracy))

MultinomialNB Accuracy is 63.516 %
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #1
diff_wt_cj: inf diff_cj: inf totalDiff: inf
EM iteration #2
diff_wt_cj: -5831.3499956728065 diff_cj: 0.01281469903627297 totalDiff: -5831.33718097377
EM iteration #3
diff_wt_cj: -1004.3791510275624 diff_cj: 0.0009926520284047058 totalDiff: -1004.378158375534
EM iteration #4
diff_wt_cj: -312.80898895595345 diff_cj: 0.001047318062941116 totalDiff: -312.8079416378905
EM iteration #5
diff_wt_cj: -157.83304980879694 diff_cj: 0.001229723137170069 totalDiff: -157.83182008565979
EM iteration #6
diff_wt_cj: 0.0 diff_cj: 0.0 totalDiff: 0.0
EM algo Accuracy is 72.809 %
