The first two preprocessing cells are the same as in Part 3.

In [None]:
import os
import sys
import operator
import time
import numpy as np
import contractions
from sklearn.model_selection import train_test_split


import nltk
from nltk.tokenize import word_tokenize
# Build the truncated vocabulary from the training documents.
nltk.download('punkt')  # needed by word_tokenize
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# Words found to still rank very high (10k+) after a run with just the stop words and examining vocab.txt
other_words = ['lines', 'subject', 'would', 'organization']
# One set for all excluded words: the membership test runs once per token, so O(1) lookups matter here.
excluded_words = set(stop_words) | set(other_words)

# Assuming this file is put under the same parent directory as the data directory, and the data directory is named "20news-train"
root_path = "./20news-train"
# The maximum size of the final vocabulary. It's a hyper-parameter. You can change it to see what value gives the best performance.
MAX_VOCAB_SIZE = 5000

start_time = time.time()
vocab_full = {}
n_doc = 0
# Only keep the data directories and ignore possible system files like .DS_Store
folders = [os.path.join(root_path, name) for name in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, name))]
for folder in folders:
    for filename in os.listdir(folder):
        file = os.path.join(folder, filename)
        n_doc += 1
        with open(file, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                # split contractions into two words before tokenizing
                line = contractions.fix(line)
                for token in word_tokenize(line):
                    # drop tokens containing non-alphabetic characters
                    if not token.isalpha():
                        continue
                    # force everything to lower case
                    token = token.lower()
                    # remove stop words, other words (above) and single characters
                    if len(token) > 1 and token not in excluded_words:
                        vocab_full[token] = vocab_full.get(token, 0) + 1
print(f'{n_doc} documents in total with a total vocab size of {len(vocab_full)}')
vocab_sorted = sorted(vocab_full.items(), key=operator.itemgetter(1), reverse=True)
vocab_truncated = vocab_sorted[:MAX_VOCAB_SIZE]
# Save the vocabulary to file for visual inspection and possible analysis.
# The documents are read as UTF-8, so the tokens are written back as UTF-8 too instead of
# relying on the platform default encoding.
with open('vocab1.txt', 'w', encoding='utf8') as f:
    for word, freq in vocab_truncated:
        f.write(f'{word}\t{freq}\n')
# The final vocabulary is a dict mapping each token to its id. Frequency information is not needed anymore.
vocab = {token: token_id for token_id, (token, _) in enumerate(vocab_truncated)}
# Since we have truncated the vocabulary, we will encounter many tokens that are not in the vocabulary.
# We map all of them to the same 'UNK' token (a common practice in text processing), appended to the end
# of the vocabulary. Its id is the number of kept tokens, which equals MAX_VOCAB_SIZE when the corpus has
# at least that many distinct tokens, and stays a valid column index when it has fewer.
unk_id = len(vocab)
vocab['UNK'] = unk_id
vocab_size = len(vocab)
elapsed_time = time.time() - start_time
print(f'Vocabulary construction took {elapsed_time} seconds')

In [None]:
# Build the bag-of-words count matrix for the documents under `folders`.
# Since we have truncated the vocabulary, it's now reasonable to hold the entire feature matrix in memory
# (its actual size is printed below). If memory is an issue, you could make the vocabulary even smaller or
# use a sparse matrix.
start_time = time.time()
features = np.zeros((n_doc, vocab_size), dtype=int)
print(f'The feature matrix takes {sys.getsizeof(features)} Bytes.')
# The class label of each document
labels = np.zeros(n_doc, dtype=int)
# The mapping from each class folder path (one sub-directory per topic) to an integer ID
label2id = {}
label_id = 0
doc_id = 0
for folder in folders:
    label2id[folder] = label_id
    for filename in os.listdir(folder):
        labels[doc_id] = label_id
        file = os.path.join(folder, filename)
        with open(file, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                # Apply the same normalisation that was used to build the vocabulary (contraction
                # expansion + lower-casing); otherwise capitalised occurrences of a vocabulary word
                # would never match its lower-case key and would all be counted as UNK.
                line = contractions.fix(line)
                for token in word_tokenize(line):
                    # if the current token is in the vocabulary, get its ID; otherwise, get the ID of the UNK token
                    token_id = vocab.get(token.lower(), unk_id)
                    features[doc_id, token_id] += 1
        doc_id += 1
    label_id += 1
elapsed_time = time.time() - start_time
print(f'Feature extraction took {elapsed_time} seconds')

Kmeans class

In [None]:
from scipy.spatial.distance import euclidean
import numpy as np


class Kmeans:
    """K-means clustering over count feature vectors.

    The last column of every feature matrix handed to this class is ignored.
    In this file that column holds the UNK count, which would otherwise skew
    the value ranges and the resulting centroids.

    Attributes:
        k: number of clusters.
        centroids: float array of shape (k, n_features - 1).
        cluster: cluster index of every row seen by the last assignment step
            of ``convergeClusters`` (an empty list before the first call).
    """

    def __init__(self, num_clusters, feature_matrix):
        """Create ``num_clusters`` centroids drawn uniformly at random.

        Each coordinate is drawn from [0, max of that feature in
        ``feature_matrix``]; the lower bound is fixed at 0 (earlier runs
        showed the minimum to always be 0).

        Args:
            num_clusters: number of centroids to create.
            feature_matrix: 2-D array [# of docs, # of features]; the last
                column is not used.
        """
        self.k = num_clusters
        # number of features actually clustered on (everything but the last column)
        n_dims = feature_matrix.shape[1] - 1
        # per-feature upper bound for the random initialisation
        upper = np.max(feature_matrix[:, :n_dims], axis=0)
        # random, uniform initialisation of all k centroids in one call
        self.centroids = np.random.uniform(0, upper, size=(self.k, n_dims))
        self.cluster = []

    def _nearest_centroid(self, features):
        """Return, for each row of ``features``, the index of the closest centroid.

        Distances are Euclidean and computed on all columns but the last.
        One vectorised pass per centroid keeps the temporary arrays at the
        size of the feature matrix.
        """
        points = np.asarray(features)[:, :-1]
        distances = np.empty((points.shape[0], self.k))
        for c_index, centroid in enumerate(self.centroids):
            distances[:, c_index] = np.linalg.norm(points - centroid, axis=1)
        # assign each point to the minimum distance (closest) cluster
        return np.argmin(distances, axis=1)

    def convergeClusters(self, features, max_episodes=10000, tol=None):
        """Iterate assignment and centroid updates until the centroids stop moving.

        Args:
            features: 2-D array [# of docs, # of features]; the last column
                is ignored.
            max_episodes: upper bound on the number of iterations, to prevent
                overly long running times.
            tol: the run counts as converged once the norm of the centroid
                change drops below this value. Defaults to the machine
                epsilon of ``float``.

        Side effects:
            Updates ``self.cluster`` and ``self.centroids`` and prints how
            the run ended.
        """
        if tol is None:
            tol = np.finfo(float).eps
        features = np.asarray(features)
        points = features[:, :-1]

        episodes = 0
        converged = False
        while episodes < max_episodes and not converged:
            episodes += 1
            self.cluster = self._nearest_centroid(features)

            # Start from the current centroids so that a cluster without any
            # member keeps its position instead of collapsing to the origin.
            updated_centroids = np.array(self.centroids, dtype=float)
            for centroid in range(self.k):
                members = points[self.cluster == centroid]
                if len(members):
                    updated_centroids[centroid] = members.mean(axis=0)

            converged = np.linalg.norm(updated_centroids - self.centroids) < tol
            self.centroids = updated_centroids

        # report once, and distinguish real convergence from hitting the cap
        if converged:
            print("converged in {} episodes".format(episodes))
        else:
            print("stopped after {} episodes without converging".format(episodes))

    def evaluate(self, test_features):
        """Return the index of the closest centroid for each row of ``test_features``.

        Uses the same assignment rule as ``convergeClusters`` but does not
        modify the model. The last column of ``test_features`` is ignored.
        """
        return self._nearest_centroid(test_features)


Converge the clusters on the training data

In [None]:
# Build a 20-cluster model whose centroids are initialised from the training
# feature matrix, then iterate on that same matrix until the centroids settle.
kmeans = Kmeans(num_clusters=20, feature_matrix=features)
kmeans.convergeClusters(features=features)

Re-use the first two cells for the test set, changing only the root path.

In [None]:
import os
import sys
import operator
import time
import numpy as np
import contractions
from sklearn.model_selection import train_test_split


import nltk
from nltk.tokenize import word_tokenize
# Prepare the test set: locate its class folders and count its documents.
nltk.download('punkt')  # needed by word_tokenize
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# Words found to still rank very high (10k+) after a run with just the stop words and examining vocab.txt
other_words = ['lines', 'subject', 'would', 'organization']

# Assuming this file is put under the same parent directory as the data directory, and the data directory is named "20news-test"
root_path = "./20news-test"
# Must stay equal to the value used when the training vocabulary was built.
MAX_VOCAB_SIZE = 5000

start_time = time.time()
# The vocabulary (vocab / vocab_size / unk_id) is deliberately NOT rebuilt here. The centroids
# were fitted on features indexed by the training vocabulary; a vocabulary ranked on the test
# documents would assign different words to the same column ids, so the test features would no
# longer be comparable with the centroids. The training vocab1.txt is therefore left untouched too.
# Only keep the data directories and ignore possible system files like .DS_Store
folders = [os.path.join(root_path, name) for name in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, name))]
# every entry of a class folder counts as one document
n_doc = sum(len(os.listdir(folder)) for folder in folders)
print(f'{n_doc} test documents in total; reusing the training vocabulary of size {len(vocab)}')
elapsed_time = time.time() - start_time
print(f'Test set scan took {elapsed_time} seconds')

In [None]:
# Build the bag-of-words count matrix for the documents under `folders`.
# Since we have truncated the vocabulary, it's now reasonable to hold the entire feature matrix in memory
# (its actual size is printed below). If memory is an issue, you could make the vocabulary even smaller or
# use a sparse matrix.
start_time = time.time()
features = np.zeros((n_doc, vocab_size), dtype=int)
print(f'The feature matrix takes {sys.getsizeof(features)} Bytes.')
# The class label of each document
labels = np.zeros(n_doc, dtype=int)
# The mapping from each class folder path (one sub-directory per topic) to an integer ID
label2id = {}
label_id = 0
doc_id = 0
for folder in folders:
    label2id[folder] = label_id
    for filename in os.listdir(folder):
        labels[doc_id] = label_id
        file = os.path.join(folder, filename)
        with open(file, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                # Apply the same normalisation that was used to build the vocabulary (contraction
                # expansion + lower-casing); otherwise capitalised occurrences of a vocabulary word
                # would never match its lower-case key and would all be counted as UNK.
                line = contractions.fix(line)
                for token in word_tokenize(line):
                    # if the current token is in the vocabulary, get its ID; otherwise, get the ID of the UNK token
                    token_id = vocab.get(token.lower(), unk_id)
                    features[doc_id, token_id] += 1
        doc_id += 1
    label_id += 1
elapsed_time = time.time() - start_time
print(f'Feature extraction took {elapsed_time} seconds')

Modify the given test cell so that the labels predicted by the model are stored in `test_labels`, then calculate the accuracy as written. (I believe this is what the cell is set up to do: the second `for` loop collects the true labels from the test documents, and the accuracy is then computed from the two label arrays.)

In [None]:
# Compare the cluster index predicted for every test document with its true class ID.

# Save the map from the name of each label (class folder) to an integer ID.
# Only directories are labels; stray files such as .DS_Store must not consume an ID.
TRAIN_PATH = "./20news-train"
label_names = [name for name in os.listdir(TRAIN_PATH)
               if os.path.isdir(os.path.join(TRAIN_PATH, name))]
label_map = {label: new_ID for new_ID, label in enumerate(label_names)}

# Find the real label for the test dataset.
# NOTE: despite its name, `train_labels` holds the ground-truth labels of the TEST documents.
TEST_PATH = "./20news-test"
test_folders = os.listdir(TEST_PATH)
train_labels = []
for folder in test_folders:
    path = os.path.join(TEST_PATH, folder)
    try:
        files = os.listdir(path)
    except NotADirectoryError:
        # not a class folder (e.g. a system file), skip it
        continue
    # every document in this folder carries the folder's label; the key is the folder name
    train_labels.extend([label_map[folder]] * len(files))

# use the model trained a few cells ago along with the newly extracted features to classify the new documents
test_labels = kmeans.evaluate(features)

# calculate accuracy
test_labels = np.array(test_labels).astype(int)
train_labels = np.array(train_labels).astype(int)
if test_labels.shape != train_labels.shape:
    raise ValueError(
        f'{test_labels.shape[0]} predictions but {train_labels.shape[0]} true labels; '
        'the feature matrix does not match the documents under ' + TEST_PATH
    )
# NOTE(review): `test_labels` are centroid indices returned by kmeans.evaluate, not class IDs.
# Nothing in this cell maps a cluster index to a class, so this value only means something if
# the two numberings happen to coincide - confirm, or map clusters to classes first.
accuracy = np.sum(test_labels == train_labels)/test_labels.shape[0]
print("Accuracy is ", accuracy)