Sawyer Byrd

CMSC422 HW1

In [2]:
#imports 
from pathlib import Path
import numpy as np
import pandas as pd
import re
import chardet
import math
from collections import Counter


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Cleaning The Data

In [3]:
# setting directory
directory = Path('/home/sawyerbyrd/CMSC422/HW1/20_newsgroups')

puting label names into list.

In [4]:
# index of label in list will corespond to label number.
# e.g. 'comp.graphics' is at index 1 so its label number will be 1.

labels = []

for file_path in directory.iterdir(): 
   labels.append(file_path.name)

print(labels)

['comp.os.ms-windows.misc', 'comp.graphics', 'sci.crypt', 'rec.sport.baseball', 'comp.windows.x', 'rec.motorcycles', 'rec.autos', 'soc.religion.christian', 'talk.politics.misc', 'talk.politics.guns', 'sci.electronics', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'alt.atheism', 'talk.religion.misc', 'sci.space', 'sci.med', 'rec.sport.hockey', 'misc.forsale', 'talk.politics.mideast']


function that removes the first 4 lines of each document

In [5]:
def remove_first_4(file_path): 
    file = file_path.open('r', errors='ignore')
    lines = file.readlines()
    return lines[4:]

Function that tokenizes text

In [6]:
# returns the cleaned, lowercase, tokenized text.

def tokenize(text):
    text = text.lower()
    # collects only alphanumeric and spaces 
    cleaned = re.sub(r'[^a-z\s]', ' ', text)
    # sorts though for words of length >= 2
    tokens = re.findall(r'\b[a-z]{2,}\b', cleaned)
    return tokens

Entering docs into dataframe

In [7]:
# each row has: col1 -> doc contents ; col2 -> label

# list that will be used to create dataframe
docs = []

# iterating through class files
for label_path in directory.iterdir():
    # iterating through each doc 
    for file_path in label_path.iterdir():
        if file_path.is_file():
            # removing the first 4 lines, making it into one string and tokenizing it
            doc_content = remove_first_4(file_path)
            doc_content = tokenize(''.join(doc_content))
            docs.append({
                'Contents': doc_content,
                'Label': labels.index(label_path.name)
            })

all_docs = pd.DataFrame(docs)


Splitting into train and test sets

In [8]:
train_temp = []
test_temp = []
# half of the docs from each class in train and half in test
for label in range(20): 
    class_df = all_docs[all_docs['Label'] == label]
    # shuffling the class set before spliting
    class_df = class_df.sample(frac=1, random_state=37).reset_index(drop=True)
    train_temp.append(class_df[:500])
    test_temp.append(class_df[500:])

# concat all dfs for train and test into one df each
# shuffling test for randomness
train = pd.concat(train_temp).reset_index(drop=True)
test = pd.concat(test_temp).reset_index(drop=True)
test = test.sample(frac=1, random_state=37).reset_index(drop=True)

train

Unnamed: 0,Contents,Label
0,"[subject, lockups, in, enh, mode, floppy, mess...",0
1,"[message, id, qgsb, world, std, com, followup,...",0
2,"[date, apr, organization, center, for, reliabl...",0
3,"[keywords, winprinter, from, lasermaster, corp...",0
4,"[message, id, silver, sfu, ca, sender, news, s...",0
...,...,...
9995,"[subject, desertification, of, the, negev, mes...",19
9996,"[subject, re, israeli, terrorism, date, apr, g...",19
9997,"[subject, the, soviet, armenian, government, m...",19
9998,"[subject, arrest, of, fugitive, in, adl, case,...",19


Organizing some data for training algorithm

In [9]:
# each index in this array holds a dictionary for the counts of unique words in the cooresponding class
# ie. index 0 holds the counts for all unique words in class 0 ('comp.os.ms-windows.misc')
class_wrd_ct = [Counter() for _ in range(20)]

# this holds the count of each unique word in the entirety of the train set
# ie. the vocab set
vocab = Counter()

# for each row, update the class at index (label) and vocab with word count for that docs content
for i, row in train.iterrows():
    cont = row['Contents']
    label = row['Label']
    class_wrd_ct[label].update(cont)
    vocab.update(cont)

Creating a stop list and removing those words from V

In [10]:
stop_lst = set({word for word, _ in vocab.most_common(200)})

In [11]:
# removing stop list from vocab
vocab = set({
    word: word
    for word in vocab.keys()
    if word not in stop_lst
})

Setting up constant for training

In [12]:
voc_len = len(vocab)
print('Length of Vocabulary: ', voc_len)

Length of Vocabulary:  83330


training

In [13]:
# creating and populating loglikelyhood dictionary

log_likelyhood = {}

for cls in range(20):
    # for each class, create a log likeleyhood entry 
    log_likelyhood[cls] = {}
    # for each word in vocab
    for word in vocab:
        count_w_c = class_wrd_ct[cls].get(word, 0) + 1
        class_ct = len(class_wrd_ct[cls]) + voc_len
        # log likeleyhood[w,c] = (count(w,c) + 1) / (how many times it shows up in cls + |V|)
        log_likelyhood[cls][word] = math.log(count_w_c / class_ct)


Testing


Setting up constant for tests

In [14]:
logprior = math.log(1000/20000)
print('Logprior: ', logprior)

Logprior:  -2.995732273553991


Defining the test algorithm

In [15]:
def test_naive(test_doc):
    sum = [logprior] * 20
    for c in range(20):
        for word in test_doc:
            if word in vocab:
                sum[c] = sum[c] + log_likelyhood[c][word]
    
    arg_max_c = max(sum)
    cls = sum.index(arg_max_c)
    return cls
    

Creating a new column in the dataframe to represent the classification (i.e. y_hat)

In [16]:
test['y_hat'] = 0

Creating variables to keep track of correct classifications

In [17]:
# variable to count the number of correct classifications
correct = 0

# dictionary to count the number of correct classifications in each class
class_correct = {}

Runing each test doc through algorithm

In [18]:
for i, row in test.iterrows():
    # updating y_hat to the algorithms guess for what class this doc is in
    y_hat = test_naive(row['Contents'])
    row['y_hat'] = y_hat
    # updating the number of correct classifications
    label = row['Label']
    if label == y_hat:
        correct += 1
        if label not in class_correct:
            class_correct[label] = 0
        class_correct[label] += 1

Checking the total accuracy, as well as for each individual class

In [19]:
# Total accuracy
print('Number Correct: ', correct)
percent_correct = (correct/len(test)) * 100

# Class accuracy
print('Percentage Correct: ', percent_correct, '%\n')
for key, value in class_correct.items():
    print('Class: ', labels[key])
    print('percent_correct: ', (value/500) * 100, '%\n')

Number Correct:  8187
Percentage Correct:  81.89456837051115 %

Class:  sci.med
percent_correct:  89.4 %

Class:  sci.crypt
percent_correct:  94.39999999999999 %

Class:  rec.motorcycles
percent_correct:  91.8 %

Class:  sci.space
percent_correct:  93.0 %

Class:  soc.religion.christian
percent_correct:  98.0 %

Class:  talk.politics.misc
percent_correct:  79.60000000000001 %

Class:  comp.graphics
percent_correct:  80.60000000000001 %

Class:  sci.electronics
percent_correct:  72.0 %

Class:  comp.sys.mac.hardware
percent_correct:  70.6 %

Class:  talk.politics.mideast
percent_correct:  96.8 %

Class:  talk.religion.misc
percent_correct:  58.599999999999994 %

Class:  talk.politics.guns
percent_correct:  89.4 %

Class:  rec.sport.baseball
percent_correct:  88.6 %

Class:  comp.windows.x
percent_correct:  89.0 %

Class:  rec.sport.hockey
percent_correct:  96.6 %

Class:  misc.forsale
percent_correct:  48.8 %

Class:  comp.sys.ibm.pc.hardware
percent_correct:  74.8 %

Class:  alt.atheis