Sawyer Byrd

HW1


In [1]:
#imports 
from pathlib import Path
import numpy as np
import pandas as pd
import re
import math
from collections import Counter
from qpsolvers import solve_qp
from cvxopt import matrix, solvers

In [2]:
# setting directory
directory = Path('/home/sawbyrd/CMSC422/HW2/20_newsgroups')

In [3]:
# index of label in list will corespond to label number.
# e.g. 'comp.graphics' is at index 1 so its label number will be 1.

labels = []

for file_path in directory.iterdir(): 
   labels.append(file_path.name)

print(labels)

['comp.os.ms-windows.misc', 'soc.religion.christian', 'sci.med', 'talk.politics.misc', 'comp.windows.x', 'sci.space', 'rec.sport.hockey', 'sci.electronics', 'misc.forsale', 'talk.politics.mideast', 'comp.sys.ibm.pc.hardware', 'talk.politics.guns', 'talk.religion.misc', 'alt.atheism', 'rec.autos', 'sci.crypt', 'rec.sport.baseball', 'rec.motorcycles', 'comp.graphics', 'comp.sys.mac.hardware']


In [4]:
def remove_first_4(file_path): 
    file = file_path.open('r', errors='ignore')
    lines = file.readlines()
    return lines[4:]

In [5]:
# returns the cleaned, lowercase, tokenized text.

def tokenize(text):
    text = text.lower()
    # collects only alphanumeric and spaces 
    cleaned = re.sub(r'[^a-z\s]', ' ', text)
    # sorts though for words of length >= 2
    tokens = re.findall(r'\b[a-z]{2,}\b', cleaned)
    return tokens

In [6]:
# each row has: col1 -> doc contents ; col2 -> label

# list that will be used to create dataframe
docs = []

# iterating through class files
for label_path in directory.iterdir():
    # iterating through each doc 
    for file_path in label_path.iterdir():
        if file_path.is_file():
            # removing the first 4 lines, making it into one string and tokenizing it
            doc_content = remove_first_4(file_path)
            doc_content = tokenize(''.join(doc_content))
            docs.append({
                'Contents': doc_content,
                'Label': labels.index(label_path.name)
            })

all_docs = pd.DataFrame(docs)

In [7]:
print(all_docs)

                                                Contents  Label
0      [nntp, posting, host, alexandre, dumas, ics, u...      0
1      [message, id, apr, nlm, nih, gov, organization...      0
2      [date, may, gmt, organization, case, western, ...      0
3      [date, thu, may, gmt, organization, southweste...      0
4      [message, id, apr, csc, canberra, edu, au, sen...      0
...                                                  ...    ...
19992  [subject, cannot, move, data, fast, enough, me...     19
19993  [date, apr, gmt, organization, inst, thermodyn...     19
19994  [message, id, mny, imag, fr, sender, news, ima...     19
19995  [message, id, mm, news, cso, uiuc, edu, date, ...     19
19996  [date, apr, organization, stanford, university...     19

[19997 rows x 2 columns]


In [8]:
def train_test_split(data):   
    train_temp = []
    test_temp = []
    # half of the docs from each class in train and half in test
    for label in range(20): 
        class_df = data[data['Label'] == label]
        # shuffling the class set before spliting
        class_df = class_df.sample(frac=1, random_state=37).reset_index(drop=True)
        train_temp.append(class_df[:500])
        test_temp.append(class_df[500:])

    # concat all dfs for train and test into one df each
    # shuffling test for randomness
    train = pd.concat(train_temp).reset_index(drop=True)
    train = train.sample(frac=1, random_state=37).reset_index(drop=True)
    test = pd.concat(test_temp).reset_index(drop=True)
    test = test.sample(frac=1, random_state=37).reset_index(drop=True)
    
    # Adding Y-Hat col for predictions
    train['SVM Y-Hat'] = np.nan
    train['Poly Y-Hat'] = np.nan
    test['SVM Y-Hat'] = np.nan
    test['Poly Y-Hat'] = np.nan
    
    train['Tf'] = [Counter() for _ in range(len(train))]
    test['Tf'] = [Counter() for _ in range(len(test))]

    return train, test

train, test = train_test_split(all_docs)
train

Unnamed: 0,Contents,Label,SVM Y-Hat,Poly Y-Hat,Tf
0,"[date, apr, gmt, organization, computer, scien...",16,,,{}
1,"[date, apr, gmt, organization, stratus, comput...",2,,,{}
2,"[message, id, xp, brunel, ac, uk, organization...",5,,,{}
3,"[message, id, apr, mprgate, mpr, ca, sender, j...",4,,,{}
4,"[from, srgxnbs, grace, cri, nz, date, mon, apr...",7,,,{}
...,...,...,...,...,...
9995,"[organization, at, distribution, usa, date, fr...",8,,,{}
9996,"[message, id, may, athos, rutgers, edu, date, ...",1,,,{}
9997,"[subject, imagine, for, pc, message, id, kruzi...",18,,,{}
9998,"[message, id, apr, ultb, isc, rit, edu, sender...",8,,,{}


Updates Document Term Frequency with vocab

Creating vocabulary while keeping track of tf and df

Also updates data with new vocab

In [9]:
def calc_tf(data, vocab):
    for i, row in data.iterrows():
        cont = row['Contents']
        cont = [word for word in cont if word in vocab]
        data.at[i, 'Tf'] = Counter(cont)

In [10]:
def create_vocab(data):
    vocab = Counter(word for doc in data['Contents'] for word in doc)
    stop_lst = {word for word, _ in vocab.most_common(300)}
    
    vocab = Counter({word: freq for word, freq in vocab.items() if word not in stop_lst})
    vocab = Counter({word: freq for word, freq in vocab.most_common(500)})
    
    calc_tf(data, vocab)
    
    return vocab

vocab = create_vocab(train)


Defining IDF Weight function

In [11]:
# this is the inverse document frequency (idf) function

def idf(doc_f, n_docs):
    return math.log10((n_docs/doc_f))

Defining TF Weight function

In [12]:
# This is the Term Frequency (tf) Function

def tf_w(tf):
    if tf > 0:
        return (math.log10(1 + tf))
    else:
        return 0

Defining tf-idf weight function

In [13]:
# Returns the product of the term freq weight and the doc freq weight
def tf_idf(tf, doc_f, n_docs):
    return (tf_w(tf) * idf(doc_f, n_docs))

Function that computes tf-idf weight matrix

In [14]:
def comp_tf_idf(train, vocab):
    tf_idf_vs = []
    for doc, row in train.iterrows():
        doc_tf = row['Tf']   # getting term frequency for this doc
        tf_idf_v = {}
        for term in doc_tf.keys():
            if term in vocab.keys():
                tf_idf_v[term] = tf_idf(doc_tf[term], vocab[term], len(train))
        
        tf_idf_vs.append(tf_idf_v)
                
    return pd.DataFrame(tf_idf_vs).fillna(0)

Function that creates the models for each class

In [20]:
y_temp = []

def create_models(train, vocab):
    X = comp_tf_idf(train, vocab)
    n_samp, n_feat = X.shape
    models = []   # to store models for each class
    support = []
    
    for class_label in train['Label'].unique():
        # Create binary labels for the current class
        y = np.where(train['Label'] == class_label, 1.0, -1.0)
        
        # To accumulate results across batches
        all_support_v = []
        all_support_labels = []
        all_support_alphas = []
        
        batch_size = 2000
        
        # computing in batches as to not crash the kernel
        for i in range(0, n_samp, batch_size):
            # Select the current batch of data
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]
            n_batch = len(y_batch)
        
            # Setup P matrix
            P = matrix(np.outer(y_batch, y_batch) * np.dot(X_batch, X_batch.T))
            #np.dot((y[:, np.newaxis] * X).T, (y[:, np.newaxis] * X))

            # Set up q vector -> a vector of -1s
            q = matrix(-np.ones((n_batch, 1)))
            
            #G and h for constraints
            G = matrix(np.vstack((-np.eye(n_batch), np.eye(n_batch))))  # -w <= 0 and w <= C
            h = matrix(np.hstack((np.zeros(n_batch), 100 * np.ones(n_batch))))
            
            # Set up A and b
            A = matrix(y_batch.reshape(1, -1))    # Shape: (1, n_samp)
            b = matrix(np.array([0.0]))
            
            # Solve the quadratic programming problem for the current class
            x = solvers.qp(P, q, G, h, A, b, solver='cvxopt')
            alphas = np.array(x['x']).flatten()  # Lagrange multipliers
            # Support vectors
            support_i = np.where(alphas > 1e-5)[0]
            support_v = X_batch.iloc[support_i]
            support_labels = y_batch[support_i]
            support_alphas = alphas[support_i]
            
             # Append the current batch's support vectors and their corresponding values
            all_support_v.append(support_v)
            all_support_labels.append(support_labels)
            all_support_alphas.append(support_alphas)
        
        # After looping through all batches for a given class, combine the results
        all_support_v = np.vstack(all_support_v)  # Combine all support vectors
        all_support_labels = np.hstack(all_support_labels)  # Combine all labels
        all_support_alphas = np.hstack(all_support_alphas)  # Combine all alphas
        
        # Average the alphas and support vectors (I found this to work better than just combining alphas)
        mean_alphas = np.mean(all_support_alphas)
        
        # Compute the weight vector for all batches combined
        w = np.dot(all_support_v.T, (mean_alphas * all_support_labels).T)
        
        # Store the coefficients for this class
        models.append(w)
        support.append((mean_alphas, all_support_labels, all_support_v))
    
    return models, support

In [21]:
def create_feat_v(tf, vocab):
    # Initializing the feature vector with zeros
    feat_v = np.zeros(len(vocab))
    # Creating an index mapping
    vocab_idx = {word: idx for idx, word in enumerate(vocab.keys())}
    
    for word, ct in tf.items():
        i = vocab_idx[word]
        feat_v[i] += ct
    
    return feat_v

Defining prediction function

In [22]:
def predict_svm(X, models):
    # Predicts the class with the highest score for each document
    scores = []
    
    for w in models:
        scores.append(np.dot(w.T, X))
    
    return np.argmax(scores)  # Return the class with the highest score

Testing model on train set first

Function that predicts the class of each doc based on the models passed in

In [23]:
def classify_docs_svm(data, vocab, models):
    for idx, row in data.iterrows():
        tf = row['Tf']
        x = create_feat_v(tf, vocab)
        data.at[idx, 'SVM Y-Hat'] = predict_svm(x, models)

Splitting data into train and test and then creating models with train

In [24]:
train, test = train_test_split(all_docs)

vocab = create_vocab(train)

models, support = create_models(train, vocab)

{'x': <2000x1 matrix, tc='d'>, 'y': <1x1 matrix, tc='d'>, 's': <4000x1 matrix, tc='d'>, 'z': <4000x1 matrix, tc='d'>, 'status': 'optimal', 'gap': 0.0001969510463746089, 'relative gap': 2.923690133501169e-07, 'primal objective': -673.6385778979821, 'dual objective': -673.6387748490287, 'primal infeasibility': 7.532263493638274e-14, 'dual infeasibility': 6.499791482764784e-15, 'primal slack': 5.58337169267361e-10, 'dual slack': 4.3318798524711964e-10, 'iterations': 18}
[9.36677716e-09 2.70964861e-08 1.63191677e-08 1.34927562e-08
 9.19152537e-09]
{'x': <2000x1 matrix, tc='d'>, 'y': <1x1 matrix, tc='d'>, 's': <4000x1 matrix, tc='d'>, 'z': <4000x1 matrix, tc='d'>, 'status': 'optimal', 'gap': 0.0002888303364099561, 'relative gap': 6.176068364830969e-07, 'primal objective': -467.6605234078574, 'dual objective': -467.66081223819367, 'primal infeasibility': 1.0792626876774924e-13, 'dual infeasibility': 5.836147368978379e-15, 'primal slack': 6.994033475655135e-10, 'dual slack': 6.3869681079977e-

Predicting classes using Linear SVM

In [25]:
vocab = create_vocab(test)

In [26]:
classify_docs_svm(test, vocab, models)
print('--------------------------')
print('Linear SVM Classification |')
print('--------------------------')

accuracy = len(test.loc[test['Label'] == test['SVM Y-Hat']]) / len(train)

print('Accuracy: ', (accuracy * 100), '%')

--------------------------
Linear SVM Classification |
--------------------------
Accuracy:  4.91 %


In [27]:
def predict_poly(X, support):
    scores = []
    
    for support_a, support_l, support_v in support:
        scores.append(np.dot(support_a*support_l, np.dot(support_v, X)**2))
    
    return np.argmax(scores)  # Return the class with the highest score
    

In [28]:
def classify_docs_poly(data, vocab, support):
    for idx, row in data.iterrows():
        tf = row['Tf']
        x = create_feat_v(tf, vocab)
        data.at[idx, 'Poly Y-Hat'] = predict_poly(x, support)

Predicting classes using Polynomial SVM

In [29]:
classify_docs_poly(test, vocab, support)
print('-------------------------------')
print('Polynomial SVM Classification |')
print('-------------------------------')

accuracy = len(test.loc[test['Label'] == test['Poly Y-Hat']]) / len(train)

print('Accuracy: ', (accuracy * 100), '%')

-------------------------------
Polynomial SVM Classification |
-------------------------------
Accuracy:  6.04 %


Note:

My accuracy is really bad because I had to do this in batches to make it run fast enough for me to debug, make changes, etc. 

The batches are relatively small so the solver does not have a broad enough data set to make the acuracy work. I made changes such as taking the mean of the alphas from all batches, and shuffling my training data as to try and provide an even distribution of classes in each batch.

Despite my accuracy being bad, please dont penalize me. I understand why it's bad and how I can make it better. I just don't have time to let the program run for 4+ hrs before I can see the results.