In [1]:
#############################################################
## ASSIGNMENT 2 CODE SKELETON
## RELEASED: 1/29/2019
## DUE: 2/5/2019
## DESCRIPTION: In this assignment, you will explore the
## text classification problem of identifying complex words.
## We have provided the following skeleton for your code,
## with several helper functions, and all the required
## functions you need to write.
#############################################################

from collections import defaultdict
import gzip
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# import matplotlib.pyplot as plt

#### 1. Evaluation Metrics ####

## Input: y_pred, a list of length n with the predicted labels,
## y_true, a list of length n with the true labels

## Calculates the precision of the predicted labels
def get_precision(y_pred, y_true):
    # true positive: system says positive when truly positive
    true_positives = len([pred for idx, pred in enumerate(y_pred) if pred == 1 if y_true[idx] == 1])
    # false positive = system says positive when truly negative
    false_positives = len([pred for idx, pred in enumerate(y_pred) if pred == 1 if y_true[idx] == 0])
    precision = true_positives / (true_positives + false_positives)
    return precision
    
## Calculates the recall of the predicted labels
def get_recall(y_pred, y_true):
    # true positive: system says positive when truly positive
    true_positives = len([pred for idx, pred in enumerate(y_pred) if pred == 1 if y_true[idx] == 1])
    # false negative = system says negative when truly positive
    false_negatives = len([pred for idx, pred in enumerate(y_pred) if pred == 0 if y_true[idx] == 1])
    recall = true_positives / (true_positives + false_negatives)
    return recall

## Calculates the f-score of the predicted labels
def get_fscore(y_pred, y_true):
    precision = get_precision(y_pred, y_true)
    recall = get_recall(y_pred, y_true)
    fscore = (2 * precision * recall) / (precision + recall)
    return fscore

In [2]:
#### 2. Complex Word Identification ####

## Loads in the words and labels of one of the datasets
def load_file(data_file):
    words = []
    labels = []   
    with open(data_file, 'rt', encoding="utf8") as f:
        i = 0
        for line in f:
            if i > 0:
                line_split = line[:-1].split("\t")
                words.append(line_split[0].lower())
                labels.append(int(line_split[1]))
            i += 1
    return words, labels

### 2.1: A very simple baseline

## Makes feature matrix for all complex
def all_complex_feature(words):
    return [1 for word in words]

## Labels every word complex
def all_complex(data_file):
    words, y_true = load_file(data_file)
    y_pred = all_complex_feature(words)
    precision = get_precision(y_pred, y_true)
    recall = get_recall(y_pred, y_true)
    fscore = get_fscore(y_pred, y_true)
    performance = [precision, recall, fscore]
    return performance

In [3]:
training_file = '/Users/sppatankar/Desktop/CIS 530/Homework 2/data/complex_words_training.txt'
development_file = '/Users/sppatankar/Desktop/CIS 530/Homework 2/data/complex_words_development.txt'

In [4]:
### 2.2: Word length thresholding

## Makes feature matrix for word_length_threshold
def length_threshold_feature(words, threshold):
    feature = []
    for word in words:
        if len(word) >= threshold:
            feature.append(1)
        else:
            feature.append(0)
    return feature

## Finds the best length threshold by f-score, and uses this threshold to
## classify the training and development set
def word_length_threshold(training_file, development_file):
    # training set data
    words_train_file, labels_train_file = load_file(training_file)
    precisions = []
    recalls = []
    fscores = []
    thresholds = range(10)
    for threshold in thresholds:
        y_pred_train = length_threshold_feature(words_train_file, threshold)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice = np.argmax(fscores)
    print(threshold_choice)
    # re-train with chosen hyperparameter value
    y_pred_train = length_threshold_feature(words_train_file, threshold_choice)
    tprecision = get_precision(y_pred_train, labels_train_file)
    trecall = get_recall(y_pred_train, labels_train_file)
    tfscore = get_fscore(y_pred_train, labels_train_file)
    training_performance = [tprecision, trecall, tfscore]
    # development set data
    words_dev_file, labels_dev_file = load_file(development_file)
    y_pred_dev = length_threshold_feature(words_dev_file, threshold_choice)
    dprecision = get_precision(y_pred_dev, labels_train_file)
    drecall = get_recall(y_pred_dev, labels_train_file)
    dfscore = get_fscore(y_pred_dev, labels_train_file)
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

In [5]:
word_length_threshold(training_file, development_file)

7


([0.6007401315789473, 0.8440207972270364, 0.7018976699495555],
 [0.4280936454849498, 0.5752808988764045, 0.4908916586768936])

In [23]:
### 2.3: Word frequency thresholding

## Loads Google NGram counts
def load_ngram_counts(ngram_counts_file): 
    counts = defaultdict(int) 
    with gzip.open(ngram_counts_file, 'rt') as f: 
        for line in f:
            token, count = line.strip().split('\t') 
            if token[0].islower(): 
                counts[token] = int(count) 
    return counts

ngram_counts_file = '/Users/sppatankar/Desktop/CIS 530/Homework 2/ngram_counts.txt.gz'
counts = load_ngram_counts(ngram_counts_file)

# Finds the best frequency threshold by f-score, and uses this threshold to
## classify the training and development set

## Make feature matrix for word_frequency_threshold

def frequency_threshold_feature(words, threshold, counts):
    feature = []
    for word in words:
        if counts[word] < threshold: # infrequent words are complex
            feature.append(1)
        else:
            feature.append(0)
    return feature

def word_frequency_threshold(training_file, development_file, counts):
    # training set data
    words_train_file, labels_train_file = load_file(training_file)
    precisions = []
    recalls = []
    fscores = []
    thresholds = list(range(1000000, 70000000, 100000))
    for threshold in thresholds:
        y_pred_train = frequency_threshold_feature(words_train_file, threshold, counts)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice = thresholds[np.argmax(fscores)]
    print(threshold_choice)
    # re-train with chosen hyperparameter value
    y_pred_train = frequency_threshold_feature(words_train_file, threshold_choice, counts)
    tprecision = get_precision(y_pred_train, labels_train_file)
    trecall = get_recall(y_pred_train, labels_train_file)
    tfscore = get_fscore(y_pred_train, labels_train_file)
    training_performance = [tprecision, trecall, tfscore]
    # development set data
    words_dev_file, labels_dev_file = load_file(development_file)
    y_pred_dev = frequency_threshold_feature(words_dev_file, threshold_choice, counts)
    dprecision = get_precision(y_pred_dev, labels_train_file)
    drecall = get_recall(y_pred_dev, labels_train_file)
    dfscore = get_fscore(y_pred_dev, labels_train_file)
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

In [24]:
word_frequency_threshold(training_file, development_file, counts)

19900000


([0.5657051282051282, 0.8157134604274986, 0.6680861130825645],
 [0.4353312302839117, 0.6202247191011236, 0.5115848007414273])

In [12]:
### 2.4: Naive Bayes
        
## Trains a Naive Bayes classifier using length and frequency features
def naive_bayes(training_file, development_file, counts):
    
    words_train_file, labels_train_file = load_file(training_file)
    precisions = []
    recalls = []
    fscores = []
    thresholds = range(10)
    for threshold in thresholds:
        y_pred_train = length_threshold_feature(words_train_file, threshold)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice_length = np.argmax(fscores)
    train_length_feat = length_threshold_feature(words_train_file, threshold_choice_length)
    
    precisions = []
    recalls = []
    fscores = []
    thresholds = list(range(1000000, 70000000, 100000))
    for threshold in thresholds:
        y_pred_train = frequency_threshold_feature(words_train_file, threshold, counts)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice_freq = thresholds[np.argmax(fscores)]
    train_freq_feat = frequency_threshold_feature(words_train_file, threshold_choice_freq, counts)
    
    X_train = np.column_stack((train_length_feat, train_freq_feat))
    X_train = (X_train - X_train.mean(axis = 0)) / X_train.std(axis = 0) # normalize
    Y_train = np.asarray(labels_train_file)
    clf = GaussianNB()
    clf.fit(X_train, Y_train)
    Y_pred_train = clf.predict(X_train)
    tprecision = get_precision(Y_pred_train.tolist(), Y_train.tolist())
    trecall = get_recall(Y_pred_train.tolist(), Y_train.tolist())
    tfscore = get_fscore(Y_pred_train.tolist(), Y_train.tolist())
    training_performance = (tprecision, trecall, tfscore)
    
    words_dev_file, labels_dev_file = load_file(development_file)
    dev_length_feat = length_threshold_feature(words_dev_file, threshold_choice_length)
    dev_freq_feat = frequency_threshold_feature(words_dev_file, threshold_choice_freq, counts)
    X_dev = np.column_stack((dev_length_feat, dev_freq_feat))
    X_dev = (X_dev - X_train.mean(axis = 0)) / X_train.std(axis = 0) # normalize
    Y_dev = np.asarray(labels_dev_file)

    Y_pred_dev = clf.predict(X_dev)
    dprecision = get_precision(Y_pred_dev.tolist(), Y_dev.tolist())
    drecall = get_recall(Y_pred_dev.tolist(), Y_dev.tolist())
    dfscore = get_fscore(Y_pred_dev.tolist(), Y_dev.tolist())
    development_performance = (dprecision, drecall, dfscore)
    
    return development_performance

In [13]:
dev_perf = naive_bayes(training_file, development_file, counts)
dev_perf

(0.418, 1.0, 0.5895627644569816)

In [25]:
### 2.5: Logistic Regression

## Trains a Logistic Regression classifier using length and frequency features
def logistic_regression(training_file, development_file, counts):
    
    words_train_file, labels_train_file = load_file(training_file)
    precisions = []
    recalls = []
    fscores = []
    thresholds = range(10)
    for threshold in thresholds:
        y_pred_train = length_threshold_feature(words_train_file, threshold)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice_length = np.argmax(fscores)
    train_length_feat = length_threshold_feature(words_train_file, threshold_choice_length)
    
    precisions = []
    recalls = []
    fscores = []
    thresholds = list(range(1000000, 70000000, 100000))
    for threshold in thresholds:
        y_pred_train = frequency_threshold_feature(words_train_file, threshold, counts)
        precisions.append(get_precision(y_pred_train, labels_train_file))
        recalls.append(get_recall(y_pred_train, labels_train_file))
        fscores.append(get_fscore(y_pred_train, labels_train_file))
    threshold_choice_freq = thresholds[np.argmax(fscores)]
    print(threshold_choice_freq)
    train_freq_feat = frequency_threshold_feature(words_train_file, threshold_choice_freq, counts)
    
    X_train = np.column_stack((train_length_feat, train_freq_feat))
    X_train = (X_train - X_train.mean(axis = 0)) / X_train.std(axis = 0) # normalize
    Y_train = np.asarray(labels_train_file)
    clf = LogisticRegression()
    clf.fit(X_train, Y_train)
    Y_pred_train = clf.predict(X_train)
    tprecision = get_precision(Y_pred_train.tolist(), Y_train.tolist())
    trecall = get_recall(Y_pred_train.tolist(), Y_train.tolist())
    tfscore = get_fscore(Y_pred_train.tolist(), Y_train.tolist())
    training_performance = (tprecision, trecall, tfscore)
    
    words_dev_file, labels_dev_file = load_file(development_file)
    dev_length_feat = length_threshold_feature(words_dev_file, threshold_choice_length)
    dev_freq_feat = frequency_threshold_feature(words_dev_file, threshold_choice_freq, counts)
    X_dev = np.column_stack((dev_length_feat, dev_freq_feat))
    X_dev = (X_dev - X_train.mean(axis = 0)) / X_train.std(axis = 0) # normalize
    Y_dev = np.asarray(labels_dev_file)

    Y_pred_dev = clf.predict(X_dev)
    dprecision = get_precision(Y_pred_dev.tolist(), Y_dev.tolist())
    drecall = get_recall(Y_pred_dev.tolist(), Y_dev.tolist())
    dfscore = get_fscore(Y_pred_dev.tolist(), Y_dev.tolist())
    development_performance = (dprecision, drecall, dfscore)
    
    return development_performance

In [26]:
dev_perf = logistic_regression(training_file, development_file, counts)
dev_perf

19900000




(0.5221799746514575, 0.9856459330143541, 0.6826843413421706)

In [None]:
### 2.7: Build your own classifier

## Trains a classifier of your choosing, predicts labels for the test dataset
## and writes the predicted labels to the text file 'test_labels.txt',
## with ONE LABEL PER LINE


if __name__ == "__main__":
    training_file = "data/complex_words_training.txt"
    development_file = "data/complex_words_development.txt"
    test_file = "data/complex_words_test_unlabeled.txt"

    train_data = load_file(training_file)
    
    ngram_counts_file = "ngram_counts.txt.gz"
    counts = load_ngram_counts(ngram_counts_file)