In [2]:
import pandas as pd 
import numpy as np 

#### 1. Evaluation Metrics ####

In [4]:
## Input: y_pred, a list of length n with the predicted labels,
## y_true, a list of length n with the true labels
#                Predicted
#              |   0   |   1
# Actual  0    |  TN   |  FP
#         1    |  FN   |  TP

## Calculates the precision of the predicted labels
def get_precision(y_pred, y_true):
    """Return the precision TP / (TP + FP) of binary predictions.

    Args:
        y_pred: iterable of predicted labels (1 = positive, 0 = negative).
        y_true: iterable of true labels, paired positionally with y_pred.

    Returns:
        The precision as a float, or 0 when nothing was counted as a true
        or false positive (the ratio would otherwise be undefined).
    """
    pairs = list(zip(y_pred, y_true))
    hits = sum(1 for pred, gold in pairs if pred == 1 and gold == 1)
    false_alarms = sum(1 for pred, gold in pairs if pred == 1 and gold == 0)

    predicted_positive = hits + false_alarms
    if predicted_positive == 0:
        # No positive predictions were counted: avoid dividing by zero.
        return 0

    return hits / predicted_positive
    
## Calculates the recall of the predicted labels
def get_recall(y_pred, y_true):
    """Return the recall TP / (TP + FN) of binary predictions.

    Args:
        y_pred: iterable of predicted labels (1 = positive, 0 = negative).
        y_true: iterable of true labels, paired positionally with y_pred.

    Returns:
        The recall as a float, or 0 when y_true contains no positive
        examples (TP + FN == 0), mirroring how get_precision handles its
        own undefined case instead of raising ZeroDivisionError.
    """
    truePos, falseNeg = 0, 0

    for pred, gold in zip(y_pred, y_true):
        if gold != 1:
            # Only actual positives contribute to recall.
            continue
        if pred == 1:
            truePos += 1
        elif pred == 0:
            falseNeg += 1

    actual_positive = truePos + falseNeg
    if actual_positive == 0:
        # No actual positives were seen: recall is undefined, report 0.
        return 0

    return truePos / actual_positive

## Calculates the f-score of the predicted labels
def get_fscore(y_pred, y_true):
    """Return the F-score TP / (TP + 0.5 * (FP + FN)) of binary predictions.

    Args:
        y_pred: iterable of predicted labels (1 = positive, 0 = negative).
        y_true: iterable of true labels, paired positionally with y_pred.

    Returns:
        The F-score as a float, or 0 when there are no true positives.
    """
    tp = fp = fn = 0
    for pred, gold in zip(y_pred, y_true):
        if pred == 1:
            if gold == 1:
                tp += 1
            elif gold == 0:
                fp += 1
        elif pred == 0 and gold == 1:
            fn += 1

    if tp == 0:
        # Without any true positive the score is 0 (and may be undefined).
        return 0

    errors = fp + fn
    return tp / (tp + 0.5 * errors)

#### 2. Complex Word Identification ####

In [None]:
def load_file(data_file):
    """Read a tab-separated word/label file, skipping its header line.

    The first line of the file is treated as a header and ignored. On every
    other non-blank line the first tab-separated field is the word (it is
    lower-cased) and the second field is parsed as an integer label; any
    further fields are ignored.

    Args:
        data_file: path of the UTF-8 encoded file to read.

    Returns:
        A tuple (words, labels) of two parallel lists.

    Raises:
        IndexError: if a non-blank data line has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    words = []
    labels = []
    with open(data_file, 'rt', encoding="utf8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would corrupt a final line that has no trailing newline.
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            line_split = line.split("\t")
            words.append(line_split[0].lower())
            labels.append(int(line_split[1]))
    return words, labels

## Makes feature matrix for all complex
def all_complex_feature(words):
    """Build a one-column feature matrix from the second element of `words`.

    Pairs up the items of words[0] and words[1] and emits one single-element
    row per pair, holding the item taken from words[1]. The result is
    truncated to the shorter of the two sequences.

    NOTE(review): because this indexes words[0] and words[1], `words` looks
    like it is expected to be a pair of parallel sequences (for instance the
    tuple returned by load_file) rather than a flat list of words - confirm
    against the callers.

    Args:
        words: indexable whose first two elements are iterables.

    Returns:
        A list of single-element lists.
    """
    first, second = words[0], words[1]
    return [[item] for _, item in zip(first, second)]

## Labels every word complex
def all_complex(data_file):
    """Evaluate the baseline that labels every word as complex (1).

    Args:
        data_file: path of the labelled data file, read with load_file.

    Returns:
        A list [precision, recall, fscore] for the all-ones prediction,
        scored against the labels found in data_file.
    """
    words, labels = load_file(data_file)

    # Majority-style baseline: predict "complex" for every word.
    predictions = [1 for _ in words]

    scorers = (get_precision, get_recall, get_fscore)
    return [scorer(predictions, labels) for scorer in scorers]


### 2.2: Word length thresholding

## Makes feature matrix for word_length_threshold
def length_threshold_feature(words, threshold):
    """Build a one-column feature matrix from a word-length threshold.

    Args:
        words: iterable of words (anything supporting len()).
        threshold: minimum length, inclusive, for a word to be flagged.

    Returns:
        A list with one row per word: [1] when len(word) >= threshold,
        otherwise [0].
    """
    return [[1] if len(word) >= threshold else [0] for word in words]

## Finds the best length threshold by f-score, and uses this threshold to
## classify the training and development set
def word_length_threshold(training_file, development_file):
    """Tune a word-length threshold on the training set and evaluate it.

    Every candidate threshold from 1 up to one more than the longest training
    word is scored by f-score on the training data (the last candidate labels
    no word as complex). The best-scoring threshold - the smallest one on
    ties - is then used to classify both the training and development sets.

    Args:
        training_file: path of the labelled training file (see load_file).
        development_file: path of the labelled development file.

    Returns:
        A tuple (training_performance, development_performance), each a list
        [precision, recall, fscore].
    """
    train_words, train_labels = load_file(training_file)
    dev_words, dev_labels = load_file(development_file)

    def predict(words, threshold):
        # length_threshold_feature yields one single-element row per word;
        # flatten the rows into a plain list of 0/1 labels.
        return [row[0] for row in length_threshold_feature(words, threshold)]

    longest = max((len(word) for word in train_words), default=0)

    best_threshold, best_fscore = 1, -1.0
    for threshold in range(1, longest + 2):
        fscore = get_fscore(predict(train_words, threshold), train_labels)
        if fscore > best_fscore:  # strict '>' keeps the smallest threshold on ties
            best_threshold, best_fscore = threshold, fscore

    train_pred = predict(train_words, best_threshold)
    dev_pred = predict(dev_words, best_threshold)

    tprecision = get_precision(train_pred, train_labels)
    trecall = get_recall(train_pred, train_labels)
    tfscore = get_fscore(train_pred, train_labels)

    dprecision = get_precision(dev_pred, dev_labels)
    drecall = get_recall(dev_pred, dev_labels)
    dfscore = get_fscore(dev_pred, dev_labels)

    training_performance = [tprecision, trecall, tfscore]
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

### 2.3: Naive Bayes

In [None]:
## Trains a Naive Bayes classifier using length and frequency features
def naive_bayes(training_file, development_file, counts):
    """Train a Naive Bayes classifier using length and frequency features.

    Not implemented yet. The skeleton previously referenced undefined
    variables and failed with an accidental NameError; it now fails
    explicitly so callers get an accurate message.

    The intended result, per the skeleton, is the development-set
    performance as a tuple (precision, recall, fscore); the training-set
    tuple was computed but not returned.

    Args:
        training_file: path of the labelled training file.
        development_file: path of the labelled development file.
        counts: presumably a word -> frequency lookup used for the frequency
            feature - TODO confirm against the caller.

    Raises:
        NotImplementedError: always, until the classifier is written.
    """
    ## YOUR CODE HERE
    raise NotImplementedError(
        "naive_bayes is not implemented: compute (precision, recall, fscore) "
        "on the training and development sets and return the development tuple"
    )

### 2.4: Logistic Regression

In [None]:
def logistic_regression(training_file, development_file, counts):
    """Train a logistic regression classifier for complex word identification.

    Not implemented yet. The skeleton previously referenced undefined
    variables and failed with an accidental NameError; it now fails
    explicitly so callers get an accurate message.

    The intended result, per the skeleton, is the development-set
    performance as a tuple (precision, recall, fscore); the training-set
    tuple was computed but not returned.

    Args:
        training_file: path of the labelled training file.
        development_file: path of the labelled development file.
        counts: presumably a word -> frequency lookup used as a feature
            source - TODO confirm against the caller.

    Raises:
        NotImplementedError: always, until the classifier is written.
    """
    ## YOUR CODE HERE
    raise NotImplementedError(
        "logistic_regression is not implemented: compute (precision, recall, "
        "fscore) on the training and development sets and return the "
        "development tuple"
    )

### 2.7: Build your own classifier

In [None]:
# Script entry point: set up the dataset paths and load the training data.
if __name__ == "__main__":
    # Paths are relative to the current working directory.
    training_file = "data/complex_words_training.txt"
    development_file = "data/complex_words_development.txt"
    test_file = "data/complex_words_test_unlabeled.txt"

    # load_file returns a (words, labels) tuple, so train_data is that pair.
    train_data = load_file(training_file)