In [1]:
import math
from collections import defaultdict

import numpy as np
import pandas as pd

#### 1. Evaluation Metrics ####

In [None]:
## Input: y_pred, a list of length n with the predicted labels,
## y_true, a list of length n with the true labels
#                Pred
#              |   0   |   1
# true    0    |  TN   |  FP
#         1    |  FN   |  TP

## Calculates the precision of the predicted labels
def get_precision(y_pred, y_true):
    """Return the precision, TP / (TP + FP), of the predicted labels.

    y_pred and y_true are paired element by element with zip, so extra items
    in the longer sequence are ignored. Only pairs whose prediction equals 1
    and whose true label equals 1 (true positive) or 0 (false positive) are
    counted.

    Returns 0 when nothing was counted as a positive prediction.
    """
    pairs = list(zip(y_pred, y_true))
    hits = sum(1 for guess, gold in pairs if guess == 1 and gold == 1)
    false_alarms = sum(1 for guess, gold in pairs if guess == 1 and gold == 0)

    predicted_positive = hits + false_alarms
    # With no positive predictions the ratio is undefined; report 0 instead.
    return hits / predicted_positive if predicted_positive != 0 else 0
    
## Calculates the recall of the predicted labels
def get_recall(y_pred, y_true): #TP/(TP + FN)
    """Return the recall, TP / (TP + FN), of the predicted labels.

    y_pred and y_true are paired element by element with zip, so extra items
    in the longer sequence are ignored. Only pairs whose true label equals 1
    and whose prediction equals 1 (true positive) or 0 (false negative) are
    counted.

    Returns 0 when no actual positives were counted (TP + FN == 0) instead of
    raising ZeroDivisionError; this is the same convention get_precision uses
    for an empty denominator.
    """
    truePos, falseNeg = 0, 0

    for i, j in zip(y_pred, y_true):
        if i == 1 and j == 1:
            truePos += 1
        elif i == 0 and j == 1:
            falseNeg += 1

    if truePos + falseNeg == 0:  # no positive true labels (or empty input)
        return 0

    recall = truePos / (truePos + falseNeg)
    return recall

## Calculates the f-score of the predicted labels
def get_fscore(y_pred, y_true):
    """Return the F1 score, TP / (TP + 0.5 * (FP + FN)), of the predicted labels.

    y_pred and y_true are paired element by element with zip. Only labels
    equal to 1 or 0 contribute to the true-positive, false-positive and
    false-negative tallies.

    Returns 0 when there are no true positives.
    """
    tp = fp = fn = 0
    for guess, gold in zip(y_pred, y_true):
        if guess == 1:
            if gold == 1:
                tp += 1
            elif gold == 0:
                fp += 1
        elif guess == 0 and gold == 1:
            fn += 1

    # Without any true positive the score is 0 (and may be undefined).
    if tp == 0:
        return 0

    errors = fp + fn
    return tp / (tp + 0.5 * errors)

#### 2. Complex Word Identification ####

In [1]:
def load_file(data_file):
    """Read a tab-separated data file into parallel word and label lists.

    The first line is treated as a header and skipped. For every other
    non-blank line, column 0 is the word (lower-cased) and column 1 is parsed
    as the integer label; any further columns are ignored.

    Args:
        data_file: path of the UTF-8 encoded file to read.

    Returns:
        (words, labels): two lists of equal length.

    Raises:
        IndexError: if a data line has fewer than two tab-separated columns.
        ValueError: if the label column is not an integer.
    """
    words = []
    labels = []
    with open(data_file, 'rt', encoding="utf8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            # Strip only the newline itself: blindly slicing off the last
            # character corrupts the final line of a file that does not end
            # with a newline.
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing empty line
            line_split = line.split("\t")
            words.append(line_split[0].lower())
            labels.append(int(line_split[1]))
    return words, labels

## Makes the feature matrix for the all-complex baseline (one feature, always 1)
def all_complex_feature(words):
    """Return one single-element row [1] per word, regardless of the word."""
    # A fresh [1] is built for every word so the rows are independent lists.
    return [[1] for _ in words]

## Labels every word complex
def all_complex(data_file):
    """Score the baseline that labels every word in data_file as complex (1).

    Returns:
        [precision, recall, fscore] of that all-ones prediction against the
        labels read from data_file.
    """
    words, gold = load_file(data_file)
    everything_complex = [1 for _ in words]

    # Evaluate in the order precision, recall, f-score.
    scorers = (get_precision, get_recall, get_fscore)
    return [score(everything_complex, gold) for score in scorers]


### 2.2: Word length thresholding

## Makes feature matrix for word_length_threshold
def length_threshold_feature(words, threshold):
    """Return one row per word: [1] if len(word) >= threshold, otherwise [0]."""
    return [[1] if len(token) >= threshold else [0] for token in words]

## Finds the best length threshold by f-score, and uses this threshold to
## classify the training and development set
def word_length_threshold(training_file, development_file):
    """Pick the best word-length threshold on training data and evaluate it.

    Every threshold from 4 to 10 (inclusive) is tried on the training set;
    a word is predicted complex when its length is >= the threshold. The
    threshold with the highest training f-score is kept (the smallest one
    wins ties) and then applied to the development set.

    Returns:
        (training_performance, development_performance), each a list
        [precision, recall, fscore].
    """
    words, labels = load_file(training_file)
    best_fscore = 0
    best_length = 4
    # Initialise these so they are always bound: if no threshold reaches an
    # f-score above 0 there are no true positives, so precision and recall
    # are 0 as well.
    best_precision = 0
    best_recall = 0

    for threshold in range(4, 11):
        pred = [row[0] for row in length_threshold_feature(words, threshold)]

        tfscore = get_fscore(pred, labels)
        if tfscore > best_fscore:
            best_fscore = tfscore
            best_length = threshold
            # Only needed for the winning threshold; an f-score above 0
            # guarantees at least one true positive, so both are defined.
            best_precision = get_precision(pred, labels)
            best_recall = get_recall(pred, labels)

    dev_words, dev_labels = load_file(development_file)
    dev_pred = [row[0] for row in length_threshold_feature(dev_words, best_length)]

    dprecision = get_precision(dev_pred, dev_labels)
    drecall = get_recall(dev_pred, dev_labels)
    dfscore = get_fscore(dev_pred, dev_labels)

    training_performance = [best_precision, best_recall, best_fscore]
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

### 2.3: Naive Bayes

In [None]:
## Trains a Naive Bayes classifier using word length, syllable count, vowel count and frequency features
from syllables import count_syllables

# Characters that getFeatures counts as vowels ("y" is not included).
vowels = "aeiou"
# NOTE(review): this module-level dict is not read or modified by any code
# visible here -- presumably a leftover; confirm before removing.
wordFreq = {}

def getFeatures(word, counts):
    """Build the feature list for a single word.

    Args:
        word: the word to describe.
        counts: mapping looked up with counts.get(word, 0) for the frequency.

    Returns:
        [length, syllable count, vowel count, frequency], where the syllable
        count comes from count_syllables and the vowel count is the number of
        characters of word found in the module-level vowels string.
    """
    length = len(word)
    syllables = count_syllables(word)
    vowelCount = sum(1 for letter in word if letter in vowels)
    frequency = counts.get(word, 0)  # words missing from counts get 0
    return [length, syllables, vowelCount, frequency]

def naive_bayes(training_file, development_file, counts):
    """Train a categorical Naive Bayes classifier and score it on the development set.

    Each word is described by the list returned from getFeatures, and every
    feature value is treated as a discrete category. Class priors and
    per-feature likelihoods are estimated from the training file with add-one
    (Laplace) smoothing, and scores are accumulated as log-probabilities so
    that an unseen feature value never zeroes out a class and long products
    cannot underflow.

    Args:
        training_file: path handed to load_file for the training data.
        development_file: path handed to load_file for the development data.
        counts: mapping handed through to getFeatures.

    Returns:
        (precision, recall, fscore) measured on the development set.
    """
    training_words, training_labels = load_file(training_file)
    development_words, development_labels = load_file(development_file)

    tFeatures = [getFeatures(word, counts) for word in training_words]

    # classCounts[c]: number of training words in class c (1 = complex, 0 = simple).
    classCounts = {0: 0, 1: 0}
    # featureCounts[c][(position, value)]: how often that value of the feature
    # at that position occurs together with class c.
    featureCounts = {0: defaultdict(int), 1: defaultdict(int)}
    # distinctValues[position]: every value seen for that feature in training;
    # its size is the number of categories used for smoothing.
    distinctValues = defaultdict(set)
    for features, label in zip(tFeatures, training_labels):
        cls = 1 if label == 1 else 0  # anything other than 1 counts as simple
        classCounts[cls] += 1
        for position, value in enumerate(features):
            featureCounts[cls][(position, value)] += 1
            distinctValues[position].add(value)

    totalWords = classCounts[0] + classCounts[1]
    # Smoothed log priors; the +1 / +2 keeps log() defined even when a class
    # never occurs in the training data.
    logPrior = {
        cls: math.log((classCounts[cls] + 1) / (totalWords + 2))
        for cls in (0, 1)
    }

    def logLikelihood(cls, position, value):
        """log P(feature at position == value | cls) with add-one smoothing."""
        seen = featureCounts[cls].get((position, value), 0)
        # +1 reserves one extra category for values never seen in training.
        categories = len(distinctValues.get(position, ())) + 1
        return math.log((seen + 1) / (classCounts[cls] + categories))

    def classify(features):
        """Return 1 (complex) if the complex score is strictly higher, else 0."""
        scoreComplex = logPrior[1]
        scoreSimple = logPrior[0]
        for position, value in enumerate(features):
            scoreComplex += logLikelihood(1, position, value)
            scoreSimple += logLikelihood(0, position, value)
        return 1 if scoreComplex > scoreSimple else 0

    # NOTE(review): the raw frequency is treated as a category like the other
    # features; if it is mostly unique per word it carries little signal --
    # consider binning it.
    trainingPred = [classify(features) for features in tFeatures]
    devPred = [classify(getFeatures(word, counts)) for word in development_words]

    tprecision = get_precision(trainingPred, training_labels)
    trecall = get_recall(trainingPred, training_labels)
    tfscore = get_fscore(trainingPred, training_labels)

    dprecision = get_precision(devPred, development_labels)
    drecall = get_recall(devPred, development_labels)
    dfscore = get_fscore(devPred, development_labels)

    # Training performance is computed for inspection; only the development
    # performance is returned.
    training_performance = (tprecision, trecall, tfscore)
    development_performance = (dprecision, drecall, dfscore)
    return development_performance

### 2.4: Logistic Regression

In [None]:
def logistic_regression(training_file, development_file, counts):
    """Train a logistic-regression classifier and score it on the development set.

    Words are described by the list returned from getFeatures. The features
    are standardized with the mean and standard deviation of the training
    set, a bias column is added, and the weights are fitted with batch
    gradient descent on the mean log-loss starting from zero weights, so the
    result is deterministic.

    Args:
        training_file: path handed to load_file for the training data.
        development_file: path handed to load_file for the development data.
        counts: mapping handed through to getFeatures.

    Returns:
        (precision, recall, fscore) measured on the development set.

    Raises:
        ValueError: if the training file contains no labelled words.
    """
    training_words, training_labels = load_file(training_file)
    development_words, development_labels = load_file(development_file)
    if not training_words:
        raise ValueError("training file contains no labelled words")

    trainX = np.array([getFeatures(word, counts) for word in training_words], dtype=float)
    # assumes labels are 0/1 -- TODO confirm against the data files
    trainY = np.array(training_labels, dtype=float)
    # reshape keeps a (0, n_features) matrix when the development set is empty
    devX = np.array(
        [getFeatures(word, counts) for word in development_words], dtype=float
    ).reshape(-1, trainX.shape[1])

    # Use training statistics only, so the development data does not
    # influence the scaling.
    mean = trainX.mean(axis=0)
    std = trainX.std(axis=0)
    std[std == 0] = 1.0  # constant feature: avoid dividing by zero

    def toDesignMatrix(rawFeatures):
        """Standardize the features and prepend a column of ones for the bias."""
        scaled = (rawFeatures - mean) / std
        return np.hstack([np.ones((scaled.shape[0], 1)), scaled])

    def sigmoid(z):
        # Clip so np.exp cannot overflow for extreme scores.
        return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

    trainDesign = toDesignMatrix(trainX)
    devDesign = toDesignMatrix(devX)

    weights = np.zeros(trainDesign.shape[1])
    learningRate = 0.1
    iterations = 1000
    for _ in range(iterations):
        error = sigmoid(trainDesign.dot(weights)) - trainY
        weights -= learningRate * trainDesign.T.dot(error) / len(trainY)

    def predict(design):
        """Label a row 1 (complex) when its predicted probability is >= 0.5."""
        return [1 if p >= 0.5 else 0 for p in sigmoid(design.dot(weights))]

    trainingPred = predict(trainDesign)
    devPred = predict(devDesign)

    tprecision = get_precision(trainingPred, training_labels)
    trecall = get_recall(trainingPred, training_labels)
    tfscore = get_fscore(trainingPred, training_labels)

    dprecision = get_precision(devPred, development_labels)
    drecall = get_recall(devPred, development_labels)
    dfscore = get_fscore(devPred, development_labels)

    # Training performance is computed for inspection; only the development
    # performance is returned.
    training_performance = (tprecision, trecall, tfscore)
    development_performance = (dprecision, drecall, dfscore)
    return development_performance

### 2.7: Build your own classifier

In [None]:
if __name__ == "__main__":
    # Relative paths to the training, development and (per its file name,
    # unlabeled) test data.
    training_file = "data/complex_words_training.txt"
    development_file = "data/complex_words_development.txt"
    test_file = "data/complex_words_test_unlabeled.txt"

    # load_file returns a (words, labels) pair, so train_data is a 2-tuple.
    train_data = load_file(training_file)