In [5]:
import pandas as pd 
import numpy as np 

#### 1. Evaluation Metrics ####

In [6]:
## Input: y_pred, a list of length n with the predicted labels,
## y_true, a list of length n with the true labels
#                Pred
#              |   0   |   1
# true    0    |  TN   |  FP
#         1    |  FN   |  TP

## Calculates the precision of the predicted labels
def get_precision(y_pred, y_true):
    """Return the precision TP / (TP + FP) of binary predictions.

    y_pred and y_true are parallel sequences of 0/1 labels. When nothing
    is counted as a true or false positive the result is 0.
    """
    tp = 0
    fp = 0
    for predicted, actual in zip(y_pred, y_true):
        if predicted != 1:
            continue  # only predicted positives contribute to precision
        if actual == 1:
            tp += 1
        elif actual == 0:
            fp += 1

    predicted_positive = tp + fp
    if predicted_positive == 0:  # no true or false positives were counted
        return 0
    return tp / predicted_positive
    
## Calculates the recall of the predicted labels
def get_recall(y_pred, y_true): #TP/(TP + FN)
    """Return the recall TP / (TP + FN) of binary predictions.

    y_pred and y_true are parallel sequences of 0/1 labels. When no true
    or false negatives/positives are counted (TP + FN == 0, e.g. there
    are no positive gold labels or the input is empty) the result is 0,
    mirroring how get_precision and get_fscore treat an undefined ratio.
    """
    truePos, falseNeg = 0, 0

    for predicted, actual in zip(y_pred, y_true):
        if actual != 1:
            continue  # only gold positives contribute to recall
        if predicted == 1:
            truePos += 1
        elif predicted == 0:
            falseNeg += 1

    actual_positive = truePos + falseNeg
    if actual_positive == 0:  # avoid dividing by zero when recall is undefined
        return 0
    return truePos / actual_positive

## Calculates the f-score of the predicted labels
def get_fscore(y_pred, y_true): #TP/(TP + 1/2(FP + FN))
    """Return the F-score TP / (TP + 0.5 * (FP + FN)) of binary predictions.

    y_pred and y_true are parallel sequences of 0/1 labels. The result is
    0 whenever there are no true positives.
    """
    pairs = list(zip(y_pred, y_true))
    tp = sum(1 for predicted, actual in pairs if predicted == 1 and actual == 1)
    fp = sum(1 for predicted, actual in pairs if predicted == 1 and actual == 0)
    fn = sum(1 for predicted, actual in pairs if predicted == 0 and actual == 1)

    if tp == 0:
        return 0
    return tp / (tp + 0.5 * (fp + fn))

#### 2. Complex Word Identification ####

In [7]:
def load_file(data_file):
    """Read a tab-separated word/label file, skipping its first line.

    The first line is treated as a header and ignored. For every other
    non-empty line the first column is lower-cased and collected as the
    word, and the second column is parsed with int() as the label.

    Returns:
        (words, labels): two parallel lists.
    """
    words = []
    labels = []
    with open(data_file, 'rt', encoding="utf8") as f:
        next(f, None)  # drop the header row; an empty file yields no rows
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat the label of a final line that has
            # no trailing newline.
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank lines, e.g. a trailing empty line
            line_split = line.split("\t")
            words.append(line_split[0].lower())
            labels.append(int(line_split[1]))
    return words, labels

## Makes feature matrix for all complex
def all_complex_feature(words):
    """Return one single-element feature row, [1], for every word."""
    return [[1] for _ in words]

## Labels every word complex
def all_complex(data_file):
    """Score the baseline that labels every word in data_file as complex (1).

    Returns:
        [precision, recall, fscore] of the all-ones prediction against the
        labels loaded from data_file.
    """
    words, gold = load_file(data_file)
    always_complex = [1 for _ in words]

    return [
        get_precision(always_complex, gold),
        get_recall(always_complex, gold),
        get_fscore(always_complex, gold),
    ]


# ### 2.2: Word length thresholding

## Makes feature matrix for word_length_threshold
def length_threshold_feature(words, threshold):
    """Return [1] for each word whose length is >= threshold, else [0]."""
    return [[1] if len(word) >= threshold else [0] for word in words]

## Finds the best length threshold by f-score, and uses this threshold to
## classify the training and development set
def word_length_threshold(training_file, development_file):
    """Pick the word-length threshold with the best training F-score and
    evaluate it on both data sets.

    Thresholds 4 through 10 are tried on the training file; a word is
    predicted complex when its length is >= the threshold. The first
    threshold reaching the highest F-score wins (ties keep the smaller
    threshold). That threshold is then applied to the development file.

    Returns:
        (training_performance, development_performance), each a list
        [precision, recall, fscore].
    """
    words, labels = load_file(training_file)

    # Start from zeros so the result is well defined even when no threshold
    # scores above 0: an F-score of 0 means there were no true positives,
    # so precision and recall are 0 as well. Threshold 4 is kept as the
    # fallback in that case.
    best_length = 4
    best_precision, best_recall, best_fscore = 0, 0, 0

    for threshold in range(4, 11):
        pred = [row[0] for row in length_threshold_feature(words, threshold)]
        tfscore = get_fscore(pred, labels)
        if tfscore > best_fscore:
            best_length = threshold
            best_fscore = tfscore
            # Precision/recall are only needed for a new best threshold.
            best_precision = get_precision(pred, labels)
            best_recall = get_recall(pred, labels)

    dev_words, dev_labels = load_file(development_file)
    dev_pred = [row[0] for row in length_threshold_feature(dev_words, best_length)]

    training_performance = [best_precision, best_recall, best_fscore]
    development_performance = [
        get_precision(dev_pred, dev_labels),
        get_recall(dev_pred, dev_labels),
        get_fscore(dev_pred, dev_labels),
    ]
    return training_performance, development_performance

### 2.3: Naive Bayes

In [29]:
## Trains a Naive Bayes classifier using the getFeatures features (length, syllables, vowel / 'y' / non-vowel counts); word frequency is currently disabled
from syllables import count_syllables

vowels = "aeiou"
def getFeatures(word, counts):
    """Build the numeric feature row for a single word.

    Row layout, in order:
        0. number of characters in the word
        1. count_syllables(word)
        2. number of characters that are in `vowels`
        3. number of 'y' characters
        4. number of characters that are not in `vowels`

    `counts` is accepted but not read: the word-frequency feature (which
    was capped at 10) is disabled.
    """
    return [
        len(word),
        count_syllables(word),
        sum(1 for ch in word if ch in vowels),
        word.count('y'),
        sum(1 for ch in word if ch not in vowels),
    ]

counts = {}
def simpleCounts(training_file):
    """Return a dict mapping each word in training_file to its number of occurrences."""
    words, _ = load_file(training_file)
    tally = {}
    for word in words:
        if word in tally:
            tally[word] += 1
        else:
            tally[word] = 1
    return tally

def _nb_ratio(count, total):
    """Return count / total, or 0.0 when total is 0 (class absent from training)."""
    return count / total if total else 0.0


def _nb_predict(words, counts, featureProb, probComplex, probSimple):
    """Label each word 1 (complex) or 0 (simple) by comparing the two class scores."""
    predictions = []
    for word in words:
        scoreComplex = probComplex
        scoreSimple = probSimple

        for feature_idx, featureValue in enumerate(getFeatures(word, counts)):
            seen = featureProb.get(feature_idx, {}).get(featureValue)
            if seen is None:
                # Feature value never observed in training: apply the same
                # tiny factor to both classes.
                prob_complex = prob_simple = 1e-6
            else:
                prob_complex = seen['complex']
                prob_simple = seen['simple']

            # Multiply probabilities (Naive Bayes independence assumption)
            scoreComplex *= prob_complex
            scoreSimple *= prob_simple

        predictions.append(1 if scoreComplex > scoreSimple else 0)
    return predictions


def naive_bayes(training_file, development_file, counts):
    """Train a count-based Naive Bayes classifier and score it.

    Each feature position/value pair from getFeatures is counted per class
    on the training file and divided by that class's number of training
    words; class priors are the class shares of the training labels. A
    word is predicted complex (1) when its complex score is strictly
    greater than its simple score, otherwise simple (0).

    NOTE(review): no smoothing is applied to values seen in training, so a
    value observed for only one class gets probability 0 for the other
    class and zeroes that class's score.

    Args:
        training_file: path of the labelled training file.
        development_file: path of the labelled development file.
        counts: passed through to getFeatures.

    Returns:
        (precision, recall, fscore) on the development set. Training-set
        metrics are computed as well but are not returned.

    Raises:
        ValueError: if the training file contains no labelled examples.
    """
    training_words, training_labels = load_file(training_file)
    development_words, development_labels = load_file(development_file)

    if not training_labels:
        raise ValueError("training file contains no labelled examples")

    # Count feature occurrences per class
    featureProb = {}
    for word, label in zip(training_words, training_labels):
        class_key = 'complex' if label == 1 else 'simple'
        for idx, value in enumerate(getFeatures(word, counts)):
            by_value = featureProb.setdefault(idx, {})
            bucket = by_value.setdefault(value, {'complex': 0, 'simple': 0})
            bucket[class_key] += 1

    # Class sizes and priors
    total_complex_words = sum(1 for label in training_labels if label == 1)
    total_simple_words = sum(1 for label in training_labels if label == 0)
    probComplex = total_complex_words / len(training_labels)
    probSimple = total_simple_words / len(training_labels)

    # Convert feature counts to per-class probabilities. _nb_ratio keeps this
    # from dividing by zero when one class never occurs in the training data.
    for by_value in featureProb.values():
        for bucket in by_value.values():
            bucket['complex'] = _nb_ratio(bucket['complex'], total_complex_words)
            bucket['simple'] = _nb_ratio(bucket['simple'], total_simple_words)

    devPred = _nb_predict(development_words, counts, featureProb, probComplex, probSimple)
    trainingPred = _nb_predict(training_words, counts, featureProb, probComplex, probSimple)

    # Calculate performance metrics
    tprecision = get_precision(trainingPred, training_labels)
    trecall = get_recall(trainingPred, training_labels)
    tfscore = get_fscore(trainingPred, training_labels)

    dprecision = get_precision(devPred, development_labels)
    drecall = get_recall(devPred, development_labels)
    dfscore = get_fscore(devPred, development_labels)

    # Only the development metrics are handed back to the caller.
    training_performance = (tprecision, trecall, tfscore)
    development_performance = (dprecision, drecall, dfscore)
    return development_performance

I removed the word-frequency feature because of the high numbers I was getting. It was also causing my logistic regression model to give me a recall score of 99%.

In [None]:
# Data files used by every model cell below.
train = 'complex_words_training.txt'
dev = 'complex_words_development.txt'
test = 'complex_words_test_unlabeled.txt'

counts = simpleCounts(train)
results = naive_bayes(train, dev, counts)

# naive_bayes returns the development-set (precision, recall, f-score), so the
# lines are labelled "Development - ..." like the Random Forest cell.
print("Naive Bayes Results:")
print(f"Development - Precision: {results[0]:.3f}")
print(f"Development - Recall: {results[1]:.3f}")
print(f"Development - F-score: {results[2]:.3f}")

Naive Bayes Results:
Development - Precision: 0.685
Development - Recall: 0.730
Development - F-score: 0.707


### 2.4: Logistic Regression

In [31]:
## Trains a Logistic Regression classifier using the getFeatures features (length, syllables, vowel / 'y' / non-vowel counts); word frequency is currently disabled
from sklearn.linear_model import LogisticRegression
import numpy as np

def logistic_regression(training_file, development_file, counts):
    """Fit a scikit-learn LogisticRegression on getFeatures rows and score it.

    Args:
        training_file: path of the labelled training file.
        development_file: path of the labelled development file.
        counts: passed through to getFeatures.

    Returns:
        (precision, recall, fscore) on the development set. Training-set
        metrics are computed as well but are not returned.
    """
    training_words, training_labels = load_file(training_file)
    development_words, development_labels = load_file(development_file)

    # sklearn expects array inputs: one feature row per word
    X_train = np.array([getFeatures(word, counts) for word in training_words])
    X_dev = np.array([getFeatures(word, counts) for word in development_words])
    y_train = np.array(training_labels)

    # Fixed seed and a generous iteration cap
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)

    # Plain Python lists for the hand-written metric functions
    train_pred = model.predict(X_train).tolist()
    dev_pred = model.predict(X_dev).tolist()

    training_performance = (
        get_precision(train_pred, training_labels),
        get_recall(train_pred, training_labels),
        get_fscore(train_pred, training_labels),
    )
    development_performance = (
        get_precision(dev_pred, development_labels),
        get_recall(dev_pred, development_labels),
        get_fscore(dev_pred, development_labels),
    )
    return development_performance

In [None]:
logisticResults = logistic_regression(train, dev, counts)

# logistic_regression returns the development-set (precision, recall, f-score),
# so the lines are labelled "Development - ..." like the Random Forest cell.
print("Logistic Regression Results:")
print(f"Development - Precision: {logisticResults[0]:.3f}")
print(f"Development - Recall: {logisticResults[1]:.3f}")
print(f"Development - F-score: {logisticResults[2]:.3f}")

Logistic Regression Results:
Development - Precision: 0.725
Development - Recall: 0.629
Development - F-score: 0.673


### 2.7: Build your own classifier

I chose a random forest classifier as the model of my choice.
How it works: many decision trees are built, each tree sees a different sample of the data, and each tree uses a random subset of the features; the trees' predictions are then combined to come up with the final prediction.

In [None]:
## Trains a Random Forest classifier using length and syllable features
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def random_forest(training_file, development_file, counts):
    """Fit a scikit-learn RandomForestClassifier on getFeatures rows and score it.

    Args:
        training_file: path of the labelled training file.
        development_file: path of the labelled development file.
        counts: passed through to getFeatures.

    Returns:
        (precision, recall, fscore) on the development set. Training-set
        metrics are computed as well but are not returned.
    """
    training_words, training_labels = load_file(training_file)
    development_words, development_labels = load_file(development_file)

    # One feature row per word, for the training and development sets
    X_train = np.array([getFeatures(word, counts) for word in training_words])
    X_dev = np.array([getFeatures(word, counts) for word in development_words])
    y_train = np.array(training_labels)

    model = RandomForestClassifier(
        n_estimators=100,      # Number of trees
        random_state=42,
        max_depth=10,          # try to prevent overfitting
        min_samples_split=5,   # Minimum samples to split a node
        min_samples_leaf=2     # Minimum samples in a leaf
    )
    model.fit(X_train, y_train)

    # Plain Python lists for the hand-written metric functions
    train_pred = model.predict(X_train).tolist()
    dev_pred = model.predict(X_dev).tolist()

    # metrics
    training_performance = (
        get_precision(train_pred, training_labels),
        get_recall(train_pred, training_labels),
        get_fscore(train_pred, training_labels),
    )
    development_performance = (
        get_precision(dev_pred, development_labels),
        get_recall(dev_pred, development_labels),
        get_fscore(dev_pred, development_labels),
    )
    return development_performance

In [34]:
# Development-set (precision, recall, f-score) of the random forest model.
rf_performance = random_forest(train, dev, counts)

print("Random Forest Results:")
print(
    f"Development - Precision: {rf_performance[0]:.3f}\n"
    f"Development - Recall: {rf_performance[1]:.3f}\n"
    f"Development - F-score: {rf_performance[2]:.3f}"
)

Random Forest Results:
Development - Precision: 0.702
Development - Recall: 0.648
Development - F-score: 0.674


BEST PRECISION: LOGISTIC REGRESSION WITH *0.725*

BEST RECALL: NAIVE BAYES WITH *0.730* 

BEST F-SCORE: NAIVE BAYES WITH *0.707*

Naive Bayes, which uses probability-based classification, proved to be the best of the three models, with the highest recall and F-score.

Features used:
- Word length
- Syllable count
- Vowel count
- Count of the letter "y"
- Count of non-vowel characters
- Word frequency (I removed it since it wasn't helping my models)