[Ordinal regression](https://en.wikipedia.org/wiki/Ordinal_regression) is a classification method for categories on an ordinal scale -- e.g. [1, 2, 3, 4, 5] or [G, PG, PG-13, R].  This notebook implements ordinal regression using the method of [Frank and Hal 2001](https://www.cs.waikato.ac.nz/~eibe/pubs/ordinal_tech_report.pdf), which transforms a k-multiclass classifier into k-1 binary classifiers (each of which predicts whether a data point is above a threshold in the ordinal scale -- e.g., whether a movie is "higher" than PG).  This method can be used with any binary classification method that outputs probabilities; here L2-regularizaed binary logistic regression is used.

This notebook trains a model (on `train.txt`), optimizes L2 regularization strength on `dev.txt`, and evaluates performance on `test.txt`.  Reports test accuracy with 95% confidence intervals.

In [14]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
from scipy.stats import norm

In [15]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15527\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
trainingFile = "../Data/train.txt"
devFile = "../Data/dev.txt"
testFile = "../Data/test.txt"
    
# ordinal values must be in order *as strings* from smallest to largest, e.g.:
# ordinal_values=["G", "PG", "PG-13", "R"]

ordinal_values=["1", "2", "3", "4"]

In [17]:
def load_ordinal_data(filename, ordering):
    """
    Split the input file into X and Y component
    Input
        filename(str)
        ordering(list of str)
    Output
        X(list): list of all the questions
        Y(list): One-hot encoding of Y
        orig_Y: list of all the labels
    """
    X = []
    Y = []
    orig_Y=[]
    for _ in ordering:
        Y.append([])
        
    with open(filename, encoding="utf-8") as file:
        # for each row
        for line in file:
            cols = line.split("\t")
            label = cols[2].lstrip().rstrip()
            text = cols[3]
            
            X.append(text)
            orig_Y.append(label)

            index=ordering.index(label)
            for i in range(len(ordering)):
                if index > i:
                    Y[i].append(1)
                else:
                    Y[i].append(0)
                    
    return X, Y, orig_Y

In [46]:
class OrdinalClassifier:

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY
        
        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY
        
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    # Featurize entire dataset
    def featurize(self, data):
        featurized_data = []
        for i, text in enumerate(data):
            feats = self.feature_method(i, text)
            featurized_data.append(feats)
        return featurized_data

    # Read dataset and returned featurized representation as sparse matrix + label array
    def process(self, X_data, training = False):
        
        data = self.featurize(X_data)

        if training:
            fid = 0
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat]+= 1

            for feat in feature_doc_count:
                if feature_doc_count[feat] >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat in feats:
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = feats[feat]

        return X


    def train(self):
        (D,F) = self.trainX.shape
        # fit the model for each level??
        for idx, ordinal_value in enumerate(self.ordinal_values[:-1]):
            best_dev_accuracy=0
            best_model=None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                # score is define as R^2
                training_accuracy = log_reg.score(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                if development_accuracy > best_dev_accuracy:
                    best_dev_accuracy=development_accuracy
                    best_model=log_reg
                # print("Method: combiner function, C: %s, Features: %s, Train score: %.3f, Dev score: %.3f" % 
                #       (C, F, training_accuracy, development_accuracy))
                # print("Method: combiner function, C: %s, Features: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % 
                #       (C, F, sum(log_reg.predict(self.trainX)==self.trainY[idx])/500, sum(log_reg.predict(self.devX)==self.devY[idx])/100))
            self.log_regs[idx]=best_model
        
    def test(self):
        cor=tot=0
        counts=Counter()
        preds=[None]*(len(self.ordinal_values)-1)
        for idx, ordinal_value in enumerate(self.ordinal_values[:-1]):
            preds[idx]=self.log_regs[idx].predict_proba(self.testX)[:,1]
        
        preds=np.array(preds)
        print(len(preds))
        for data_point in range(len(preds[0])):
            ordinal_preds=np.zeros(len(self.ordinal_values))
            for ordinal in range(len(self.ordinal_values)-1):
                if ordinal == 0:
                    ordinal_preds[ordinal]=1-preds[ordinal][data_point]
                else:
                    ordinal_preds[ordinal]=preds[ordinal-1][data_point]-preds[ordinal][data_point]

            ordinal_preds[len(self.ordinal_values)-1]=preds[len(preds)-1][data_point]

            prediction=np.argmax(ordinal_preds)
            counts[prediction]+=1
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor+=1
            tot+=1

        return cor/tot

In [33]:
def confidence_intervals(accuracy, n, significance_level):
    critical_value=(1-significance_level)/2
    z_alpha=-1*norm.ppf(critical_value)
    se=math.sqrt((accuracy*(1-accuracy))/n)
    return accuracy-(se*z_alpha), accuracy+(se*z_alpha)

In [34]:
def run(trainingFile, devFile, testFile, ordinal_values):
    trainX, trainY, orig_trainY=load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY=load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY=load_ordinal_data(testFile, ordinal_values)
    simple_classifier = OrdinalClassifier(ordinal_values, combiner_function, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY)
    simple_classifier.train()
    accuracy=simple_classifier.test()

    lower, upper=confidence_intervals(accuracy, len(testY[0]), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

### Different features

In [35]:
def bow_featurize(text):
    feats = {}
    words = nltk.word_tokenize(text)
    for word in words:
        word=word.lower()
        if word in feats:
            feats[word] += 1
        else:
            feats[word] = 1
    return feats

In [36]:
# No effect

from afinn import Afinn
def afinn_sentiment(text):
    # Here the `feats` dict should contain the features -- the key should be the feature name,
    # and the value is the feature value.  See `simple_featurize` for an example.

    feats = {}
    # BEGIN SOLUTION
    afinn = Afinn()
    sentences = nltk.sent_tokenize(text)
    for i, sentence in enumerate(sentences):
      feats[i] = afinn.score(sentence)

    # END SOLUTION
    return feats

In [37]:
from collections import Counter

# No effect
def bigram(text):
    # Here the `feats` dict should contain the features -- the key should be the feature name,
    # and the value is the feature value.  See `simple_featurize` for an example.

    feats = {}
    words = nltk.word_tokenize(text)
    trigrams = [' '.join(tg) for tg in list(nltk.bigrams(words))]
    feats = dict(Counter(trigrams))
    return feats

In [38]:
# No effect
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

def Vader_sentiment(text):
    feats = {}
    feats["pos"] = 0
    feats["neg"] = 0
    feats["neu"] = 0
    feats["compound"] = 0
    sentences = nltk.sent_tokenize(text)
    sia = SentimentIntensityAnalyzer()
    for sentence in sentences:
        score = sia.polarity_scores(sentence)
        feats["pos"] += score["pos"]
        feats["neg"] += score ["neg"]
        feats["neu"] += score ["neu"]
        feats["compound"] += score ["compound"]
    return feats

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\15527\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [39]:
# Result in a lower accuracy
import liwc

# import requests
# url = 'https://raw.githubusercontent.com/chun-hu/conversation-modeling/master/LIWC2007_English100131.dic'
# r = requests.get(url)
# with open('LIWC2007_English100131.dic', 'wb') as f:
#     f.write(r.content)


def liwc_pos_type(text):
    # Here the `feats` dict should contain the features -- the key should be the feature name,
    # and the value is the feature value.  See `simple_featurize` for an example.

    feats = {}
    words = nltk.word_tokenize(text)
    dictionary, category_names = liwc.read_dic("LIWC2007_English100131.dic")

    for word in words:
        word=word.lower()
        if word in dictionary:
            for category in dictionary[word]:
                if category in feats:
                    feats[category] += 1
                else:
                    feats[category] = 1

    return feats

In [40]:
# No effect
def question_word_diction(text):
        question_word_list = ['what', 'where', 'when','how','why','did','do','does','have','has','am','is','are','can','could','may','would','will','should'
"didn't","doesn't","haven't","isn't","aren't","can't","couldn't","wouldn't","won't","shouldn't",'?']
        feats = {}
        words = nltk.word_tokenize(text)
        for word in words:
            if word.lower() in feats:
                  feats[word.lower()] += 1
            elif word.lower() in question_word_list:
                  feats[word.lower()] = 1
        return feats

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(documents):
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the documents
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Get the feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Dictionary to store TF-IDF for each document
    tfidf_scores = {}

    # Extract TF-IDF scores for each document
    for doc_idx, doc_vector in enumerate(tfidf_matrix):
        doc_dict = {}
        for word_idx, score in zip(doc_vector.indices, doc_vector.data):
            doc_dict[feature_names[word_idx]] = score
        tfidf_scores[f"Document_{doc_idx}"] = doc_dict

    return tfidf_scores

# Example usage
trainX, _, _ = load_ordinal_data(trainingFile, ["1", "2", "3", "4"])
tfidf_result = compute_tfidf(trainX)
print(tfidf_result)

{'Document_0': {'using': 0.2157110727879763, 'already': 0.20666461530635633, 'that': 0.06442433283282548, 'non': 0.22737398494175284, 'or': 0.09797338437461499, 'government': 0.48762387276466035, 'payers': 0.24381193638233017, 'any': 0.12338843318314066, 'seeing': 0.14141643063602855, 'you': 0.05343456933968377, 'are': 0.18852232185788156, 'clear': 0.22737398494175284, 'be': 0.11413460452532664, 'want': 0.19927312134739897, 'just': 0.10084983703901036, 'guess': 0.1447968024889173, 'criteria': 0.48762387276466035, 'the': 0.1061320249382119, 'to': 0.15364429372064572, 'moving': 0.20666461530635633, 'then': 0.13539933868864062, 'and': 0.07215712385963219}, 'Document_1': {'well': 0.1856200255254132, 'as': 0.11504153417212735, 'outlook': 0.2098124293477866, 'seasonal': 0.24752556019584515, 'than': 0.2098124293477866, 'also': 0.19596374459629926, 'americas': 0.24752556019584515, 'by': 0.1856200255254132, 'driven': 0.24752556019584515, 'quarter': 0.12856226119781258, 'ts': 0.24752556019584515

In [42]:
def combiner_function(i, text):

    # Here the `all_feats` dict should contain the features -- the key should be the feature name,
    # and the value is the feature value.  See `simple_featurize` for an example.
    # at the moment, all 4 of: bag of words and your 3 original features are handed off to the combined model
    # update the values within [bag_of_words, feature1, feature2, feature3] to change this.

    all_feats={}
    for feature in [bow_featurize]:
        all_feats.update(feature(text))
        # all_feats.update(tfidf_result[f"Document_{i}"])
    return all_feats

In [47]:
run(trainingFile, devFile, testFile, ordinal_values)

3
Test accuracy for best dev model: 0.570, 95% CIs: [0.473 0.667]

