In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Fixed seed for NumPy's global RNG so the random essay split drawn with
# np.random.choice further down is reproducible across runs.
SEED = 4121995

np.random.seed(SEED)

# Introduction

The approach I'll use in this notebook is to build the bare minimum training and validation set and an algorithm maximizing the likelihood of an n-gram of sentences (or words) being one of the discourse types, using a RandomForestClassifier or a NaiveBayesClassifier on TFIDF or CountVectorized datasets.

#### I'll do this without exploring the dataset, as I'm more interested in making a validation pipeline and an algorithm that I have in mind.

# Creating training and validation set

In [None]:
# Load the competition's train.csv annotations and preview the first rows.
train_df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
train_df.head()

I'll make a validation set with a number of essays that is equal to the number of essays in the test set.

In [None]:
# Count the files in the test essay folder.
!ls '/kaggle/input/feedback-prize-2021/test' | wc -l

In [None]:
# Count the files in the train essay folder.
!ls '/kaggle/input/feedback-prize-2021/train' | wc -l

The training data has 15594 essays, which means that we can possibly spare more than 5 essays. But I don't know if we should.

The regular approach would be to split the dataset into an 80/20 training/validation split. <s>but until now this notebook hasn't made any rigorous algorithm, so I'll stick with 5 essays just to make it run.</s> 

<s>After I make sure that the pipeline is working, I can look into making a robust validation set that reflects public leaderboard score, and indicates how well we can do in the private dataset.</s>

Evaluating one essay takes around 113 ms, so in order to minimize waiting and still be able to evaluate results, I'll use around 50 essays so that evaluation doesn't exceed 10 seconds.

In [None]:
def split_essays(train_df, n):
    """Split an annotation frame into training and validation parts by essay id.

    All rows of an essay end up on the same side of the split.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with an ``id`` column identifying the essay of each row.
    n : int or float
        Number of essays to hold out. A float is interpreted as a fraction of
        the distinct essay ids and truncated to an integer.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_part, val_part)``. Both are independent copies, so adding
        columns to them later neither touches the input frame nor triggers
        pandas' SettingWithCopyWarning.

    Notes
    -----
    Sampling is done without replacement through NumPy's global RNG
    (``np.random.choice``), so seed it beforehand for reproducibility.
    """
    essay_ids = train_df.id.unique()
    if isinstance(n, float):
        n = int(len(essay_ids) * n)

    val_ids = np.random.choice(essay_ids, n, False)
    is_val = train_df.id.isin(val_ids)

    # .copy() detaches the two parts from the parent frame.
    return train_df[~is_val].copy(), train_df[is_val].copy()

In [None]:
# Hold out 100 randomly chosen essays for validation. train_df is rebound to the
# remaining essays, so re-running this cell removes another 100 essays from it.
train_df, val_df = split_essays(train_df, 100)
train_df.shape, val_df.shape

Now of course I don't want to use the whole training dataset right now, so I'll make another function to sample a small part for development purposes. I can use the previous function that I made, but I'll discard the training set it creates.

In [None]:
# Optional subsample for faster development (currently disabled):
# _, dev_df = split_essays(train_df, 10000)
# dev_df is bound to the same object as train_df (no copy is made), so columns
# added to dev_df later are also visible through train_df.
dev_df = train_df
dev_df.shape

One thing that we need to add to the training set is paragraphs or blocks of text which have no classification; we should then add a label to them in order to train the classifier to also predict gaps in the text.

In [None]:
# These functions are inspired by this amazing notebook 
# https://www.kaggle.com/erikbruin/nlp-on-student-writing-eda

def get_unique_ids(df):
    """Return the distinct values of the ``id`` column of ``df``, in order of first appearance."""
    essay_ids = df['id'].unique()
    return essay_ids

def filter_essay(df, essay_id):
    """Return the rows of ``df`` whose ``id`` equals ``essay_id``, re-indexed from 0."""
    selected = df.query('id == @essay_id')
    selected = selected.reset_index(drop=True)
    return selected

def read_essay_txt(essay_id, path='train'):
    """Read and return the full text of one essay.

    ``path`` is the sub-folder of the competition input directory
    ('train' by default); the file is ``<essay_id>.txt`` inside it.
    """
    with open(f"../input/feedback-prize-2021/{path}/{essay_id}.txt", 'r') as handle:
        essay_txt = handle.read()
    return essay_txt
        
def add_gap_rows_essay(df, essay_id, path):
    """Return one essay's annotations with 'Gap' rows for unannotated stretches.

    For every pair of consecutive rows (in the order they appear in ``df``) a
    gap row is created when the previous ``discourse_end`` is neither equal to
    the current ``discourse_start`` nor directly before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation frame with the columns ``id``, ``discourse_id``,
        ``discourse_start``, ``discourse_end`` and ``predictionstring``.
    essay_id : str
        Essay to process.
    path : str
        Sub-folder ('train' or 'test') the essay text is read from.

    Returns
    -------
    pandas.DataFrame
        The essay's original rows plus the gap rows, sorted by
        ``discourse_start`` and re-indexed from 0. Gap rows carry
        ``discourse_type == 'Gap'`` and ``discourse_type_num == 'Gap'`` and
        reuse the ``discourse_id`` of the discourse that follows the gap.
    """
    essay_df = filter_essay(df, essay_id)
    essay_txt = read_essay_txt(essay_id, path)

    # Collect gap rows first and append them in one go: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic anyway.
    gap_rows = []

    # filter_essay re-indexes from 0, so positions and labels coincide here.
    for index in range(1, len(essay_df)):
        row = essay_df.loc[index]
        previous_row = essay_df.loc[index - 1]

        current_discourse_start = int(row['discourse_start'])
        previous_discourse_end = int(previous_row['discourse_end'])

        # Touching or directly adjacent discourses leave no gap between them.
        if previous_discourse_end in (current_discourse_start - 1, current_discourse_start):
            continue

        current_predstring_first_token = int(row['predictionstring'].split()[0])
        previous_predstring_last_token = int(previous_row['predictionstring'].split()[-1])

        # Token indices strictly between the two annotated spans (may be empty).
        gap_tokens = range(previous_predstring_last_token + 1, current_predstring_first_token)

        gap_rows.append({
            'id': row['id'],
            'discourse_id': row['discourse_id'],
            'discourse_start': previous_discourse_end + 1,
            'discourse_end': current_discourse_start - 1,
            'discourse_text': essay_txt[previous_discourse_end + 1: current_discourse_start],
            'discourse_type': 'Gap',
            'discourse_type_num': 'Gap',
            'predictionstring': ' '.join(str(token) for token in gap_tokens),
        })

    if gap_rows:
        essay_df = pd.concat([essay_df, pd.DataFrame(gap_rows)], axis=0, ignore_index=True)

    return essay_df.sort_values('discourse_start').reset_index(drop=True)

def add_gap_rows_df(df, path):
    """Apply ``add_gap_rows_essay`` to every essay in ``df`` and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation frame containing one or more essays.
    path : str
        Sub-folder ('train' or 'test') the essay texts are read from.

    Returns
    -------
    pandas.DataFrame or None
        All essays (with their gap rows) concatenated with a fresh 0..n-1
        index, or ``None`` when ``df`` contains no essay ids.
    """
    # Build every per-essay frame first and concatenate once; concatenating
    # inside the loop copies the accumulated frame on every iteration.
    essay_frames = [add_gap_rows_essay(df, essay_id, path) for essay_id in get_unique_ids(df)]

    if not essay_frames:
        return None

    return pd.concat(essay_frames, axis=0, ignore_index=True)
        

In [None]:
%%time

# Testing on one essay: the essay whose id appears in row 30 of dev_df
add_gap_rows_essay(dev_df, dev_df.id.values[30], 'train')

In [None]:
%%time

# dev_df = add_gap_rows_df(dev_df, 'train')

In [None]:
%%time 

# val_df = add_gap_rows_df(val_df, 'train')

It seems that gaps aren't always sentences, so I think that dropping gap lines that aren't full sentences or paragraphs could be beneficial as the model will always be passed TFIDF or Vectorized sentences, unless the approach is used but with maximizing the likelihood of ngrams of words.

# The Algorithm

What I have in mind is a simple algorithm that maximizes the likelihood of a TFIDF or Bag of Words representation of a sequence of sentences (or words) being one of the presented discourse types.

I don't know much about NLP, but this is the easiest way I can think about. 

For example, let's say that we have an essay: the algorithm would split this essay into sentences, then it would classify the first sentence alone and take its maximum prediction as a baseline.

The algorithm would then add another sentence and classify the two sentences together, and then add another and another. Hypothetically, since I will be training the simple model on blocks of discourse types, the likelihood would increase with increasing sentences, until it starts decreasing, and then we can select the block which maximized the likelihood of correct prediction according to the model.

Then the algorithm would iterate again from the sentence following the previously predicted block of text.

The core of this approach can be used with any complex model, but I don't know if it would be suitable or not, since probably deep learning models could be capable of more without this brute forcing attempt, but I could be wrong since I don't know much about NLP.

And of course this approach is naive, since it won't learn about the context of the essay, and hence won't be able to predict two evidence sentences following each other for example, and might consider them as one.

## Preprocessing the dataset

TFIDF or Bag of Words could be used, <s>but I'll use a simple Bag of Words right now.</s> and I'll use TFIDF.

The data provides two features regarding the position of the text, which are discourse_start and discourse_end. I want to use them, but I think it would be easier to use another feature, the start token, since my model uses Bag of Words, so it would be easier to implement it into the prediction pipeline of the algorithm.

In [None]:
# Add starting token feature
def _start_tokens(df):
    """Return the ``start_token`` feature for every row of ``df``.

    The value is 1 + the last token index of the previous discourse of the
    *same essay*; the first discourse of every essay gets 1. Shifting inside
    each ``id`` group keeps one essay's last token from leaking into the first
    row of the next essay.
    """
    last_token = df['predictionstring'].str.split().str[-1].astype(int)
    previous_last_token = last_token.groupby(df['id']).shift(1)
    return previous_last_token.fillna(0).astype(int) + 1


dev_df['start_token'] = _start_tokens(dev_df)
val_df['start_token'] = _start_tokens(val_df)

dev_df.head(5)

In [None]:
%%time

# Add full text length feature
train_essays = os.listdir('/kaggle/input/feedback-prize-2021/train/')
test_essays = os.listdir('/kaggle/input/feedback-prize-2021/test/')


def _essay_lengths(filenames, path):
    """Map essay id -> number of whitespace-separated tokens in its text.

    The id is the file name without its extension. ``os.path.splitext`` is used
    instead of ``str.rstrip('.txt')`` because rstrip removes any trailing run of
    the characters '.', 't' and 'x', which would also eat into an id ending in
    one of those characters.
    """
    lengths = {}
    for filename in filenames:
        essay_id = os.path.splitext(filename)[0]
        lengths[essay_id] = len(read_essay_txt(essay_id, path).split())
    return lengths


train_essays_length = _essay_lengths(train_essays, 'train')
test_essays_length = _essay_lengths(test_essays, 'test')

In [None]:
# Attach each essay's token count to its rows by looking the id up in the
# train length dictionary (the validation essays also live in the train folder).
dev_df['len_essay'] = dev_df['id'].map(train_essays_length)
val_df['len_essay'] = val_df['id'].map(train_essays_length)
dev_df.head(10)

Now I'll make a transformer which expects starting token feature and text, then it uses them to calculate the ending token.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD

class ExtraFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer producing relative-position features for a text block.

    Expects a frame with the columns ``start_token``, ``len_essay`` and
    ``discourse_text`` and returns a 2-D array with four columns, in this order:

    * ``percent_read_before``: ``start_token / len_essay``
    * ``percent_read_now``: ``end_token / len_essay``
    * ``percent_remaining``: ``(len_essay - end_token) / len_essay``
    * ``percent_sentence``: ``(end_token - start_token + 1) / len_essay``

    where ``end_token = start_token + number of whitespace-separated words in
    discourse_text``.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None):
        """Compute the four features without modifying ``X``.

        Working on local Series avoids writing helper columns into the caller's
        frame, which is a side effect and raises SettingWithCopyWarning when
        ``X`` is a slice of a larger frame.
        """
        start_token = X['start_token']
        len_essay = X['len_essay']
        end_token = start_token + X['discourse_text'].str.split().str.len()

        percent_read_before = start_token / len_essay
        percent_read_now = end_token / len_essay
        percent_remaining = (len_essay - end_token) / len_essay
        percent_sentence = (end_token - start_token + 1) / len_essay

        return np.column_stack([
            percent_read_before.to_numpy(),
            percent_read_now.to_numpy(),
            percent_remaining.to_numpy(),
            percent_sentence.to_numpy(),
        ])
    

def preprocess_data(X, pipeline=None):
    """Turn the raw feature frame into the model's feature matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with the columns ``start_token``, ``len_essay`` and
        ``discourse_text``.
    pipeline : transformer, optional
        An already fitted transformer exposing ``transform``. When ``None`` a
        new one is built and fitted on ``X``: a FeatureUnion of the
        ``ExtraFeatures`` position features and 1- to 3-gram counts of
        ``discourse_text`` (at most 100000 features).

    Returns
    -------
    (features, pipeline)
        The transformed ``X`` and the pipeline that produced it, so the same
        fitted pipeline can be reused on other data.
    """
    # Compare with None explicitly: the truthiness of an estimator object is
    # not a reliable "was a pipeline supplied?" test.
    if pipeline is None:
        vectorizer_pipeline = Pipeline([
            ('vectorizer', CountVectorizer(ngram_range=(1, 3), max_features=100000)),
            # Disabled experiments, kept for reference:
            # ('tfidf', TfidfTransformer()),
            # ('pca', TruncatedSVD(n_components=0.9))
        ])

        pipeline = FeatureUnion([
            ('extra_features', ColumnTransformer([('end_token', ExtraFeatures(), ['start_token', 'len_essay', 'discourse_text'])])),
            ('vectorizer', ColumnTransformer([('vectorizer', vectorizer_pipeline, 'discourse_text')]))
        ])

        pipeline.fit(X)

    X = pipeline.transform(X)
    return X, pipeline

def encode_labels(y, encoder=None):
    """Encode class labels as integers.

    Parameters
    ----------
    y : array-like
        Labels to encode.
    encoder : object, optional
        An already fitted encoder exposing ``transform``. When ``None`` a new
        ``LabelEncoder`` is fitted on ``y``.

    Returns
    -------
    (encoded, encoder)
        The encoded labels and the encoder used, for reuse on other data.
    """
    # Explicit None check so a supplied encoder is never silently replaced
    # because it happens to evaluate as falsy.
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(y)

    y = encoder.transform(y)
    return y, encoder

In [None]:
# Show the first row of dev_df to check the columns available for modelling.
dev_df.head(1)

In [None]:
# Select the three input columns and the target for both splits.
X_train, y_train = dev_df[['start_token', 'len_essay', 'discourse_text']], dev_df['discourse_type']
X_val, y_val = val_df[['start_token', 'len_essay', 'discourse_text']], val_df['discourse_type']

# Fit the feature pipeline on the training rows only, then reuse it unchanged
# for the validation rows.
X_train, pipeline = preprocess_data(X_train)
X_val, _ = preprocess_data(X_val, pipeline)

# Same pattern for the labels: fit the encoder on train, reuse it on validation.
y_train, encoder = encode_labels(y_train)
y_val, _ = encode_labels(y_val, encoder)

## Training the core model

Since I like Random Forests, I'll stick with a RF Classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression

In [None]:
%%time

# The model actually trained here is a logistic regression (liblinear solver,
# dual formulation), not a random forest.
model = LogisticRegression(C=1, dual=True, solver='liblinear')
model.fit(X_train, y_train)

# Leftover from a random-forest experiment (out-of-bag score):
# if hasattr(model, 'oob_score_'): print(model.oob_score_)

# Alternative model that was tried:
# model = MultinomialNB()
# model.fit(X_train, y_train)

In [None]:
# Score on the training data itself, so this is a fit check, not a validation score.
model.score(X_train, y_train)

# Prediction

## Designing The Algorithm

The algorithm should take a full text and then output a dataframe of classes and prediction strings (tokens). 

In [None]:
# For every row, record the discourse type and essay id of the row that follows it.
dev_df['next_discourse_type'] = dev_df['discourse_type'].shift(-1)
dev_df['next_id'] = dev_df['id'].shift(-1)

# When the following row belongs to a different essay (or there is none), the
# current row is the last of its essay: mark it with the string 'NaN' (a plain
# string label, not a missing value) so it still forms its own column in the
# transition table.
dev_df.loc[
    dev_df['next_id'] != dev_df['id'],
    'next_discourse_type'
] = 'NaN'

In [None]:
# Transition table: rows are the current discourse type, columns the type that follows it.
discourse_next = pd.pivot_table(data=dev_df, index='discourse_type', columns='next_discourse_type', aggfunc='size')
# Turn the counts into row-wise fractions (each row divided by its own total).
discourse_next = discourse_next.div(discourse_next.sum(axis=1), axis=0)
# Keep only the columns for the encoder's classes, in the encoder's order; unseen transitions become 0.
discourse_next = discourse_next[encoder.classes_].fillna(0)
discourse_next

In [None]:
# Map each discourse type to its row of transition fractions (as a NumPy array).
expectations = {
    discourse_type: row.values
    for discourse_type, row in discourse_next.iterrows()
}

In [None]:
import nltk

def predict_text(model, data, pipeline):
    """Transform ``data`` with ``pipeline`` and return ``model``'s class probabilities for it."""
    features = pipeline.transform(data)
    probabilities = model.predict_proba(features)
    return probabilities

def predict_essay(model, essay_id, path, pipeline, max_iter, print_results=False):
    """Greedily segment one essay into consecutive blocks of sentences and label each block.

    Starting at the first sentence, the candidate block is grown one sentence at
    a time. Each candidate is scored with the model's highest class probability
    (after the first block, the probabilities are re-weighted by
    ``update_preds`` using the previously predicted type). Growing stops after
    ``max_iter`` candidates that score below the running best, or when the essay
    ends; the best-scoring candidate becomes a predicted block and the search
    restarts at the sentence following it.

    Parameters
    ----------
    model : classifier exposing ``predict_proba``
    essay_id : str
        Essay to read; also the key into the essay-length dictionaries.
    path : str
        'train' selects ``train_essays_length``; any other value selects
        ``test_essays_length``. Also passed to ``read_essay_txt``.
    pipeline : fitted transformer passed on to ``predict_text``
    max_iter : int
        Number of non-improving candidates tolerated before a block is closed.
    print_results : bool
        When True, print each chosen block and its label.

    Returns
    -------
    dict
        ``{'id': essay_id, 'discourse_type': [...], 'predictionstring': [...]}``
        with one list entry per predicted block.

    Notes
    -----
    Relies on the module-level names ``encoder``, ``train_essays_length`` and
    ``test_essays_length``.
    NOTE(review): if ``nltk.sent_tokenize`` returns no sentences, ``start_sent``
    is 1 after the first pass and can never equal ``len(essay_sentences) == 0``,
    so the loop does not terminate.
    """
    essay_txt = read_essay_txt(essay_id, path)
    essay_sentences = nltk.sent_tokenize(essay_txt)
    essay_preds = {}
    essay_preds['id'] = essay_id
    essay_preds['discourse_type'] = []
    essay_preds['predictionstring'] = []

    # Token window of the block being built: [start_token, end_token).
    start_token = 0
    end_token = 0
    # Sentence window of the current candidate: essay_sentences[start_sent:end_sent].
    start_sent = 0
    end_sent = 1
    # Count of candidates that scored below the running best, and that running best.
    iter_bad = 0
    max_pred = 0

    # Per-block history of candidates that matched or beat the running best.
    start_end_sents, max_preds, argmax_preds = [], [], [] 
    stop = False

    # Label of the previously emitted block (None until the first block is emitted).
    prev_type = None

    while not stop: 
        # Build a one-row frame in the column order the pipeline was fitted on.
        data = {}
        data['start_token'] = [start_token + 1]
        data['len_essay'] = train_essays_length[essay_id] if path == 'train' else test_essays_length[essay_id]
        data['discourse_text'] = [' '.join(essay_sentences[start_sent:end_sent])]
        data = pd.DataFrame(data)[['start_token', 'len_essay', 'discourse_text']]
        preds = predict_text(model, data, pipeline)

        # Re-weight by how often each type follows the previous block's type.
        if prev_type:
            preds = update_preds(preds, prev_type)

        # ">=" means a tie with the running best is also recorded as a candidate.
        if preds.max() >= max_pred:
            max_pred = preds.max()
            start_end_sents.append([start_sent, end_sent])
            max_preds.append(max_pred)
            # preds holds a single row here, so the flat argmax is presumably the class index.
            argmax_preds.append(preds.argmax())

        else:
            iter_bad += 1

        # Grow the candidate by one sentence for the next pass.
        end_sent += 1

        # Close the block: patience exhausted, or no sentence left to add.
        if iter_bad >= max_iter or end_sent > len(essay_sentences):
            # np.argmax returns the first maximal entry, i.e. the shortest
            # candidate among those sharing the best score.
            best_pred = np.argmax(max_preds)
            best_start_end = start_end_sents[best_pred]
            merged_sentence = ' '.join(essay_sentences[best_start_end[0]: best_start_end[-1]])
            # Advance the token cursor by the block's whitespace-separated word count.
            end_token = len(merged_sentence.split()) + end_token
            prediction_string = ' '.join([str(token) for token in range(start_token, end_token)])

            essay_preds['discourse_type'].append(encoder.inverse_transform([argmax_preds[best_pred]])[0])
            essay_preds['predictionstring'].append(prediction_string)

            if print_results: print('MATCH ------- \n', merged_sentence, '\n\n', encoder.inverse_transform([argmax_preds[best_pred]])[0], '\n\n')

            # The next block starts right after the one just emitted.
            start_token = end_token
            start_sent = best_start_end[-1]

            end_sent = start_sent + 1

            # Reset the per-block search state.
            iter_bad = 0
            max_pred = 0

            prev_type = encoder.inverse_transform([argmax_preds[best_pred]])[0]

            start_end_sents, max_preds, argmax_preds = [], [], []

        # Every sentence has been assigned to a block.
        if start_sent == len(essay_sentences):
            stop = True

    return essay_preds

def update_preds(preds, prev_type):
    """Weight ``preds`` element-wise by the ``expectations`` entry stored for ``prev_type``."""
    return preds * expectations[prev_type]

def predict_df(model, df, path, pipeline, max_iter=5):
    """Run ``predict_essay`` for every distinct essay id in ``df`` and stack the results.

    Parameters
    ----------
    model, path, pipeline, max_iter
        Passed through to ``predict_essay`` unchanged.
    df : pandas.DataFrame
        Frame with an ``id`` column; only the distinct ids are used.

    Returns
    -------
    pandas.DataFrame or None
        One row per predicted block with the columns ``id``,
        ``discourse_type`` and ``predictionstring``. Each essay's rows keep
        their own 0-based index. ``None`` when ``df`` has no ids.
    """
    # Collect the per-essay frames and concatenate once instead of re-copying
    # the growing result on every iteration.
    essay_frames = [
        pd.DataFrame(predict_essay(model, essay_id, path, pipeline, max_iter))
        for essay_id in df['id'].unique()
    ]

    if not essay_frames:
        return None

    return pd.concat(essay_frames, axis=0)

Let's test the algorithm now with one essay.

In [None]:
# List the validation essay ids, e.g. to pick one for a manual check.
val_df.id.unique()

In [None]:
%%time

# Segment a single essay with max_iter=1 and print every matched block.
# NOTE(review): the essay id is hard-coded; presumably it was taken from the
# validation ids of this seed, so confirm it if the split changes.
_ = predict_essay(model, '570D8769BE33', 'train', pipeline, 1, print_results=True)

# Evaluation

According to the competition evaluation page

1. For each sample, all ground truths and predictions for a given class are compared.
2. If the overlap between the ground truth and prediction is >= 0.5, and the overlap between the prediction and the ground truth >= 0.5, the prediction is a match and considered a true positive. If multiple matches exist, the match with the highest pair of overlaps is taken.
3. Any unmatched ground truths are false negatives and any unmatched predictions are false positives.


In [None]:
%%time

# Predict every validation essay (texts are read from the 'train' folder) with max_iter=1.
val_preds_df = predict_df(model, val_df, 'train', pipeline, 1)
val_preds_df.head()

In [None]:
def evaluate_df(df, pred_df):
    """Return the mean of ``evaluate_essay`` over every distinct essay id in ``df``."""
    essay_scores = [
        evaluate_essay(df, pred_df, essay_id)
        for essay_id in df['id'].unique()
    ]
    return np.mean(essay_scores)
        
def evaluate_essay(df, pred_df, essay_id, print_results=False):
    """Return the mean per-class F1 of one essay.

    Predicted rows labelled 'Gap' are ignored. The classes iterated are all
    discourse types present in the full ``df``, not only those of this essay.
    """
    truth_rows = filter_essay(df, essay_id)
    predicted_rows = filter_essay(pred_df, essay_id)

    keep = predicted_rows['discourse_type'] != 'Gap'
    predicted_rows = predicted_rows.loc[keep, :]

    class_scores = [
        evaluate_class(truth_rows, predicted_rows, class_, print_results)
        for class_ in df['discourse_type'].unique()
    ]
    return np.mean(class_scores)
        
def evaluate_class(df, pred_df, class_, print_results):
    class_df = filter_class(df, class_)
    pred_class_df = filter_class(pred_df, class_)
    truths = class_df['predictionstring'].str.split(' ').tolist()
    predictions = pred_class_df['predictionstring'].str.split(' ').tolist()
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    matched_truths_idx = []
    for prediction in predictions:
        for i, truth in enumerate(truths):
            if test_overlap(prediction, truth):
                true_positives += 1 
                matched_truths_idx.append(i)
            else:
                false_positives += 1
        truths = remove_from_list(truths, matched_truths_idx)
        matched_truths_idx = []
        
    false_negatives = len(truths)
    
    f1_score = calculate_f1(true_positives, false_positives, false_negatives)
    
    if print_results: print(class_, f1_score)
        
    return f1_score


def filter_class(df, class_):
    return df.query('discourse_type == @class_')
                
        
def test_overlap(prediction, truth):
    prediction_set = set(prediction)
    truth_set = set(truth)
#     print(overlap_fraction(prediction_set, truth_set), overlap_fraction(truth_set, prediction_set))
    if overlap_fraction(prediction_set, truth_set) >= 0.5 and overlap_fraction(truth_set, prediction_set) >= 0.5:
        return True
    
    
def overlap_fraction(set1, set2):
    return len(set1.intersection(set2)) / len(set1)
    
    
def remove_from_list(list_, idx):
    return [x for i, x in enumerate(list_) if i not in idx]

def calculate_f1(true_positives, false_positives, false_negatives):
    precision = calculate_precision(true_positives, false_positives)
    recall = calculate_recall(true_positives, false_negatives)
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    
def calculate_precision(true_positives, false_positives):
    return true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

def calculate_recall(true_positives, false_negatives):
    return true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

In [None]:
%%time

# Test one essay, printing the F1 of every class before returning their mean
evaluate_essay(val_df, val_preds_df, '570D8769BE33', True)

In [None]:
%%time

# Mean of the per-essay scores over all validation essays.
evaluate_df(val_df, val_preds_df)

# Submission

In [None]:
# Essay ids are the test file names without their extension. os.path.splitext
# is used instead of str.rstrip('.txt'), which strips any trailing run of the
# characters '.', 't' and 'x' rather than the literal suffix.
test_ids = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir('/kaggle/input/feedback-prize-2021/test')
)
test_df = pd.DataFrame({'id': test_ids})
test_df['len_essay'] = test_df['id'].map(test_essays_length)

In [None]:
%%time

# Predict every test essay (texts are read from the 'test' folder) with max_iter=1.
submission = predict_df(model, test_df, 'test', pipeline, 1)
submission.head()

In [None]:
# Positional rename: predict_essay builds its columns in the order
# id, discourse_type, predictionstring, so only discourse_type becomes 'class'.
submission.columns = ['id', 'class', 'predictionstring']

In [None]:
# Drop gaps
# NOTE(review): the cells that add 'Gap' rows to the training data are commented
# out above, so the model presumably never predicts 'Gap' and this filter
# removes nothing — confirm.
submission = submission.loc[submission['class'] != 'Gap', :]

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)