In [None]:
import pandas as pd

# Directory that holds the input CSV files, relative to this notebook.
input_dir = "../data"

# Read the training table and preview its first rows as the cell output.
train_csv_path = f"{input_dir}/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Essay ids removed from the training data, grouped by the reason noted for each.
exclude_list = [
    # infinite execution
    '25F9B9BAA02A',
    '49586CD6A649',
    '6F896BABB13C',
    'BECA14914CFB',
    'E7A3DBC919C1',
    'FC9BC150809F',
    'FFC43F453EF6',
    'EFCA46E0BF9F',

    # spelling errors
    'C30B2AD4AF0A',
    '718800CC3C50',
    '9C2E6F09CC73',

    # nan in argumentation rankings
    '8D4A0D4CD2C2',
    '129497C3E0FC',
]

# Drop every row that belongs to an excluded essay.
is_excluded = train_df["essay_id"].isin(exclude_list)
train_df = train_df.loc[~is_excluded].copy().reset_index(drop=True)

# Count of non-null discourse ids per essay, repeated on each row of that essay.
elements_per_essay = train_df.groupby("essay_id")["discourse_id"].transform("count")
train_df["discourse_elements_number"] = elements_per_essay

# Keep only the essays that have fewer than 15 discourse elements.
has_few_elements = train_df["discourse_elements_number"] < 15
train_df = train_df.loc[has_few_elements].reset_index(drop=True).copy()

In [None]:
def clean_text(text):
    """Return ``text`` lower-cased, with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
import string
import spacy
from spacy.lang.en import English

import nltk

# Load the `en_core_web_trf` pipeline with the listed components disabled.
# NOTE(review): `nlp` is not referenced by the tokenization helpers in this
# part of the notebook - confirm it is still needed before paying the load cost.
disabled_components = ['ner', 'parser', 'lemmatizer', 'textcat']
nlp = spacy.load("en_core_web_trf", disable=disabled_components)

# Blank English pipeline; spacy_tokenizer calls it to split a phrase into tokens.
parser = English()

def punctuation_removal(tokens):
    """Drop the tokens that consist solely of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``, so multi-character runs such as ``"..."`` or
    ``"!!"`` are removed as well as single marks. Empty tokens are removed
    too. Tokens that merely contain punctuation (``"don't"``) are kept.

    Parameters
    ----------
    tokens : iterable of str
        Token strings to filter.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Compare character by character: `token in string.punctuation` would be a
    # substring test against the punctuation string and would miss "..." etc.
    punctuation_chars = set(string.punctuation)
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

def stopwords_removal(tokens):
    """Return the tokens that are not in spaCy's English ``STOP_WORDS`` set."""
    stop_word_set = spacy.lang.en.stop_words.STOP_WORDS
    kept = []
    for token in tokens:
        if token in stop_word_set:
            continue
        kept.append(token)
    return kept

def stemming(tokens):
    """Return the NLTK Porter stem of every token, in input order."""
    stem = nltk.PorterStemmer().stem
    return list(map(stem, tokens))
    
def lemmatization(tokens):
    """Return the NLTK WordNet lemma of every token, in input order."""
    lemmatizer = nltk.WordNetLemmatizer()
    result = []
    for word in tokens:
        result.append(lemmatizer.lemmatize(word))
    return result
    
def spacy_tokenizer(phrase, steps=[punctuation_removal]):
    """Tokenize ``phrase`` and run the token list through each post-processing step.

    The phrase is stripped and lower-cased, split into tokens by the
    module-level ``parser``, and the token texts are then passed through every
    callable in ``steps`` in order. Each step takes a list of tokens and
    returns a list of tokens. The default applies only ``punctuation_removal``.
    """
    normalized = phrase.strip().lower()
    words = [tok.text for tok in parser(normalized)]

    for apply_step in steps:
        words = apply_step(words)

    return words

In [None]:
import json

# Build a {discourse_id: rank} mapping from rankings.txt; both sides stay strings.
# Each non-empty line is expected to look like "[[id,rank],[id,rank],...]".
rankings = {}
with open('rankings.txt', 'r') as f:
    for line in f:
        # Strip surrounding whitespace first so the slicing below does not rely
        # on a trailing newline. A fixed "[1:-2]" slice would otherwise cut
        # the last character off the final rank when the file does not end
        # with a newline, and would fail on blank lines.
        line = line.strip()
        if not line:
            continue

        # Drop the outer "[[" and "]]", then split into "id,rank" pairs.
        for pair in line[2:-2].split('],['):
            fields = pair.split(',')
            rankings[fields[0]] = fields[1]

In [None]:
# Look up the rank of every discourse element (a KeyError is raised for an id
# that is missing from `rankings`) and store it as a float column.
rank_strings = train_df['discourse_id'].map(rankings.__getitem__)
train_df['bwaf_rank'] = rank_strings.astype('float64')

In [None]:
# useful modules
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

In [None]:
# One-hot encode the discourse type and append the indicator columns to train_df.
discourse_type_column = train_df.discourse_type.values.reshape(-1, 1)

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(discourse_type_column)
print(enc.categories_)

# The new frame gets pandas' default integer column labels (0, 1, 2, ...),
# which is how the indicator columns are addressed afterwards.
one_hot_df = pd.DataFrame(enc.transform(discourse_type_column).toarray())
train_df = pd.concat([train_df, one_hot_df], axis=1)

In [None]:
import numpy as np

def cross_validation(pipe, X, y, n_splits=10, random_state=None):
    """Run one shuffled, stratified k-fold cross-validation and return the log losses.

    Parameters
    ----------
    pipe : estimator
        Object exposing ``fit`` and ``predict_proba``; it is refit on every fold.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : array-like
        Class labels aligned with the rows of ``X``.
    n_splits : int, default 10
        Number of folds.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``StratifiedKFold``. Pass the same value in separate
        calls to obtain identical folds (needed when per-fold results of two
        models are compared pairwise). ``None`` keeps the previous behaviour.

    Returns
    -------
    list of float
        The validation log loss of each fold, in fold order.
    """
    # The fold indices are positional; converting to an array keeps
    # `y[index]` positional even if a pandas Series is passed in.
    y = np.asarray(y)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    log_losses = []
    for train_index, val_index in kf.split(X, y):
        pipe.fit(X.iloc[train_index], y[train_index])
        predicted = pipe.predict_proba(X.iloc[val_index])

        # Hand the fitted class labels to log_loss so the probability columns
        # are matched to classes even if a validation fold lacks one of them.
        labels = getattr(pipe, "classes_", None)
        log_losses.append(metrics.log_loss(y[val_index], predicted, labels=labels))
    return log_losses

In [None]:
# Experiment with rank features: TF-IDF of the discourse text combined with
# the argumentation rank and the one-hot discourse-type columns.
vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)
classifier = AdaBoostClassifier()

# Column selectors feeding the two branches of the feature union.
bwaf_rank_transformer = FunctionTransformer(lambda x: x[['bwaf_rank', 0, 1, 2, 3, 4, 5, 6]])
discourse_transformer = FunctionTransformer(lambda x: x.discourse_text.map(clean_text))

pipe = Pipeline([
    ('features', FeatureUnion([
        ('numeric_feature', bwaf_rank_transformer),
        ('text_features', Pipeline([
            ('selector', discourse_transformer),
            ('vectorizer', vectorizer)]))])),
    ('classifier', classifier)])

X = train_df.loc[:, ['discourse_text', 'bwaf_rank', 0, 1, 2, 3, 4, 5, 6]]

# Encode the effectiveness labels as integers.
l_enc = LabelEncoder()
l_enc.fit(train_df.discourse_effectiveness.values)
y = l_enc.transform(train_df.discourse_effectiveness.values)

# NOTE(review): `log_loss` is not read anywhere in this part of the notebook;
# kept only in case a later cell relies on it.
log_loss = 0
log_losses_with_rank = []
for i in range(0, 10):
    # Seed NumPy's global RNG per run: StratifiedKFold(shuffle=True) without a
    # random_state draws from it, so run i gets reproducible folds. Using the
    # same seeds in every experiment gives each one the same folds, which a
    # pairwise comparison of per-fold losses requires.
    np.random.seed(i)
    current_log_losses = cross_validation(pipe, X, y)

    print(f'Log losses at run {i}: {current_log_losses}')
    log_losses_with_rank.extend(current_log_losses)

In [None]:
from sklearn.base import TransformerMixin

# Experiment without rank features: TF-IDF of the discourse text only.
vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)
classifier = AdaBoostClassifier()
discourse_transformer = FunctionTransformer(lambda x: x.discourse_text.map(clean_text))

pipe = Pipeline([
    ('selector', discourse_transformer),
    ('vectorizer', vectorizer),
    ('classifier', classifier)])

X = train_df.loc[:, ['discourse_text']]

# Encode the effectiveness labels as integers.
l_enc = LabelEncoder()
l_enc.fit(train_df.discourse_effectiveness)
y = l_enc.transform(train_df.discourse_effectiveness)

# NOTE(review): `log_loss` is not read anywhere in this part of the notebook;
# kept only in case a later cell relies on it.
log_loss = 0
log_losses_without_rank = []
for i in range(0, 10):
    # Seed NumPy's global RNG per run: StratifiedKFold(shuffle=True) without a
    # random_state draws from it, so run i gets reproducible folds. Using the
    # same seeds in every experiment gives each one the same folds, which a
    # pairwise comparison of per-fold losses requires.
    np.random.seed(i)
    current_log_losses = cross_validation(pipe, X, y)

    print(f'Log losses at run {i}: {current_log_losses}')
    log_losses_without_rank.extend(current_log_losses)

In [None]:
# t statistic for the paired per-fold log losses, using a variance scaled by
# (1/k + n2/n1); this looks like the corrected resampled t-test of
# Nadeau & Bengio - confirm against the intended reference.

# Per-fold difference between the two experiments (with rank minus without rank).
diff = [
    with_rank - without_rank
    for with_rank, without_rank in zip(log_losses_with_rank, log_losses_without_rank)
]

# Mean of the differences.
d_bar = np.mean(diff)

# Sample variance of the differences. ddof=1 gives the unbiased estimate
# (divide by k - 1); np.var defaults to the population variance (ddof=0).
sigma2 = np.var(diff, ddof=1)

# In 10-fold cross-validation the test sample is one tenth of the data, so the
# train and test fractions are 0.9 and 0.1.
n1 = 0.9
n2 = 0.1

# Number of paired results: 10-fold cross-validation repeated 10 times gives 100.
k = len(diff)

# Compute the modified variance.
sigma2_mod = sigma2 * (1/k + n2/n1)
# Compute the t statistic.
t_static = d_bar / np.sqrt(sigma2_mod)