In [None]:
%config Completer.use_jedi = False

# !pip install -Uqqq plotnine

<h1><center>Comprehensive EDA for the Feedback Prize - Evaluating Student Writing Competition</center></h1>
                           
                           
<center><img src = "https://storage.googleapis.com/kaggle-competitions/kaggle/31779/logos/header.png" width = "1000" height = "400"/></center>    

This EDA aims to answer the following questions:   
1. What is the Structure for the TRAIN data ?
1. What is the distribution of the labels?
1. What is the test data Structure?
1. What is the submission data Structure?
1. How is the score calculated?

<h3 style='background:orange; color:black'><center>Consider upvoting this notebook if you found it helpful.</center></h3>

In [None]:
import os
import shutil

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from plotnine import *

# Structure of the TRAIN data?

- How many text files do we have?
- How many labeled fragments do we have?
- Example of labeled text (with color) for a single text file

## How many text files do we have?

In [None]:
# List the essay files in the train folder and report how many there are.
files_train = os.listdir('/kaggle/input/feedback-prize-2021/train')
n_train_files = len(files_train)
print(f'We have {n_train_files} files in the train folder')

## How many labeled fragments do we have?

In [None]:
# Load the annotation table and report its number of rows.
df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
print(f'We have {df.shape[0]} rows in the train.csv file')

## Example of labeled text

In [None]:
import spacy

class Labeler():
    """Render a training essay with its discourse annotations highlighted.

    Adapted from https://www.kaggle.com/odins0n/feedback-prize-eda

    The annotation table (``train.csv``) is loaded once at construction and
    indexed by essay id; calling the instance with an essay id displays that
    essay through spaCy's displaCy entity visualiser in "manual" mode.
    """

    def __init__(self, data_dir = '/kaggle/input/feedback-prize-2021'):
        """Load the annotations and set up the colour scheme.

        Parameters
        ----------
        data_dir : str
            Folder that contains ``train.csv`` and the ``train/`` directory
            with one ``<id>.txt`` file per essay.
        """
        self.data_dir = data_dir
        self.df = pd.read_csv(f'{data_dir}/train.csv').set_index('id')
        colors = {'Lead': '#EE11D0',
                  'Position': '#AB4DE1',
                  'Claim': '#1EDE71',
                  'Evidence': '#33FAFA',
                  'Counterclaim': '#4253C1',
                  'Concluding Statement': 'yellow',
                  'Rebuttal': 'red'}
        self.options = {"ents": list(colors.keys()), "colors": colors}

    def __call__(self, idx, truncate = None):
        """Display the essay ``idx`` with every annotated span coloured.

        Parameters
        ----------
        idx : str
            Essay id, i.e. a value of the ``id`` column of ``train.csv``.
        truncate : int, optional
            When given, only the first ``truncate`` characters of the essay
            are shown (followed by ``' [...]'``); spans starting at or after
            that position are dropped and spans crossing it are clipped.

        Raises
        ------
        KeyError
            If ``idx`` is not present in the annotation table.
        """
        # `.loc[[idx]]` (list indexer) always yields a DataFrame. With a
        # scalar indexer an essay that has a single annotation comes back as
        # a Series, which has no `iterrows()`.
        rows = self.df.loc[[idx]]

        ents = []
        for _, row in rows.iterrows():
            start = int(row['discourse_start'])
            end = int(row['discourse_end'])
            if truncate is not None:
                if start >= truncate:
                    # Span lies entirely in the part that is cut off.
                    continue
                end = min(end, truncate)
            ents.append({
                'start': start,
                'end': end,
                'label': row['discourse_type']
            })

        txt_file = f'{self.data_dir}/train/{idx}.txt'
        with open(txt_file, 'r', encoding = 'utf-8') as file:
            text_data = file.read()
        if truncate is not None:
            text_data = text_data[:truncate] + ' [...]'

        doc = {
            "text": text_data,
            "ents": ents,
        }

        # manual=True: `doc` is a plain dict of text + spans, not a spaCy Doc.
        spacy.displacy.render(doc, style="ent", options=self.options, manual=True, jupyter=True)
        
labeler = Labeler()

In [None]:
# Render one essay together with its coloured annotations.
labeler(idx = '0000D23A521A')

# Distribution of the labels

- Quantity
- Length (boxplot that compares length per label) (word and run-text)
- No-label
- Ratio of label/total-text length

In [None]:
# Discourse types ordered from least to most frequent; reused as the axis
# order of every per-label plot below.
label_ordered_list = df['discourse_type'].value_counts().index[::-1].tolist()

label_count_plot = (
    ggplot(df, aes('discourse_type'))
    + geom_bar(fill = 'orange', color = 'black')
    + scale_x_discrete(limits = label_ordered_list)
    + coord_flip()
    + labs(x = 'Discourse Type',
           y = 'Frequency',
           title = 'Frequency of Discourse Type (label)')
)
label_count_plot

In [None]:
# Fragment length in characters (end offset minus start offset) ...
df['char_len'] = df['discourse_end'].sub(df['discourse_start']).astype(int)
# ... and in words (number of indices listed in `predictionstring`).
df['word_len'] = df['predictionstring'].str.split().map(len)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit char_len ~ a * word_len + b; `a` and `b` are reused by the plot annotation.
X, y = df[['word_len']], df['char_len']
reg = LinearRegression().fit(X, y)

score = reg.score(X, y)   # R^2 of the fit on the same data
a = reg.coef_.item()      # slope
b = reg.intercept_        # intercept
(score, a, b)

In [None]:
# Scatter of a 10% sample of the labeled fragments (one point per train.csv
# row), with a linear fit and the regression equation from `a` and `b`.
# The title now names fragments, not text files, because each point is a fragment.
(ggplot(
    df.sample(frac = 0.1, random_state = 42), 
    aes(x = 'word_len', y = 'char_len'))
 + geom_point(alpha = 0.1)
 + geom_smooth(method = 'lm')
 + ylab('Number of characters')
 + xlab('Number of words')
 + ggtitle('Characters vs. words per labeled fragment')
 + annotate('text', x = 600, y = 2500, label = f'$y = {a:.2f}x + {b:.2f}$')
)

In [None]:
# Word-length distribution of the fragments for each discourse type (full range).
word_len_boxplot = (
    ggplot(df, aes(x = 'discourse_type', y = 'word_len'))
    + geom_boxplot(color = 'black', fill = 'orange')
    + scale_x_discrete(limits = label_ordered_list)
    + labs(x = 'Discourse Type', y = 'Length of the comment in words')
    + coord_flip()
)
word_len_boxplot

In [None]:
# Same boxplot, with the word-length axis limited to 0-150.
word_len_boxplot_zoom = (
    ggplot(df, aes(x = 'discourse_type', y = 'word_len'))
    + geom_boxplot(color = 'black', fill = 'orange')
    + scale_x_discrete(limits = label_ordered_list)
    + labs(x = 'Discourse Type', y = 'Length of the comment in words')
    + coord_flip()
    + ylim(0, 150)
)
word_len_boxplot_zoom

In [None]:
# Violin version of the 0-150 word view, drawn on a reproducible 10% sample.
fragment_sample = df.sample(frac = 0.1, random_state = 42)

word_len_violin = (
    ggplot(fragment_sample, aes(x = 'discourse_type', y = 'word_len'))
    + geom_violin(color = 'black', fill = 'orange')
    + scale_x_discrete(limits = label_ordered_list)
    + labs(x = 'Discourse Type', y = 'Length of the comment in words')
    + coord_flip()
    + ylim(0, 150)
)
word_len_violin

### How much unlabeled text do we have in the train set?

In [None]:
# For every essay: labeled characters (sum of `char_len` over its fragments)
# divided by the total number of characters in the essay file.
# Plain records are collected and turned into a DataFrame once, instead of
# building and concatenating a one-row DataFrame per essay.
ratio_records = []
for txt_id, txt in tqdm(df.groupby('id')):
    # The groupby key is the essay id, so it is used directly.
    txt_file = f"/kaggle/input/feedback-prize-2021/train/{txt_id}.txt"

    with open(txt_file, 'r') as file:
        txt_data = file.read()

    ratio_records.append({
        'id': txt_id,
        'ratio': txt['char_len'].sum() / len(txt_data),
    })

# Explicit columns keep the frame well-formed even if there are no essays.
label_ratio = pd.DataFrame(ratio_records, columns = ['id', 'ratio'])

In [None]:
# Summary statistics of the labeled-text ratio, shown as a single row.
label_ratio[['ratio']].describe().transpose()

In [None]:
# Histogram of the labeled-text ratio; the count axis is on a log10 scale.
label_ratio_hist = (
    ggplot(label_ratio, aes('ratio'))
    + geom_histogram(bins = 100, color = 'black', fill = 'orange')
    + scale_y_log10()
)
label_ratio_hist

### Let's investigate those strange outliers

In [None]:
# The five essays with the smallest fraction of labeled text.
label_ratio.sort_values(by = 'ratio').head(5)

In [None]:
# Author's reading: either the file itself is broken, or the essay was padded
# to inflate its word count. Only the first 1200 characters are rendered.
labeler('C278EDC82048', truncate = 1200)

In [None]:
# Author's reading: this essay does not look like a serious attempt.
labeler(idx = '129497C3E0FC')

In [None]:
# Author's guess: the student may have gone off topic here.
labeler(idx = 'F5EE08CB44B9')

In [None]:
# This text has a lot of white space.
labeler(idx = '9B23715DFB32')

In [None]:
# This one appears to have labeling issues.
labeler(idx = 'F45B396E0A01')

My advice would be to remove those texts from the training dataset

In [None]:
# Share of essays whose labeled-text ratio exceeds 50%.
label_ratio['ratio'].gt(0.50).mean()

By thresholding the ratio of labeled data to 50% we still have 99.4% of the data.

In [None]:
# Share of essays whose labeled-text ratio exceeds 80%.
label_ratio['ratio'].gt(0.80).mean()

And by thresholding it on 80% we still have 94% of the data.

In [None]:
# Average labeled-text ratio over all essays.
mean_label_ratio = label_ratio['ratio'].mean()
mean_label_ratio

Also, considering the distribution, if you ignore the no-label data (i.e. assign a label to every single word) you should probably still score quite high as the average label fraction is 95%

# Structure of the TEST data

### How many text files do we have?

In [None]:
# List the essay files in the public test folder. The message now names the
# test folder, matching the folder actually counted.
files_test = os.listdir('/kaggle/input/feedback-prize-2021/test')
print(f'We have {len(files_test)} files in the test folder')

This is common in kernel competitions, where the true test set is hidden. According to the competition description there are about 10k test files.
> Note that this is a code competition, in which you will submit code that will be run against an unseen test set. The unseen test set is approximately 10k documents. A small public test sample has been provided for testing your notebooks.

# Structure of the SUBMISSION file


In [None]:
# Read the sample submission through the same absolute input path as every
# other read in this notebook, so the cell does not depend on the working directory.
submission = pd.read_csv('/kaggle/input/feedback-prize-2021/sample_submission.csv')
submission

So the goal is to predict (for each text file) pairs of `class` and `predictionstring`. Note that for a single file there will be multiple predictions.

# Building a *very* naive baseline

Just to test the submission format, we can predict the most common label positions that occur in our dataset.

First we filter the outliers by considering only files with label_ratio > 0.8

In [None]:
# Keep only the essays in which more than 80% of the characters are labeled.
well_labeled = label_ratio['ratio'] > 0.80
files_to_keep = label_ratio.loc[well_labeled, 'id']

# Select the rows of the kept essays; the result is indexed by essay id.
df_clean = (
    df.set_index('id')
      .loc[files_to_keep]
      .reset_index()
      .copy()
      .set_index('id')
)

Next we normalize string start and end by the text length

In [None]:
# Expects `df_clean` indexed by essay id (as built in the previous cell).
# Count the whitespace-separated words of every essay once ...
txt_len_by_id = {}
for idx in tqdm(df_clean.index.unique()):
    txt_file_path = f'/kaggle/input/feedback-prize-2021/train/{idx}.txt'
    with open(txt_file_path, 'r') as file:
        txt_len_by_id[idx] = len(file.read().split())

# ... and broadcast the count to all fragments of that essay in one step,
# instead of one `.loc` write per essay on a non-unique index.
df_clean['txt_len'] = df_clean.index.map(txt_len_by_id).to_numpy(dtype = float)

# First and last word index of each fragment, as a fraction of the essay
# length (vectorised replacement for the row-wise `apply`).
word_ids = df_clean['predictionstring'].str.split()
essay_len = df_clean['txt_len'].to_numpy()
df_clean['start'] = word_ids.str[0].astype(int) / essay_len
df_clean['end'] = word_ids.str[-1].astype(int) / essay_len

# Turn the essay id back into a regular column for the plots below.
df_clean = df_clean.reset_index()

We can visualize the distribution of those labels to assure that they make sense.

In [None]:
# Relative start and end positions of the fragments for each discourse type.
# Both variables share the value axis, so the label names both (typo fixed too).
(ggplot(
    pd.melt(df_clean, ['id', 'discourse_type'], ['start', 'end']), 
    aes(y = 'value', x = 'discourse_type', fill = 'variable'))
 + geom_boxplot(color = 'black')
 + scale_x_discrete(limits = label_ordered_list)
 + ylab('Relative position of the start/end word')
 + xlab('Discourse Type')
 + coord_flip()
)

In [None]:
# Median relative start/end per discourse type, laid out as a single wide row.
positions_long = pd.melt(df_clean, ['id', 'discourse_type'], ['start', 'end'])
median_positions = positions_long.groupby(['discourse_type', 'variable'])['value'].median()
median_positions.to_frame().T

Now we guess the ideal splits by considering the statistics of our data

In [None]:
# Hand-picked [start, end] fractions of the essay (in words) assigned to each
# discourse type by the naive baseline.
(lead,
 position,
 claim,
 evidence,
 counterclaim,
 rebuttal,
 concluding_statement) = (
    [0, 0.05],
    [0.05, 0.2],
    [0.2, 0.4],
    [0.4, 0.6],
    [0.60, 0.62],
    [0.62, 0.68],
    [0.8, 1.0],
)

And we build the submission file

In [None]:
# (class name, [start fraction, end fraction]) pairs, in submission order.
segments = [
    ('Lead', lead),
    ('Position', position),
    ('Claim', claim),
    ('Evidence', evidence),
    ('Counterclaim', counterclaim),
    ('Rebuttal', rebuttal),
    ('Concluding Statement', concluding_statement),
]


def _segment_predictionstring(n_words, bounds):
    """Return the space-separated word indices of one segment of an essay.

    `bounds` is a [start, end] pair of fractions of the essay length.
    The training normalisation in this notebook maps a word index to a
    fraction with `index / txt_len`, so the inverse is `fraction * txt_len`
    with no extra offset. Adding 1 would shift every segment by one word and
    let the last segment emit index `n_words`, one past the final word.
    """
    word_ids = np.arange(n_words * bounds[0], n_words * bounds[1], dtype = int)
    return ' '.join(word_ids.astype(str))


# One record per (essay, class); the DataFrame is built once at the end.
sub = []
for txt_file in files_test:
    txt_file_path = f'/kaggle/input/feedback-prize-2021/test/{txt_file}'
    with open(txt_file_path, 'r') as file:
        text_data = file.read()
    txt_len = len(text_data.split())
    # The essay id is the file name without its extension.
    essay_id = os.path.splitext(txt_file)[0]
    for class_name, bounds in segments:
        sub.append({
            'id': essay_id,
            'class': class_name,
            'predictionstring': _segment_predictionstring(txt_len, bounds),
        })

# Explicit columns keep the frame well-formed even when there are no test files.
submission = pd.DataFrame(sub, columns = ['id', 'class', 'predictionstring'])

In [None]:
# Write the baseline predictions to disk (without the row index) and display them.
submission.to_csv('submission.csv', index=False)
submission

<h3 style='background:orange; color:black'><center>WORK IN PROGRESS!! Come back for more later...</center></h3>

TODO: Try to use decision trees to model this naive relationship

# Evaluation Score

- Theory on what is the score
- Examples on how to calculate the evaluation metric