Inspired by:

->https://www.kaggle.com/code/erikbruin/nlp-on-student-writing-eda

->https://www.kaggle.com/code/cdeotte/2nd-place-solution-cv741-public727-private740

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('dark_background')
from matplotlib.ticker import FuncFormatter
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import os

In [None]:
# Competition tables: the labelled discourse elements and the submission template.
train, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/feedback-prize-effectiveness/train.csv',
        '../input/feedback-prize-effectiveness/sample_submission.csv',
    )
)

# glob expands Unix-shell style wildcards into the list of matching paths;
# here that is one .txt file per essay in the train and test folders.
train_txt, test_txt = (
    glob(pattern)
    for pattern in (
        '../input/feedback-prize-effectiveness/train/*.txt',
        '../input/feedback-prize-effectiveness/test/*.txt',
    )
)

In [None]:
# Display the freshly loaded training table as the cell output.
train

In [None]:
# Show every row of `train` whose essay_id equals this one example essay.
train.query('essay_id == "87A6EF3113C6"')

In [None]:
# Length of each discourse text in characters (len of the string, not a word count).
train["discourse_len"] = train["discourse_text"].map(len)

In [None]:
# Print the raw text file of one example essay via the shell.
!cat ../input/feedback-prize-effectiveness/train/00066EA9880D.txt

In [None]:
# Display `train` again as the cell output.
train

In [None]:
# One row of `train` is one discourse element, so the row count is the discourse count.
print(f"The total number of discourses is {len(train)}")

In [None]:
# Two stacked horizontal bar charts summarising the discourse types.
# Axes are created up front and passed explicitly to pandas instead of relying
# on matplotlib's "current axes" state.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# discourse_len is len(discourse_text), i.e. a character count, so the labels
# say "characters" (they previously claimed "words").
train.groupby('discourse_type')['discourse_len'].mean().sort_values().plot(kind="barh", ax=ax1)
ax1.set_title("Average number of characters versus Discourse Type", fontsize=14, fontweight = 'bold')
ax1.set_xlabel("Average number of characters", fontsize = 10)
ax1.set_ylabel("")

train.groupby('discourse_type')['discourse_type'].count().sort_values().plot(kind="barh", ax=ax2)
ax2.get_xaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ','))) #add thousands separator
ax2.set_title("Frequency of Discourse Type in all essays", fontsize=14, fontweight = 'bold')
ax2.set_xlabel("Frequency", fontsize = 10)
ax2.set_ylabel("")

plt.tight_layout(pad=2)
plt.show()

In [None]:
# How often does each discourse type open or close an essay?
# Counts are taken from the first / last row per essay_id and divided by the
# number of distinct essays, rounded to two decimals.
n_essays = train.essay_id.nunique()

train_first = (
    train.drop_duplicates(subset="essay_id", keep="first")
    .discourse_type.value_counts()
    .rename_axis('discourse_type')
    .reset_index(name='counts_first')
)
train_first['percent_first'] = (train_first['counts_first'] / n_essays).round(2)

train_last = (
    train.drop_duplicates(subset="essay_id", keep="last")
    .discourse_type.value_counts()
    .rename_axis('discourse_type')
    .reset_index(name='counts_last')
)
train_last['percent_last'] = (train_last['counts_last'] / n_essays).round(2)

# Left join keeps every type that ever appears first, with its "last" stats alongside.
train_first_last = train_first.merge(train_last, on="discourse_type", how="left")
train_first_last

In [None]:
# Text of the first discourse row belonging to the example essay.
train.loc[train['essay_id'] == '00066EA9880D', 'discourse_text'].iloc[0]

In [None]:
# Number of discourse rows belonging to the example essay.
len(train.loc[train['essay_id'] == '00066EA9880D'])

In [None]:
# All discourse texts of the example essay, in row order, as a plain list.
train.loc[train['essay_id'] == '00066EA9880D', 'discourse_text'].to_list()

In [None]:
# Locate every discourse element inside its essay file.
#
# `ind` ends up with one character offset per row of `train`, stored at that
# row's position, so it stays aligned with `train` even if the rows of one
# essay are not contiguous. -1 (str.find's "not found" value) is the default.
#
# Only the first 10 characters of the discourse text are searched for, so a
# repeated opening phrase resolves to its first occurrence in the essay.
ind = [-1] * len(train)
discourse_texts = train['discourse_text'].tolist()
mem = ""
for essay_id, row_positions in train.groupby('essay_id', sort=False).indices.items():
    # `with` closes the file handle; the previous version leaked one per essay.
    with open('../input/feedback-prize-effectiveness/train/{}.txt'.format(essay_id), 'r') as essay_file:
        mem = essay_file.read()
    for pos in row_positions:
        ind[pos] = mem.find(discourse_texts[pos][:10])

In [None]:
# Attach the located offsets; `ind` must hold one entry per row of `train`, in row order.
train['discourse_start'] = ind

In [None]:
# End offset = start offset plus the character length of the discourse text.
train['discourse_end'] = train['discourse_start'].add(train['discourse_len'])

In [None]:
# Missing-value count per column.
train.isna().sum()

In [None]:
# Per-essay size lookups keyed by essay id (the file name without ".txt"):
#   len_dict  -> number of characters after stripping outer whitespace
#   word_dict -> number of whitespace-separated tokens
len_dict, word_dict = {}, {}
for essay_path in tqdm(train_txt):
    essay_key = essay_path.split("/")[-1].replace(".txt", "")
    with open(essay_path, "r") as handle:
        content = handle.read()
    len_dict[essay_key] = len(content.strip())
    word_dict[essay_key] = len(content.split())

# Broadcast the essay-level numbers onto every discourse row of that essay.
for column, lookup in (("essay_len", len_dict), ("essay_words", word_dict)):
    train[column] = train["essay_id"].map(lookup)

In [None]:
# Display `train` as the cell output.
train

In [None]:
# gap_length: amount of unlabelled text in front of each discourse element.
#
# Vectorised form of the former row-by-row loop. Each row is compared with the
# row directly above it:
#   * same essay and start - previous end > 1  -> gap = start - previous end - 2
#   * first row of an essay and start != 0     -> gap = start - 1
#   * otherwise                                -> NaN (no gap)
# The very first row has no predecessor, so it is handled by the "first row of
# an essay" rule like every other essay instead of a hard-coded value of 7.
# The -2 / -1 offsets are kept from the original code, whose comment says the
# previous end is always -1 and the previous start always +1.
# NOTE(review): confirm these offsets suit the find()-derived discourse_start.
prev_essay = train["essay_id"].shift()
prev_end = train["discourse_end"].shift()
same_essay = train["essay_id"].eq(prev_essay)
distance = train["discourse_start"] - prev_end

train["gap_length"] = np.select(
    [same_essay & (distance > 1), ~same_essay & train["discourse_start"].ne(0)],
    [distance - 2, train["discourse_start"] - 1],
    default=np.nan,
)

#is there any text after the last discourse of an essay?
# .copy() so the new column is written to an independent frame, not a view of train.
last_ones = train.drop_duplicates(subset="essay_id", keep='last').copy()
last_ones['gap_end_length'] = np.where(
    last_ones.discourse_end < last_ones.essay_len,
    last_ones.essay_len - last_ones.discourse_end,
    np.nan,
)

cols_to_merge = ['essay_id', 'discourse_id', 'gap_end_length']
# Drop a gap_end_length left over from a previous run of this cell; otherwise the
# merge would produce gap_end_length_x / gap_end_length_y.
train = (
    train.drop(columns="gap_end_length", errors="ignore")
    .merge(last_ones[cols_to_merge], on=["essay_id", "discourse_id"], how="left")
)

In [None]:
def add_gap_rows(essay, df=None):
    """Return one essay's discourse spans plus "Nothing" rows for unlabelled gaps.

    Parameters
    ----------
    essay : str
        The essay_id to select.
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``train``. It must
        contain ``essay_id`` and the columns discourse_start, discourse_end,
        discourse_type, gap_length and gap_end_length.

    Returns
    -------
    pandas.DataFrame
        The essay's rows (the five columns above) sorted by discourse_start,
        with one extra row of type "Nothing" for each gap flagged by a positive
        gap_length, and one more appended at the end when the last row has a
        positive gap_end_length. An unknown essay yields an empty frame.
    """
    source = train if df is None else df
    cols_to_keep = ['discourse_start', 'discourse_end', 'discourse_type', 'gap_length', 'gap_end_length']
    df_essay = source.loc[source['essay_id'] == essay, cols_to_keep].reset_index(drop = True)

    # Nothing to annotate; the positional lookups below would raise KeyError.
    if df_essay.empty:
        return df_essay

    #index new row
    insert_row = len(df_essay)

    # Start at row 0 so that a gap in front of the first discourse is emitted
    # too (the loop used to start at 1, which made the i == 0 case unreachable).
    for i in range(len(df_essay)):
        # NaN compares False, so rows without a recorded gap are skipped.
        if not df_essay.loc[i, "gap_length"] > 0:
            continue
        # The first row has no predecessor: its gap starts at the essay start.
        start = 0 if i == 0 else df_essay.loc[i-1, "discourse_end"] + 1
        end = df_essay.loc[i, 'discourse_start'] - 1
        # Ignore a flagged gap that does not describe a positive-width span.
        if end <= start:
            continue
        df_essay.loc[insert_row] = [start, end, "Nothing", np.nan, np.nan]
        insert_row += 1

    df_essay = df_essay.sort_values(by = "discourse_start").reset_index(drop=True)

    #add gap at end
    last = len(df_essay) - 1
    if df_essay.loc[last, 'gap_end_length'] > 0:
        start = df_essay.loc[last, "discourse_end"] + 1
        end = start + df_essay.loc[last, 'gap_end_length']
        # After the reset_index the next free label is simply len(df_essay).
        df_essay.loc[len(df_essay)] = [start, end, "Nothing", np.nan, np.nan]

    return df_essay

In [None]:
# Show the discourse spans plus inserted "Nothing" gap rows for the example essay.
add_gap_rows("00066EA9880D")

In [None]:
def print_colored_essay(essay):
    """Render one training essay inline with its discourse spans highlighted.

    Parameters
    ----------
    essay : str
        The essay_id; the text is read from the train folder as ``<essay>.txt``.

    The spans (including "Nothing" gap rows) come from ``add_gap_rows`` and are
    handed to spaCy's displaCy entity visualiser in manual mode. Nothing is
    returned; the visualisation is rendered as notebook output.
    """
    df_essay = add_gap_rows(essay)
    #code from https://www.kaggle.com/odins0n/feedback-prize-eda, but adjusted to df_essay
    essay_file = "../input/feedback-prize-effectiveness/train/" + essay + ".txt"

    # displaCy's manual format: one dict per span with integer character offsets.
    ents = [
        {
            'start': int(row['discourse_start']),
            'end': int(row['discourse_end']),
            'label': row['discourse_type'],
        }
        for _, row in df_essay.iterrows()
    ]

    with open(essay_file, 'r') as file:
        data = file.read()

    doc2 = {
        "text": data,
        "ents": ents,
    }
    # Types without an entry here (e.g. "Nothing") get no explicit colour.
    colors = {
            'Lead': '#8000ff',
            'Position': '#2b7ff6',
            'Evidence': '#2adddd',
            'Claim': '#80ffb4',
            'Concluding Statement': '#d4dd80',  # leading '#' was missing, making the colour invalid
            'Counterclaim': '#ff8042',
            'Rebuttal': '#ff0000',
            'Other': '#007f00',
         }
    options = {"ents": df_essay.discourse_type.unique().tolist(), "colors": colors}
    spacy.displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True);

In [None]:
# Discourse text of the example essay's row with index label 14340.
# NOTE(review): 14340 is a hard-coded label; this raises KeyError if that row
# is not part of this essay.
train.loc[train['essay_id'] == '00066EA9880D', 'discourse_text'].loc[14340]

In [None]:
# Render the example essay with its discourse spans highlighted.
print_colored_essay("00066EA9880D")