In [None]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn


from transformers import AutoTokenizer
from pprint import pprint
from collections import Counter

In [None]:
class config:
    """Notebook-wide run settings."""

    # When True, the data-loading cell keeps only the first 100 rows of
    # train.csv so the notebook runs quickly.
    sample = False

# label maps

In [None]:
# Segment tags <-> integer ids. get_segment_labels assigns "B" to the first
# token of a segment, "I" to its remaining tokens and "O" to everything else.
segment2label = {"B": 0, "I": 1, "O": 2}
label2segment = {idx: tag for tag, idx in segment2label.items()}

# Discourse element types <-> integer ids. 'O' is the default label given to
# tokens that no annotated element covers (see get_discourse_labels).
discourse2label = {
    name: idx
    for idx, name in enumerate((
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
        'O',
    ))
}
label2discourse = {idx: name for name, idx in discourse2label.items()}

In [None]:
def read_essay(filename, essay_folder='../input/feedback-prize-2021/train'):
    """Return the full text of one essay.

    Parameters
    ----------
    filename : str
        Essay id, i.e. the file name without the ".txt" extension.
    essay_folder : str, optional
        Directory that holds the essay text files. Defaults to the
        competition's training folder.

    Returns
    -------
    str
        The file contents.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    filepath = os.path.join(essay_folder, filename + ".txt")
    # Explicit encoding so decoding does not depend on the platform's locale.
    with open(filepath, encoding="utf-8") as file:
        return file.read()


def get_discourse_labels(row):
    """Build a per-position list of discourse-type labels for one essay row.

    Parameters
    ----------
    row : object with attributes ``predictionstring``, ``content`` and
        ``discourse_type``. ``predictionstring[i]`` holds the token indices
        of the i-th discourse element, either as a space-separated string
        ("3 4 5") or as an already parsed sequence of ints ([3, 4, 5]).

    Returns
    -------
    list
        A list of length ``len(row.content)`` filled with 'O', where every
        index listed in a predictionstring is replaced by the matching
        discourse type.

    Raises
    ------
    IndexError
        If a token index falls outside the label list.
    """
    predictionstrings = row.predictionstring
    content = row.content
    discourse_type = row.discourse_type
    # NOTE(review): when content is a raw string this is the character count,
    # not the word count, so the list is longer than the number of words.
    # Confirm whether len(content.split()) was intended.
    labels = ['O'] * len(content)

    for i, cls_label in enumerate(discourse_type):
        predstring = predictionstrings[i]
        # Accept both the raw "1 2 3" form and the parsed list-of-ints form
        # that the notebook stores in train_df.predictionstring.
        raw_ids = predstring.split() if isinstance(predstring, str) else predstring
        for token_id in raw_ids:
            labels[int(token_id)] = cls_label
    return labels

def get_segment_labels(row):
    """Build a per-position list of B/I/O segment labels for one essay row.

    Parameters
    ----------
    row : object with attributes ``predictionstring`` and ``content``.
        Each entry of ``predictionstring`` holds the token indices of one
        segment, either as a space-separated string ("3 4 5") or as an
        already parsed sequence of ints ([3, 4, 5]).

    Returns
    -------
    list
        A list of length ``len(row.content)`` filled with 'O', where the
        first index of every segment is 'B' and its remaining indices 'I'.

    Raises
    ------
    IndexError
        If a token index falls outside the label list.
    """
    predictionstrings = row.predictionstring
    content = row.content
    # NOTE(review): when content is a raw string this is the character count,
    # not the word count. Confirm whether len(content.split()) was intended.
    labels = ['O'] * len(content)

    for predstring in predictionstrings:
        # Accept both the raw "1 2 3" form and the parsed list-of-ints form
        # that the notebook stores in train_df.predictionstring.
        raw_ids = predstring.split() if isinstance(predstring, str) else predstring
        for position, token_id in enumerate(raw_ids):
            labels[int(token_id)] = 'B' if position == 0 else 'I'
    return labels

In [None]:
def map_discourse2metafeatures(row):
    """Locate each discourse element of an essay row in its paragraph.

    Reads ``discourse_type``, ``predictionstring``, ``paragraph_len`` and
    ``num_paragraphs`` from ``row``. For every discourse element, the first
    entry of its predictionstring is compared against the running total of
    paragraph lengths to find the first paragraph whose cumulative length
    exceeds it.

    Returns a list with one dict per element, with keys ``element_type``,
    ``num_paragraphs``, ``para_id`` and ``segment_length``. Note that
    ``segment_length`` is the length of the containing paragraph, not of the
    element. When no paragraph matches, ``para_id`` is -1 and
    ``segment_length`` is the last paragraph's length (index -1).
    """
    # Assumes each predictionstring entry is an indexable sequence of
    # comparable word indices (e.g. a list of ints) - TODO confirm at caller.
    word_id_lists = row.predictionstring
    para_lengths = row.paragraph_len
    total_paragraphs = row.num_paragraphs

    features = []
    for position, element in enumerate(row.discourse_type):
        first_word = word_id_lists[position][0]

        # Walk the paragraphs, accumulating word counts until the element's
        # first word falls inside the current paragraph.
        containing_para = -1
        words_so_far = 0
        for candidate, length in enumerate(para_lengths):
            words_so_far += length
            if first_word < words_so_far:
                containing_para = candidate
                break

        features.append({
            'element_type': element,
            'num_paragraphs': total_paragraphs,
            'para_id': containing_para,
            'segment_length': para_lengths[containing_para],
        })
    return features

In [None]:
# Load the discourse annotations.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
if config.sample:
    # Quick-run mode: keep only the first 100 annotation rows.
    train_df = train_df.head(100)

# Collapse to one row per essay id; the two kept columns become lists.
train_df = (
    train_df
    .groupby('id')[['discourse_type', 'predictionstring']]
    .agg(list)
    .reset_index()
)

# Attach the raw essay text, read from disk by essay id.
train_df['content'] = train_df['id'].apply(read_essay)

# Parse every "3 4 5"-style predictionstring into a list of ints.
train_df['predictionstring'] = train_df['predictionstring'].apply(
    lambda strings: [[int(token) for token in s.split()] for s in strings]
)

In [None]:
# Split each essay on newlines and drop the empty pieces.
train_df['paragraphs'] = train_df['content'].apply(
    lambda text: [para for para in text.split('\n') if para]
)
# Paragraph count and per-paragraph word counts (whitespace split).
train_df['num_paragraphs'] = train_df['paragraphs'].apply(len)
train_df['paragraph_len'] = train_df['paragraphs'].apply(
    lambda paras: [len(para.split()) for para in paras]
)

# Per-element paragraph position features, one list of dicts per essay.
train_df['discourse_meta_features'] = train_df.apply(map_discourse2metafeatures, axis=1)
train_df.head()

# Let's check the number of paragraphs in each essay

In [None]:
# Work on a copy so train_df itself is left untouched, and record the word
# counts of each essay's first and last paragraph.
para_df = train_df.copy().assign(
    first_para_length=lambda frame: frame['paragraph_len'].apply(lambda lengths: lengths[0]),
    last_para_length=lambda frame: frame['paragraph_len'].apply(lambda lengths: lengths[-1]),
)

para_df.head()

In [None]:
# Summary statistics for the number of paragraphs per essay.
para_df['num_paragraphs'].describe()

In [None]:
# The printed value is a share of all essays (0-100), not a count, so the
# message says "Percentage".
print("Percentage of essays with <10 paragraphs - {:.4f}".format(
    100 * len(para_df[para_df.num_paragraphs < 10]) / len(para_df)
))

# How many essays have each paragraph count.
plt.figure(figsize=(15, 5))
plt.title("Number of Paragraphs in Essay")
sns.countplot(data=para_df, x='num_paragraphs')
plt.show()

In [None]:
# Long-format table: one record per (essay, paragraph) with that paragraph's
# word count. essay_id is the row index of train_df.
df = pd.DataFrame.from_dict([
    {
        'essay_id': essay_idx,
        'num_paragraphs': essay.num_paragraphs,
        'plen': para_len,
    }
    for essay_idx, essay in train_df.iterrows()
    for para_len in essay.paragraph_len
])
df.head()

In [None]:
# Summary statistics for paragraph lengths (in words).
df['plen'].describe()

In [None]:
# Histogram of paragraph lengths, restricted to paragraphs longer than 3 words.
plt.title("Distribution of paragraph lengths")
sns.histplot(data=df.loc[df['plen'] > 3], x='plen', stat='probability')
plt.show()

In [None]:
# Mean paragraph length per essay (num_paragraphs is kept as a group key).
df = df.groupby(['essay_id', 'num_paragraphs'], as_index=False)[['plen']].mean()

_, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: every essay.
sns.scatterplot(data=df, x='num_paragraphs', y='plen', ax=ax[0])
ax[0].set_title("Number of Paragraphs (vs) Avergae Paralengths")

# Right panel: only essays with more than 10 paragraphs.
sns.scatterplot(data=df.loc[df['num_paragraphs'] > 10], x='num_paragraphs', y='plen', ax=ax[1])
ax[1].set_title("Number of Paragraphs (>10) (vs) Avergae Paralengths")

plt.show()

1. Paragraph lengths vary more when an essay has fewer paragraphs.
2. As the number of paragraphs increases (mostly beyond 10), paragraph lengths show very little variance.
3. For essays with >= 20 paragraphs, paragraph lengths are between [15, 30] words, so these "paragraphs" may actually be individual sentences.

In [None]:
# Peek at the first two rows of the paragraph-level frame.
para_df.head(n=2)

In [None]:
# One record per discourse element, tagged with its essay (positional index),
# its paragraph id and that paragraph's length; ecnt=1 so it can be summed.
df = pd.DataFrame.from_dict([
    {
        'essay_id': essay_idx,
        'para_id': meta['para_id'],
        'segment_length': meta['segment_length'],
        'ecnt': 1,
    }
    for essay_idx, essay_meta in enumerate(train_df['discourse_meta_features'].values)
    for meta in essay_meta
])

# Number of discourse elements that start in each paragraph.
df = df.groupby(['essay_id', 'para_id', 'segment_length'], as_index=False)[['ecnt']].sum()
df.head()

In [None]:
_, ax = plt.subplots(1, 4, figsize=(15, 5))

# Panel 0: how many paragraphs hold 1, 2, 3, ... elements.
sns.countplot(data=df, x='ecnt', ax=ax[0])
ax[0].set_title("number of Elements in the paragraph")

# Panel 1: raw element count against segment_length.
sns.scatterplot(data=df, y='ecnt', x='segment_length', ax=ax[1])
ax[1].set_title("#elements (vs) Segment Length")

# Panel 2: element count aggregated per segment_length value.
sns.lineplot(data=df, y='ecnt', x='segment_length', ci=None, ax=ax[2])
ax[2].set_title("avg segment length vs #elements")

# Panel 3: segment_length aggregated per element count.
sns.lineplot(data=df, x='ecnt', y='segment_length', ci=None, ax=ax[3])
ax[3].set_title("#elements (vs) avg segment length")

plt.show()

1. Paragraphs with a small average segment length tend to contain fewer discourse elements.
2. Most of the paragraphs have <= 2 elements.

In [None]:
# Summary statistics for the first and last paragraph lengths.
para_df.loc[:, ['first_para_length', 'last_para_length']].describe()

In [None]:
_, ax = plt.subplots(1, 2, figsize=(15, 5))

# Box plot of the first paragraph's word count.
sns.boxplot(data=para_df, x='first_para_length', ax=ax[0])
ax[0].set_title("First Paragraph Length")

# Box plot of the last paragraph's word count.
sns.boxplot(data=para_df, x='last_para_length', ax=ax[1])
ax[1].set_title("Last Paragraph Length")

plt.show()

It looks like there are many paragraphs with <= 2 words.

In [None]:
# Boolean masks for essays whose first / last paragraph has at most 3 words.
short_first = para_df['first_para_length'] <= 3
short_last = para_df['last_para_length'] <= 3

print("Number of Essays with first paragraph with <=3 words:", len(para_df[short_first]))
print("Number of Essays with last paragraph with <=3 words:", len(para_df[short_last]))
print("Number of Essays with first or last paragraph with <=3 words:",
      len(para_df[short_first | short_last]))

In [None]:
_, ax = plt.subplots(1, 2, figsize=(15, 5))

# Distribution of the first paragraph's word count.
sns.histplot(data=para_df, x='first_para_length', ax=ax[0])
ax[0].set_title("First Paragraph Length")

# Distribution of the last paragraph's word count.
sns.histplot(data=para_df, x='last_para_length', ax=ax[1])
ax[1].set_title("Last Paragraph Length")

plt.show()

The spike in the graph looks abnormal and is worth investigating.

In [None]:
# Correlation matrix of the first and last paragraph lengths.
plt.title("Correlation between the first and last paragraph lengths")
sns.heatmap(
    para_df.loc[:, ['first_para_length', 'last_para_length']].corr(),
    annot=True,
)
plt.show()

Let's check a few of the first paragraphs with <= 3 words.

In [None]:
# Essays whose first (resp. last) paragraph has at most 3 words.
first_para_df = para_df.loc[para_df['first_para_length'] <= 3].copy()
last_para_df = para_df.loc[para_df['last_para_length'] <= 3].copy()

# The text of those short first / last paragraphs.
first_paragraphs = first_para_df['paragraphs'].apply(lambda paras: paras[0]).values
last_paragraphs = last_para_df['paragraphs'].apply(lambda paras: paras[-1]).values

print("Number of Unique first paragraphs:", len(set(first_paragraphs)))
print("Number of Unique last paragraphs:", len(set(last_paragraphs)))



In [None]:
# Frequency table of the short first paragraphs: one row per distinct text,
# in first-seen order, with its number of occurrences.
df1 = pd.DataFrame(
    list(Counter(first_paragraphs).items()),
    columns=["text", "num_occurs"],
)

df1.sort_values('num_occurs', ascending=False).head(10)

In [None]:
# Peek at the first essay row.
train_df.head(n=1)

Let's check whether any of these first paragraphs with <=3 words have discourse elements.

In [None]:
# Flatten the per-essay lists of element features into one list of dicts.
discourse_para_data = [
    meta
    for essay_meta in train_df['discourse_meta_features'].values
    for meta in essay_meta
]
df = pd.DataFrame.from_dict(discourse_para_data)

# Position flags: element starts in the first / last paragraph of its essay.
df['is_first_para'] = df['para_id'].eq(0).astype(int)
df['is_last_para'] = df['para_id'].eq(df['num_paragraphs'] - 1).astype(int)
# Paragraph index counted from the end (-1 is the last paragraph).
df['para_id_from_last'] = df['para_id'].sub(df['num_paragraphs'])
df.head()

In [None]:
_, ax = plt.subplots(1, 2, figsize=(16, 5))

# Left: elements per paragraph id, counted from the start (first 20 ids).
sns.countplot(data=df.loc[df['para_id'] < 20], x='para_id', ax=ax[0])
# Right: elements per paragraph id, counted from the end (last 19 ids).
sns.countplot(data=df.loc[df['para_id_from_last'] > -20], x='para_id_from_last', ax=ax[1])

plt.show()

In [None]:
def plot_paraids_for_elements(etype, para_lmt=10):
    """Plot where elements of one discourse type fall within essays.

    Draws two count plots from the module-level ``df`` frame, restricted to
    rows whose ``element_type`` equals ``etype``:

    * left: ``para_id`` (counted from the start), for ids below ``para_lmt``;
    * right: ``para_id_from_last`` (counted from the end), for ids above
      ``-para_lmt``, shown in the order -1, -2, ...

    Parameters
    ----------
    etype : str
        Discourse element type to plot; also used as the figure title.
    para_lmt : int, optional
        Number of paragraph positions to show on each side. Defaults to 10.
    """
    element_df = df[df.element_type == etype]

    _, ax = plt.subplots(1, 2, figsize=(15, 5))
    plt.suptitle(etype)
    ax[0].set_title("para ids")
    ax[1].set_title("para ids from last")

    sns.countplot(data=element_df[element_df.para_id < para_lmt], x='para_id', ax=ax[0])
    sns.countplot(
        data=element_df[element_df.para_id_from_last > -para_lmt],
        x='para_id_from_last',
        ax=ax[1],
        # Keep the displayed categories in step with the filter above, so no
        # always-empty categories are drawn. The explicit order also makes
        # pre-sorting the data unnecessary.
        order=np.arange(-1, -para_lmt, -1),
    )

    plt.show()

In [None]:
# One pair of position plots per discourse element type.
for etype in ("Lead", 'Position', 'Evidence', 'Claim',
              'Counterclaim', 'Rebuttal', 'Concluding Statement'):
    plot_paraids_for_elements(etype)

From the graphs above:
1. Lead - occurs in the first 2 paragraphs.
2. Concluding Statement - occurs in the last paragraph.
3. Lead and Position come together in the first 2 paragraphs.

In [None]:
# First and last discourse type of each essay, in the order the elements
# appear in the essay's discourse_type list.
train_df['first_discourse_type'] = train_df['discourse_type'].apply(lambda types: types[0])
train_df['last_discourse_type'] = train_df['discourse_type'].apply(lambda types: types[-1])

train_df.head(n=2)

In [None]:
# How often each discourse type opens an essay.
train_df['first_discourse_type'].value_counts()

In [None]:
_, ax = plt.subplots(1, 2, figsize=(17, 5))

# Left: discourse type of the first element of each essay.
sns.countplot(data=train_df, x='first_discourse_type', ax=ax[0])
ax[0].set_title("First Discourse Type in Essay")

# Right: discourse type of the last element of each essay.
sns.countplot(data=train_df, x='last_discourse_type', ax=ax[1])
ax[1].set_title("Last Discourse Type in Essay")

# plt.xticks acts on the current axes only, so calling it twice cannot rotate
# the labels of both panels. Rotate each panel explicitly, after plotting.
for axis in ax:
    axis.tick_params(axis='x', labelrotation=40)

plt.show()

1. Lead and Position are the most common discourse types at the start of an essay.
2. Concluding Statement is the most common discourse type at the end of an essay.

In [None]:
def is_concluding_at_last(discourse_type):
    """Tell whether an essay ends with a 'Concluding Statement'.

    Parameters
    ----------
    discourse_type : list of str
        Discourse types of one essay, in order.

    Returns
    -------
    int
        -1 if the essay has no 'Concluding Statement' at all,
         1 if its last element is a 'Concluding Statement',
         0 if it has one but ends with a different element type.
    """
    if 'Concluding Statement' not in discourse_type:
        return -1
    # Check the final element directly. Inspecting only the first occurrence
    # would report 0 for essays that contain an earlier 'Concluding
    # Statement' and also end with one.
    return int(discourse_type[-1] == 'Concluding Statement')

# Per-essay flag computed by is_concluding_at_last from the essay's list of
# discourse types.
train_df['is_concluding_comes_last'] = train_df['discourse_type'].apply(is_concluding_at_last)
train_df.head()

In [None]:
# Distribution of the concluding-statement flag across essays.
train_df['is_concluding_comes_last'].value_counts()

In [None]:

# Essays flagged -1 are left out of this plot.
plt.title("Concluding Statement coming at End of Essay")
sns.countplot(
    data=train_df.loc[train_df['is_concluding_comes_last'] != -1],
    x='is_concluding_comes_last',
)
plt.show()


To be continued............