In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Print one raw training essay file to see what the input text looks like.
! cat ../input/feedback-prize-2021/train/0000D23A521A.txt

In [None]:
# Load the discourse annotations; each row is one annotated discourse element.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Length of each element = number of whitespace-separated tokens in its predictionstring.
train_df['num_words'] = [
    len(token_string.split()) for token_string in train_df['predictionstring']
]

train_df.head()

In [None]:
# Headline counts for the annotation table.
print("Number of essays:", train_df['id'].nunique())
print("Number of discourse examples:", train_df.shape[0])
print("Number of Discourse Types:", train_df['discourse_type'].nunique())
print("Number Of Discourse Type Numbers:", train_df['discourse_type_num'].nunique())

**Average length of the discourse segments.**

In [None]:
# How many annotated elements there are of each discourse type.
train_df['discourse_type'].value_counts()

In [None]:
# Quick look at the first two rows, including the derived num_words column.
train_df.head(n=2)

In [None]:
# Mean number of words per discourse type (column name 'avg_word_length' kept as-is).
avg_len_df = (
    train_df.groupby('discourse_type')[['num_words']]
    .mean()
    .reset_index()
    .rename(columns={'num_words': 'avg_word_length'})
)

# Most frequent type first. The same order is passed to all three panels so
# the rows line up under sharey=True.
type_order = train_df.discourse_type.value_counts().index

# set_style only affects axes created afterwards, so it has to run before
# plt.subplots for this figure to pick it up.
sns.set_style('dark')
fig, ax = plt.subplots(1, 3, figsize=(15, 4), sharey=True)

sns.countplot(data=train_df, y='discourse_type', order=type_order, ax=ax[0])

# No pre-sorting needed: `order` alone decides the bar order.
sns.barplot(data=avg_len_df,
            x='avg_word_length',
            y='discourse_type',
            order=type_order,
            ax=ax[1])

sns.boxplot(data=train_df, y='discourse_type', x='num_words',
            order=type_order,
            ax=ax[2])

ax[0].set_title("Frequency of Discourse types")
ax[1].set_title("Average discourse type length")
ax[2].set_title("discourse type word count quantile distributions")

# Blank the y-axis label of the first panel. This must happen after plotting,
# because seaborn writes the label; Axes.set_label would only set the legend label.
ax[0].set_ylabel('')

plt.show()

In [None]:
# Shortest and longest element (in words) for each discourse type.
# Aggregations are given by name ('min'/'max'): passing the Python builtins
# min/max is deprecated in recent pandas and emits a FutureWarning.
word_range_df = (
    train_df.groupby('discourse_type')['num_words']
    .agg(['min', 'max'])
    .rename(columns={'min': 'Min Words', 'max': 'Max Words'})
    .reset_index()
)

_, ax = plt.subplots(1, 2, sharey=True, figsize=(15, 5))
sns.barplot(data=word_range_df, y='discourse_type', x='Min Words', ax=ax[0])
sns.barplot(data=word_range_df, y='discourse_type', x='Max Words', ax=ax[1])
ax[0].set_title("Minimum words per discourse type")
ax[1].set_title("Maximum words per discourse type")
plt.show()

For every discourse type there are a few segments whose length is <= 2 words.

In [None]:
# Share of each discourse type among very short elements (at most two words).
short_type_counts = train_df.loc[train_df['num_words'] <= 2, 'discourse_type'].value_counts()
plt.pie(short_type_counts, labels=short_type_counts.index)
plt.show()

In [None]:
# Same breakdown as a table: discourse types of elements with at most two words.
train_df.loc[train_df['num_words'] <= 2, 'discourse_type'].value_counts()

Some claims have very few words (<= 2) and therefore carry very little context.

**Some example texts from these short claims:**

In [None]:
# Text of all Claim elements that are at most two words long.
short_claims = train_df.loc[
    (train_df['num_words'] <= 2) & (train_df['discourse_type'] == 'Claim'),
    'discourse_text',
]

# Fixed seed so Restart & Run All shows the same examples every time; the
# sample size is capped so the cell does not raise when fewer than 10 rows match.
pprint(short_claims.sample(n=min(10, len(short_claims)), random_state=42).values)

# Read essays

In [None]:
# Read every essay text file into a DataFrame with one row per essay.
essay_folder = '../input/feedback-prize-2021/train'

essay_records = []
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# resulting frame (and the .head() preview) reproducible across runs.
for filename in sorted(os.listdir(essay_folder)):
    essay_id, extension = os.path.splitext(filename)
    if extension != '.txt':
        # Skip anything that is not an essay file (hidden files, sub-folders, ...).
        continue
    filepath = os.path.join(essay_folder, filename)
    # Explicit encoding so the decoded text does not depend on the platform default.
    # NOTE(review): assumes the essay files are UTF-8 - confirm against the dataset.
    with open(filepath, encoding='utf-8') as file:
        essay_records.append({
            'id': essay_id,
            'content': file.read()
        })

# Build the frame once from the collected records.
essay_df = pd.DataFrame(essay_records, columns=['id', 'content'])
essay_df['total_num_chars'] = essay_df['content'].str.len()
essay_df['total_num_words'] = essay_df['content'].str.split().str.len()

essay_df.head()

# discourse type positions

In [None]:
# Express where each discourse element starts and ends as a percentage of the
# essay's total character count.
position_df = (
    train_df[['id', 'discourse_type', 'discourse_start', 'discourse_end']]
    .copy()
    .merge(essay_df, on='id')  # 'id' is the only column the two frames share
    .assign(
        discourse_start_percentile=lambda df: 100 * df['discourse_start'].div(df['total_num_chars']),
        discourse_end_percentile=lambda df: 100 * df['discourse_end'].div(df['total_num_chars']),
    )
)

position_df.head()

In [None]:
# Distinct discourse type labels, in order of first appearance.
train_df['discourse_type'].unique()

In [None]:
# Distribution of the relative end position of each discourse type within its essay.
discourse_type_order = [
    'Lead', 'Position', 'Claim',
    'Counterclaim', 'Evidence', 'Rebuttal', 'Concluding Statement',
]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title("positions in which discourse type occurs w.r.t Essays")
sns.boxplot(
    data=position_df,
    x='discourse_type',
    y='discourse_end_percentile',
    order=discourse_type_order,
    ax=ax,
)
plt.show()

**1. Lead and Position occur mostly at the beginning of the essays.**

**2. Counterclaim, Evidence, and Rebuttal occur at roughly similar positions.**

**3. The Concluding Statement is found at the end of the essay most of the time.**

In [None]:
# Full raw text of one example essay.
essay_df.loc[essay_df['id'] == 'A8445CABFECE', 'content'].values[0]

In [None]:
# Print every annotated element of the same essay: its type, then its text,
# then a blank separator line.
essay_rows = train_df[train_df['id'] == 'A8445CABFECE']
for discourse_type, discourse_text in zip(essay_rows['discourse_type'],
                                          essay_rows['discourse_text']):
    print(discourse_type, discourse_text, sep='\n', end='\n\n')

# coverage of the discourse elements in essays

In [None]:
# Coverage = annotated words per essay / total words in the essay.
annotated_words = train_df.groupby('id')[['num_words']].sum().reset_index()
essay_word_totals = essay_df[['id', 'total_num_words']].copy()

data = (
    annotated_words
    .merge(essay_word_totals)
    .assign(coverage=lambda df: df['num_words'].div(df['total_num_words']))
)

data.head()

In [None]:
# Three views of per-essay coverage: spread, histogram, and coverage vs. essay length.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle("Coverage of the Discourse Elements in the essays.")

sns.boxplot(data=data, y='coverage', ax=axes[0])
sns.histplot(data=data, x='coverage', ax=axes[1])
sns.scatterplot(data=data, x='total_num_words', y='coverage', ax=axes[2])

plt.show()

In [None]:
# Essays whose annotated elements cover less than this fraction of their words.
# The printed labels are built from the same constant as the filter, so they
# cannot drift apart (the text previously said "<0.2" while filtering at 0.7).
COVERAGE_THRESHOLD = 0.7
low_coverage = data[data.coverage < COVERAGE_THRESHOLD]

print("Number of essays with <{} coverage:".format(COVERAGE_THRESHOLD), len(low_coverage))
print("Percent of essays with <{} coverage: {:.4f}".format(
    COVERAGE_THRESHOLD, 100 * len(low_coverage) / len(data)))

In [None]:
# Same three coverage views, restricted to essays with coverage above 0.7.
high_coverage = data[data.coverage > 0.7]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle("Coverage of the Discourse Elements in the essays with >0.7 coverage.")

sns.boxplot(data=high_coverage, y='coverage', ax=axes[0])
sns.histplot(data=high_coverage, x='coverage', ax=axes[1])
sns.scatterplot(data=high_coverage, x='total_num_words', y='coverage', ax=axes[2])

plt.show()

**More than 98% of the essays have >70% coverage by discourse elements.**

**Different discourse types tend to occur at different positions within the essays.**

**In the training set, the given discourse elements cover >70% of the words in almost every essay, so most of an essay's text belongs to some discourse element.**