# Table of Contents
* [Import and EDA](#1)
* [Wordclouds](#2)
* [Text Examples](#3)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plot
import matplotlib.pyplot as plt

# wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

<a id='1'></a>
# Import and EDA

In [None]:
# read the training CSV into a data frame
train_csv_path = '../input/feedback-prize-2021/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# quick look at the first five rows
df.head(n=5)

In [None]:
# structure of data frame: prints column names, non-null counts and dtypes
df.info()

In [None]:
# number of rows per document id, most frequent id first
df['id'].value_counts()

#### So we have 15,594 documents, and a single document can have up to 26 discourse elements.

In [None]:
# all rows belonging to one document that has many discourses
df.loc[df['id'] == '71259B3EA87F']

In [None]:
# histogram of discourse start positions, drawn on an explicitly created axes
fig, ax = plt.subplots()
df['discourse_start'].plot.hist(bins=50, ax=ax)
ax.set_title('discourse_start')
ax.grid()
plt.show()

In [None]:
# histogram of discourse end positions, drawn on an explicitly created axes
fig, ax = plt.subplots()
df['discourse_end'].plot.hist(bins=50, ax=ax)
ax.set_title('discourse_end')
ax.grid()
plt.show()

In [None]:
# new column: span of each discourse, computed as end minus start
df['discourse_len'] = df['discourse_end'] - df['discourse_start']

In [None]:
# histogram of the derived discourse length column
fig, ax = plt.subplots()
df['discourse_len'].plot.hist(bins=50, ax=ax)
ax.set_title('discourse_len')
ax.grid()
plt.show()

In [None]:
# frequency of each discourse type: counted once, printed as a table,
# then shown as a bar chart
type_counts = df['discourse_type'].value_counts()
print(type_counts)

fig, ax = plt.subplots()
type_counts.plot.bar(ax=ax)
ax.set_title('discourse_type')
ax.grid()
plt.show()

#### Mean length by discourse type:

In [None]:
# average discourse_len within each discourse type, rounded to two decimals
mean_len_by_type = df.groupby('discourse_type')['discourse_len'].mean()
mean_len_by_type.round(2)

#### Frequency by discourse type:

In [None]:
# number of non-null id values within each discourse type
df.groupby('discourse_type')['id'].count()

In [None]:
# frequency of each value in the discourse_type_num column
df['discourse_type_num'].value_counts()

<a id='2'></a>
# Wordclouds

### Full text:

In [None]:
# take a private copy of the package's stop-word collection: set() builds a new
# set object, so changes to `stopwords` would not touch the imported STOPWORDS
stopwords = set(STOPWORDS)

In [None]:
# concatenate every discourse text into one space-separated string
text = ' '.join(df['discourse_text'])

# gather the cloud settings in one place, then generate from the joined text
cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color='white',
)
wordcloud = WordCloud(**cloud_options).generate(text)

# render the cloud as an image without axis ticks
plt.figure(figsize=(14, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Drilldown to specific type:

In [None]:
# keep only the rows whose discourse type is 'Lead'
is_lead = df['discourse_type'] == 'Lead'
df_temp = df.loc[is_lead]

In [None]:
# concatenate the discourse texts of the filtered frame into one string
text = ' '.join(df_temp['discourse_text'])

# gather the cloud settings in one place, then generate from the joined text
cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color='white',
)
wordcloud = WordCloud(**cloud_options).generate(text)

# render the cloud as an image without axis ticks
plt.figure(figsize=(14, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

<a id='3'></a>
# Text Examples

In [None]:
# Read one raw text file from the train folder as a list of lines.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data.
with open('../input/feedback-prize-2021/train/0000D23A521A.txt', encoding='utf-8') as f:
    lines_train = f.readlines()

lines_train

In [None]:
# Read one raw text file from the test folder as a list of lines.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data.
with open('../input/feedback-prize-2021/test/0FB0700DAF44.txt', encoding='utf-8') as f:
    lines_test = f.readlines()

lines_test