# Feedback Prize - Yet Another EDA (work in progress)

This is an EDA of the data provided by the [Feedback Prize](https://www.kaggle.com/c/feedback-prize-2021) competition on Kaggle.

In [None]:
import os
import numpy as np
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import wordcloud
import spacy
import nltk
from collections import Counter

# Seed Python's `random` module, which the random.sample calls in later cells draw from.
random.seed(42)
# Set the default figure size (12 x 6) for the seaborn/matplotlib plots below.
sns.set(rc={'figure.figsize':(12,6)})

Load the CSV file and the list of essays (I'll refer to each train file as an *essay*).

In [None]:
train_dir = "../input/feedback-prize-2021/train"
test_dir = "../input/feedback-prize-2021/test"

# os.listdir returns entries in arbitrary order, so sort them: otherwise the seeded
# random.sample calls in later cells could pick different essays from run to run.
# Paths are joined with "/" on purpose, because a later cell recovers the essay id
# with split('/').
train_files = [str(train_dir) + "/" + str(name) for name in sorted(os.listdir(train_dir))]
test_files = [str(test_dir) + "/" + str(name) for name in sorted(os.listdir(test_dir))]

# Discourse annotations for the training essays, one row per annotated discourse.
df_train = pd.read_csv("../input/feedback-prize-2021/train.csv")

In [None]:
# Show the first two rows of the training annotations.
df_train.head(2)

Analyzing the label structure of the first paragraph of the `0000D23A521A` essay: <br><br>
*Some people belive that the so called "face" on mars was created by life on mars. This is not the case. The face on Mars is a naturally occuring land form called a mesa. It was not created by aliens, and there is no consiracy to hide alien lifeforms on mars. There is no evidence that NASA has found that even suggests that this face was created by aliens.*

In [None]:
# Widen text columns so the discourse text is readable in the rendered table.
pd.set_option('display.max_colwidth', 200)

# Annotation rows of one essay, restricted to the columns that describe each label.
label_columns = ['discourse_start', 'discourse_end', 'discourse_text', 'predictionstring',  'discourse_type' ]
is_target_essay = df_train['id']=='0000D23A521A'
df_train[is_target_essay][label_columns]

Note that punctuation and apostrophes are included within the word span. For instance, `mars.` or `NASA's` counts as only one position in `predictionstring`, so the correct way to split the text appears to be a simple split on whitespace.

Here I convert some data types and create some counts.

In [None]:
# Cast label/identifier columns to `category` and the start/end offsets to int.
df_train['discourse_type'] = df_train['discourse_type'].astype('category')
df_train['discourse_type_num'] = df_train['discourse_type_num'].astype('category')
df_train['discourse_start'] = df_train['discourse_start'].astype(int)
df_train['discourse_end'] = df_train['discourse_end'].astype(int)
df_train['discourse_id'] = df_train['discourse_id'].astype('category')
df_train['id'] = df_train['id'].astype('category')

# Words per discourse. A bare .str.split() splits on any run of whitespace, so double
# spaces, newlines and leading/trailing blanks do not produce empty "words" the way
# split(' ') does.
df_train['discourse_words'] = df_train['discourse_text'].str.split().str.len()
# Span length, computed from the annotated start/end offsets.
df_train['discourse_len'] = df_train['discourse_end'] - df_train['discourse_start']
# Lower-cased copy of the text, used by the word-frequency cells.
df_train['discourse_text_lower'] = df_train['discourse_text'].str.lower()

## 1. Some statistics from data

In [None]:
# Report how many essay files were listed in the train and test directories.
print("Total number of train files = " , len(train_files))
print("Total number of test files = " , len(test_files))

Check that the number of unique essay IDs equals the number of train files.

In [None]:
# Every listed train file should correspond to exactly one essay id in train.csv.
n_unique_essay_ids = len(df_train['id'].unique())
assert n_unique_essay_ids == len(train_files)

Plot `discourse_type` counts

In [None]:
# Number of annotated discourses for each discourse_type.
ax = sns.histplot(data=df_train, x="discourse_type")
# Rotate the x tick labels by 45 degrees; assigning the result keeps it from being
# echoed as the cell's output.
ticks = plt.xticks(rotation=45)

Plot `discourse_type_num` grouped by `discourse_type`

In [None]:
# Counts of each discourse_type_num value, coloured by discourse_type.
# The original cell also drew a barplot with a constant y of ones on the same axes;
# it carried no information and was painted over this histogram, so it is removed.
ax = sns.histplot(data=df_train, x="discourse_type_num", hue='discourse_type')
# Rotate the x tick labels by 90 degrees; assigning the result keeps it from being
# echoed as the cell's output.
ticks = plt.xticks(rotation=90)


Since Claim and Evidence are the most frequent types, it's natural for them to have more items too.

Now let's plot a box plot of the number of discourses per essay

In [None]:
# Number of annotated discourses in each essay, as a two-column frame (id, counts).
discourses_per_essay = df_train.groupby('id').size()
df_count = discourses_per_essay.rename('counts').reset_index()

# Distribution of that count across essays, plus its summary statistics.
ax = sns.boxplot(x=df_count['counts'])
labels = ax.set(xlabel='Number of discourses by essay')
print(df_count['counts'].describe())

Let's plot the box plot of number of words by `discourse_type`

In [None]:
# Word count per discourse, one box per discourse_type (outliers hidden via showfliers=False).
ax = sns.boxplot(data=df_train, y='discourse_words', x='discourse_type', orient='v', showfliers = False)
# The word count is on the y axis and the x axis holds the discourse types, so label
# each axis accordingly.
labels = ax.set(xlabel='Discourse type', ylabel='Number of words by discourse')
ticks = plt.xticks(rotation=45)

## 2. Text Analysis

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NLTK's English stop-word list as a set; used to filter tokens in the word-frequency cell.
stop_words = set(stopwords.words('english'))


Sample train essay

In [None]:
# Draw one training essay path at random and print the file's full text.
train_sample_path = random.sample(train_files,1)[0]
with open(train_sample_path, "r") as essay:
    essay_text = essay.read()
    print(essay_text)

Sample test essay

In [None]:
# Draw one test essay path at random and print the file's full text.
test_sample_path = random.sample(test_files,1)[0]
with open(test_sample_path, "r") as essay:
    essay_text = essay.read()
    print(essay_text)

<b>Let's visualize the discourse types on a sample essay text</b>


In [None]:
## code from: https://www.kaggle.com/odins0n/feedback-prize-eda

# Draw one essay at random; its id is the file name minus its last four characters
# (presumably the ".txt" extension - confirm against the data directory).
sample_essay = random.sample(train_files,1)[0]
sample_id = sample_essay.split('/')[-1][:-4]

# One displaCy span per annotated discourse of that essay.
essay_rows = df_train[df_train['id'] == sample_id]
ents = [
    {
        'start': int(row['discourse_start']),
        'end': int(row['discourse_end']),
        'label': row['discourse_type'],
    }
    for _, row in essay_rows.iterrows()
]

with open(sample_essay, 'r') as file:
    data = file.read()

# Manual-mode displaCy input: the raw text plus the precomputed spans.
doc2 = {"text": data, "ents": ents}

# Highlight colour for each discourse type.
colors = {
    'Lead': '#EE11D0',
    'Position': '#AB4DE1',
    'Claim': '#1EDE71',
    'Evidence': '#33FAFA',
    'Counterclaim': '#4253C1',
    'Concluding Statement': 'yellow',
    'Rebuttal': 'red',
}
options = {"ents": df_train['discourse_type'].unique().tolist(), "colors": colors}
spacy.displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True);

**It is important to notice that several labels consist of multiple sentences, and that not all of the essay text has a label.**

Word cloud from the train data. I removed stop words and punctuation, and converted the text to lower case.

In [None]:
# Concatenate every discourse text, strip punctuation and lower-case the result, so the
# train cloud is built from text prepared the same way as the test cloud (the original
# cell skipped the lower-casing).
punctuation_table = str.maketrans('', '', string.punctuation)
all_text = df_train['discourse_text'].str.cat(sep=' ').translate(punctuation_table).lower()
img_wordcloud = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, background_color='black').generate(all_text)
ax = plt.imshow(img_wordcloud, interpolation='bilinear')
# Assign the return value so the axis limits are not echoed as the cell's output.
ax = plt.axis("off")

Now let's visualize the same for the test data

In [None]:
# Read every test essay, strip punctuation and lower-case it; each piece keeps a leading
# space so the joined string matches a plain running concatenation.
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned_essays = []
for test_file in test_files:
    with open(test_file) as file:
        cleaned_essays.append(' ' + file.read().translate(punctuation_table).lower())
all_test_text = ''.join(cleaned_essays)

# Word cloud of the combined test text, drawn without axes.
img_wordcloud = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, background_color='black').generate(all_test_text)
ax = plt.imshow(img_wordcloud, interpolation='bilinear')
ax = plt.axis("off")

The images are a little different, but we can see some words in both clouds.

Now let's count word frequency for each type of discourse

In [None]:
# Long-format frame with one row per (discourse_type, word).
# Splitting into lists and exploding avoids str.split(expand=True), which materialises a
# rows x longest-discourse frame before stacking; the resulting words and their order
# are the same. The positional `level_1` column of the old stacked frame is not kept.
df_wordfreq = (
    df_train[['discourse_type', 'discourse_text_lower']]
    .assign(discourse_word=lambda frame: frame['discourse_text_lower'].str.split(' '))
    .drop(columns='discourse_text_lower')
    .explode('discourse_word')
    .reset_index(drop=True)
)

# For each discourse type: all of its words joined by single spaces, punctuation removed.
punctuation_table = str.maketrans('', '', string.punctuation)
discourse_text_by_type = {}
for discourse_type in df_train['discourse_type'].unique():
    is_type = df_wordfreq['discourse_type'] == discourse_type
    words_of_type = df_wordfreq.loc[is_type, 'discourse_word']
    discourse_text_by_type[discourse_type] = words_of_type.str.cat(sep=' ').translate(punctuation_table)

In [None]:
# Print the ten most frequent non-stop-word tokens for every discourse type.
for discourse_type, text in discourse_text_by_type.items():
    tokens = word_tokenize(text)
    cnt = Counter(token for token in tokens if token not in stop_words)
    top_words = cnt.most_common(10)
    print(' --------  Most common words for discourse type ', discourse_type, '---------- \n')
    print(top_words, '\n')

## 3. Entity recognition using Spacy

In [None]:
# pandarallel supplies the `parallel_apply` method used on df_train in the next cell.
from pandarallel import pandarallel
# Initialise it here, before parallel_apply is called.
pandarallel.initialize()

In [None]:
# Load spaCy's small English pipeline with the tagger, parser, lemmatizer and textcat
# components disabled; only `doc.ents` is read from its output in this notebook.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'lemmatizer', 'textcat'])

def show_ents(row):
    """Return the text of every entity spaCy finds in ``row``.

    ``row`` is passed straight to the module-level ``nlp`` pipeline. The result is a
    list of ``ent.text`` strings, empty when the document has no entities.
    """
    return [entity.text for entity in nlp(row).ents]

# Run show_ents over every discourse text via pandarallel's parallel_apply and store the
# returned list of entity strings in a new column.
df_train['spacy_entities'] = df_train['discourse_text'].parallel_apply(show_ents)

Let's print the most common entities by `discourse_type`

In [None]:
# Long-format frame with one row per (discourse_type, entity).
# explode() turns each list element into its own row and yields NaN for empty lists,
# which dropna() removes. This replaces building a pd.Series per row with apply(axis=1)
# and stacking, which gives the same entities in the same order but far more slowly.
# The name df_wordfreq is kept from the original cell, although it now holds entities.
df_wordfreq = (
    df_train[['discourse_type', 'spacy_entities']]
    .explode('spacy_entities')
    .dropna(subset=['spacy_entities'])
    .reset_index(drop=True)
)

# For each discourse type: the flat list of all entity strings found in its discourses.
discourse_entities_by_type = {}
for discourse_type in df_train['discourse_type'].unique():
    is_type = df_wordfreq['discourse_type'] == discourse_type
    discourse_entities_by_type[discourse_type] = df_wordfreq.loc[is_type, 'spacy_entities'].to_list()

In [None]:
# Print the ten most frequent entity strings for every discourse type.
for discourse_type, text in discourse_entities_by_type.items():
    cnt = Counter(text)
    top_entities = cnt.most_common(10)
    print(' --------  Most common Entities for discourse type ', discourse_type, '---------- \n')
    print(top_entities, '\n')