In [None]:
import os
import numpy as np 
import pandas as pd

from spacy import displacy

# Notebook display settings: show up to 200 characters per DataFrame cell
pd.options.display.max_colwidth = 200

In [None]:
# Competition inputs: the directory of essay text files and the annotation CSV
input_root = '../input/feedback-prize-2021'
train_dir = input_root + '/train'
train_csv_path = input_root + '/train.csv'

# EDA

In [None]:
# Total number of files in the training directory.
# os.listdir returns names in arbitrary order, so sort them to make every
# later positional lookup (train_files[index]) reproducible across runs.
train_files = sorted(os.listdir(train_dir))
len(train_files)

In [None]:
# Load the annotation table and preview its first five rows
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)
train_df.head(n=5)

In [None]:
# Distinct discourse labels as a plain Python list
train_df['discourse_type'].unique().tolist()

In [None]:
# Number of annotation rows in the CSV
train_df.shape[0]

There are *15,594* text files in the training directory but *144,293* rows in the CSV file. That means one file may contain multiple labels/segments, so this can be treated as a **multi-label classification problem**, similar to NER.

In [None]:
# Missing-value count per column
train_df.isna().sum()

In [None]:
# Column labels of the annotation table as a plain list
cols = train_df.columns.tolist()
cols

In [None]:
# Type of data in each column (one pandas dtype per column of train_df)
train_df.dtypes

In [None]:
# How many rows carry each discourse label
train_df['discourse_type'].value_counts()

In [None]:
# Pie chart of the label counts; ylabel='' leaves the y-axis label blank
train_df['discourse_type'].value_counts().plot.pie(figsize=(8, 8), ylabel='')

Clearly the dataset is not balanced: *Evidence* and *Claim* have many examples, whereas the other labels have very few.

In [None]:
# Distribution of discourse lengths (number of characters per discourse text).
# Histogram the lengths themselves: calling value_counts() first would plot
# how often each *count* occurs instead of how long the texts are.
discourse_lengths = train_df['discourse_text'].str.len()
ax = discourse_lengths.hist(bins=50)
ax.set(xlabel='characters per discourse', ylabel='number of discourses');

# Visualization 

In [None]:
# View a text file
index = 1
file_path = os.path.join(train_dir, train_files[index])
# Use a context manager so the file handle is closed as soon as it is read,
# and decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the essays are UTF-8 encoded — confirm for this dataset.
with open(file_path, encoding='utf-8') as essay_file:
    file_content = essay_file.read()
file_content

In [None]:
# View discourses
# Strip only the final extension, so an id that itself contains a dot is kept
# whole (split('.')[0] would cut it at the first dot).
file_id = os.path.splitext(train_files[index])[0]
train_df[train_df['id'] == file_id]

In [None]:
# Visualizing the segments.
# Highlight colour for each discourse label, listed as (label, hex colour) pairs.
colors = dict([
    ('Lead', '#F5B7B1'),
    ('Position', '#D7BDE2'),
    ('Evidence', '#AED6F1'),
    ('Claim', '#A3E4D7'),
    ('Concluding Statement', '#F9E79F'),
    ('Counterclaim', '#E5E7E9'),
    ('Rebuttal', '#FEF9E7'),
])

def visualize_segments(df, text, file_id):
    """Render one essay inline with its discourse segments highlighted.

    Args:
        df: DataFrame with at least the columns ``id``, ``discourse_start``,
            ``discourse_end`` and ``discourse_type``.
        text: Full text of the essay. The start/end values of each row are
            handed to displaCy as the span boundaries within this string.
        file_id: Value of the ``id`` column that selects the essay's rows;
            also used, with a ``.txt`` suffix, as the rendered title.

    Returns:
        None. The highlighted text is displayed by ``displacy.render``
        (called with ``jupyter=True``).

    Notes:
        Highlight colours come from the module-level ``colors`` mapping.
    """
    seg_df = df[df['id'] == file_id]

    # One displaCy entity dict per annotation row. int() turns the start/end
    # values into plain Python ints, as the original per-row conversion did.
    segments = [
        {'start': int(start), 'end': int(end), 'label': label}
        for start, end, label in zip(
            seg_df['discourse_start'],
            seg_df['discourse_end'],
            seg_df['discourse_type'],
        )
    ]
    # Put spans in document order rather than CSV row order.
    # NOTE(review): displaCy's manual entity mode appears to walk the spans
    # sequentially, so unsorted spans could garble the output — confirm
    # against the installed spaCy version.
    segments.sort(key=lambda seg: (seg['start'], seg['end']))

    text_info = {
        'text': text,
        'ents': segments,
        'title': f'{file_id}.txt'
    }

    configs = {
        'colors': colors
    }

    displacy.render(
        text_info,
        style='ent',
        options=configs,
        manual=True,
        jupyter=True
    )

In [None]:
# Render the essay loaded above with its annotated segments highlighted
visualize_segments(df=train_df, text=file_content, file_id=file_id)