In [None]:
from __future__ import annotations

!pip install nlplot

# Preparation
## Library imports

In [None]:
import os
from typing import List, Optional

import matplotlib.pyplot as plt
import nlplot
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from plotly.subplots import make_subplots
from spacy import displacy
from tqdm import tqdm

## General Functions

In [None]:
def read_text(data_dir: str, doc_id: str) -> str:
    """Return the full text of one training essay.

    Args:
        data_dir: Dataset root directory; the essay is read from
            ``<data_dir>/train/<doc_id>.txt``.
        doc_id: Essay identifier (the file name without the ``.txt`` suffix).

    Returns:
        The whole file content as a single string.

    Raises:
        FileNotFoundError: If no file exists for ``doc_id``.
    """
    path = os.path.join(data_dir, 'train', f'{doc_id}.txt')
    # Pin the encoding: without it, decoding depends on the platform locale
    # and non-ASCII characters may be misread or raise on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


def make_train_all_text(data_dir: str) -> pd.core.frame.DataFrame:
    """Load the raw text of every essay listed in ``train.csv``.

    Args:
        data_dir: Dataset root directory containing ``train.csv`` and the
            ``train/`` folder of essay text files.

    Returns:
        A DataFrame with one row per unique essay id and two columns:
        ``id`` and ``text``.
    """
    discourse_table = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    # train.csv has several rows per essay, so collapse to unique ids first.
    essay_ids = discourse_table['id'].unique().tolist()
    essay_texts = [read_text(data_dir, essay_id) for essay_id in tqdm(essay_ids)]
    return pd.DataFrame({'id': essay_ids, 'text': essay_texts})

In [None]:
DATA_DIR = '../input/feedback-prize-2021'

# One row per essay (id, text) and one row per annotated discourse element.
train_all_text = make_train_all_text(DATA_DIR)
train_discourse = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

# 1-based position of each discourse element within its essay, in file row
# order (presumably reading order; verify against the dataset).
# cumcount() numbers rows per group directly, so no helper column of ones is
# needed and the non-numeric columns are never fed to a cumulative sum.
train_discourse['number'] = train_discourse.groupby('id').cumcount() + 1

# Per-type subsets used by the word cloud / word-count sections below.
train_discourse_lea = train_discourse.query(' discourse_type=="Lead" ')
train_discourse_pos = train_discourse.query(' discourse_type=="Position" ')
train_discourse_evi = train_discourse.query(' discourse_type=="Evidence" ')
train_discourse_con = train_discourse.query(' discourse_type=="Concluding Statement" ')
train_discourse_cou = train_discourse.query(' discourse_type=="Counterclaim" ')
train_discourse_reb = train_discourse.query(' discourse_type=="Rebuttal" ')

In [None]:
# Preview the first two rows of the per-essay text table (columns: id, text).
display(train_all_text.head(2))

In [None]:
# Show every annotated discourse row belonging to one sample essay.
display(train_discourse.query(' id == "423A1CA112E2" '))

# Label Visualization

This section was inspired by https://www.kaggle.com/odins0n/feedback-prize-eda — please upvote that notebook too! :)

In [None]:
def visualize_label(
    df: pd.core.frame.DataFrame,
    doc_id: str,
    data_dir: str) -> None:
    """Render one essay inline with its discourse annotations highlighted.

    Args:
        df: Discourse annotation table; the columns ``id``,
            ``discourse_start``, ``discourse_end`` and ``discourse_type``
            are read.
        doc_id: Identifier of the essay to render.
        data_dir: Dataset root directory, forwarded to ``read_text``.

    Returns:
        None. The highlighted text is displayed via ``displacy.render``.
    """
    essay_rows = df.query(f' id == "{doc_id}" ')

    # displaCy's manual mode takes plain dicts carrying start/end offsets
    # and a label for every span.
    spans = [
        {'start': int(start), 'end': int(end), 'label': label}
        for start, end, label in zip(
            essay_rows['discourse_start'],
            essay_rows['discourse_end'],
            essay_rows['discourse_type'],
        )
    ]
    document = {'text': read_text(data_dir, doc_id), 'ents': spans}

    # One fixed background colour per discourse type.
    palette = {
        'Lead': '#dad1f6',
        'Position': '#f9d5de',
        'Claim': '#adcfad',
        'Evidence': '#fbbf9a',
        'Counterclaim': '#bdf2fa',
        'Concluding Statement': '#eea69e',
        'Rebuttal': '#d1f8f4',
    }
    # The label list comes from the whole table, not just this essay.
    render_options = {
        'ents': df['discourse_type'].unique().tolist(),
        'colors': palette,
    }
    displacy.render(document, style='ent', options=render_options, manual=True, jupyter=True)

In [None]:
# Render the labelled discourse spans of one sample essay inline.
visualize_label(train_discourse, '423A1CA112E2', DATA_DIR)

# Flow of discourse type

In [None]:
def get_specified_discourse_flow(
    df: pd.core.frame.DataFrame,
    discourse_flow: Optional[List[str]] = None) -> pd.core.frame.DataFrame:
    """Keep only the essays whose opening discourse types match a given sequence.

    Args:
        df: Discourse annotation table with the columns ``id``, ``number``
            (1-based position of the element inside its essay) and
            ``discourse_type``.
        discourse_flow: Expected discourse types for positions 1, 2, ...;
            ``None`` or an empty list applies no filter.

    Returns:
        All rows of the essays whose element at position ``i + 1`` has type
        ``discourse_flow[i]`` for every ``i``. With no flow given, ``df`` is
        returned unchanged.
    """
    for position, discourse_type in enumerate(discourse_flow or [], start=1):
        # Boolean masks instead of a string-built query(): labels containing
        # quotes or other special characters can no longer break the filter.
        matches = (df['number'] == position) & (df['discourse_type'] == discourse_type)
        matching_ids = df.loc[matches, 'id'].unique()
        # Narrow to the surviving essays but keep all of their rows, so later
        # positions can still be checked.
        df = df[df['id'].isin(matching_ids)]
    return df


def plot_pie_chart(
    df: pd.core.frame.DataFrame,
    number: int,
    discourse_flow: Optional[List[str]] = None
) -> None:
    """Plot which discourse type appears at a position, given an opening flow.

    Essays are first restricted to those whose opening elements match
    ``discourse_flow``. The pie then shows the discourse types found at
    position ``number``, plus an ``End`` slice for essays whose last element
    is at position ``number - 1``.

    Args:
        df: Discourse annotation table with the columns ``id``, ``number``
            and ``discourse_type``.
        number: 1-based position whose discourse types are plotted.
        discourse_flow: Discourse types required at positions 1, 2, ...;
            ``None`` or an empty list applies no filter.

    Returns:
        None. Prints the number of matching essays and shows the figure.
    """
    # Normalise to a list so the helper always receives an iterable.
    flow = list(discourse_flow) if discourse_flow is not None else []
    df_flow = get_specified_discourse_flow(df, flow)

    # Count essays, not rows: each essay contributes several rows.
    print(f'Number of ids: {df_flow["id"].nunique()}')

    # Essays that stop right before the requested position.
    last_numbers = df_flow.groupby('id')['number'].last()
    num_end = int((last_numbers == number - 1).sum())

    cnt = df_flow.loc[df_flow['number'] == number, 'discourse_type'].value_counts()
    labels = cnt.index.tolist() + ['End']
    sizes = cnt.values.tolist() + [num_end]

    # A pie cannot be normalised when every slice is zero.
    if sum(sizes) == 0:
        print('Nothing to plot: no essay matches the requested flow and position.')
        return

    fig, ax = plt.subplots(figsize=(6, 6))
    # Percentages are only printed on slices of at least 8%, to avoid
    # overlapping text on thin wedges.
    ax.pie(sizes, labels=labels, counterclock=False, startangle=90,
           autopct=lambda p: '{:.1f}'.format(p) if p >= 8 else '')
    ax.axis('equal')
    plt.show()

## First?

In [None]:
# Discourse types found at position 1 across all essays (empty flow = no filter).
plot_pie_chart(train_discourse, 1, [])

## Lead -> ?

In [None]:
# Among essays that open with a Lead: which type comes second ('End' = essay stops after the Lead).
plot_pie_chart(train_discourse, 2, ['Lead'])

## Lead -> Position -> ?

In [None]:
# Among essays that open Lead -> Position: which type is at position 3.
plot_pie_chart(train_discourse, 3, ['Lead', 'Position'])

## Lead -> Position -> Claim -> ?

In [None]:
# Among essays that open Lead -> Position -> Claim: which type is at position 4.
plot_pie_chart(train_discourse, 4, ['Lead', 'Position', 'Claim'])

## Lead -> Position -> Claim -> Claim -> ?

In [None]:
# Same analysis one step further: type at position 5 after two Claims.
plot_pie_chart(train_discourse, 5, ['Lead', 'Position', 'Claim', 'Claim'])

## Lead -> Position -> Claim -> Claim -> Claim -> ?

In [None]:
# Type at position 6 after Lead, Position and three Claims.
plot_pie_chart(train_discourse, 6, ['Lead', 'Position', 'Claim', 'Claim', 'Claim'])

## Lead -> Position -> Claim -> Claim -> Claim -> Evidence -> ?

In [None]:
# Type at position 7 once the first Evidence has followed the three Claims.
plot_pie_chart(train_discourse, 7, ['Lead', 'Position', 'Claim', 'Claim', 'Claim', 'Evidence'])

## Lead -> Position -> Claim -> Claim -> Claim -> Evidence -> Evidence -> ?

In [None]:
# Type at position 8 after two consecutive Evidence elements.
plot_pie_chart(train_discourse, 8, ['Lead', 'Position', 'Claim', 'Claim', 'Claim', 'Evidence', 'Evidence'])

## Lead -> Position -> Claim -> Claim -> Claim -> Evidence -> Evidence -> Evidence -> ?

In [None]:
# Type at position 9 after three consecutive Evidence elements.
plot_pie_chart(
    train_discourse,
    9,
    ['Lead', 'Position', 'Claim', 'Claim', 'Claim', 'Evidence', 'Evidence', 'Evidence']
)

## Lead -> Position -> Claim -> Claim -> Claim -> Evidence -> Evidence -> Evidence -> Concluding Statement -> ?

In [None]:
# Type at position 10 after the Concluding Statement; the 'End' slice counts
# essays that finish with that Concluding Statement.
plot_pie_chart(
    train_discourse,
    10,
    ['Lead', 'Position', 'Claim', 'Claim', 'Claim', 'Evidence', 'Evidence', 'Evidence', 'Concluding Statement']
)

# N-gram

In [None]:
# One NLPlot helper per corpus: the full essay texts, plus the discourse
# segments of each type that has its own subset frame.
# NOTE(review): no Claim-specific helper is built here because no Claim subset
# frame is created earlier — confirm whether that omission is intentional.
npt_all_text = nlplot.NLPlot(train_all_text, target_col='text')
npt_lea = nlplot.NLPlot(train_discourse_lea, target_col='discourse_text')
npt_pos = nlplot.NLPlot(train_discourse_pos, target_col='discourse_text')
npt_evi = nlplot.NLPlot(train_discourse_evi, target_col='discourse_text')
npt_con = nlplot.NLPlot(train_discourse_con, target_col='discourse_text')
npt_cou = nlplot.NLPlot(train_discourse_cou, target_col='discourse_text')
npt_reb = nlplot.NLPlot(train_discourse_reb, target_col='discourse_text')

In [None]:
# English stop words from NLTK. On a fresh environment the corpus may not be
# installed yet, in which case NLTK raises LookupError; fetch it once and retry
# so the notebook survives Restart & Run All.
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stop_words = stopwords.words('english')

# Extra terms added to the stop-word list on top of the NLTK defaults.
stop_words = english_stop_words + ['school', 'students', 'student', 'people']

In [None]:
# Build the top-50 uni-, bi- and tri-gram bar charts over the full essay texts,
# all with the same axis labels and stop-word list.
ngram_figures = [
    npt_all_text.bar_ngram(
        title=chart_title,
        xaxis_label='word count',
        yaxis_label='word',
        ngram=ngram_size,
        top_n=50,
        stopwords=stop_words
    )
    for ngram_size, chart_title in ((1, 'N1 gram'), (2, 'N2 gram'), (3, 'N3 gram'))
]
n1, n2, n3 = ngram_figures
# Only the first trace of each figure is reused in the combined layout.
trace1, trace2, trace3 = (figure['data'][0] for figure in ngram_figures)

# Lay the three charts out side by side in a single figure.
fig = make_subplots(rows=1, cols=3, subplot_titles=('N1', 'N2', 'N3'), shared_xaxes=False)
for column in (1, 2, 3):
    fig.update_xaxes(title_text='word count', row=1, col=column)

fig.update_layout(height=1800, width=1400, title_text='Nxx gram')
for column, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.add_trace(trace, row=1, col=column)
fig.show()

# Co-occurrence network & Sunburst chart

In [None]:
# Build nlplot's graph over the full essay texts, passing the shared stop-word list.
# NOTE(review): min_edge_frequency=2000 is presumably the minimum co-occurrence
# count for an edge to be kept — confirm against the nlplot documentation.
npt_all_text.build_graph(stopwords=stop_words, min_edge_frequency=2000)

In [None]:
# Draw the co-occurrence network for the full essay texts.
# NOTE(review): presumably relies on the graph prepared by build_graph — confirm.
npt_all_text.co_network(
    title='Co-occurrence network',
)

In [None]:
# Draw the sunburst chart for the full essay texts.
# NOTE(review): presumably relies on the graph prepared by build_graph — confirm.
npt_all_text.sunburst(
    title='Sunburst chart',
)

# Word cloud
## all discourse

In [None]:
# Word cloud over the full essay texts (up to 100 words, grey colour map),
# passing the shared stop-word list.
npt_all_text.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Lead

In [None]:
# Word cloud over the Lead segments, same settings as the all-text cloud.
npt_lea.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Position

In [None]:
# Word cloud over the Position segments, same settings as the all-text cloud.
npt_pos.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Evidence

In [None]:
# Word cloud over the Evidence segments, same settings as the all-text cloud.
npt_evi.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Concluding Statement

In [None]:
# Word cloud over the Concluding Statement segments, same settings as the all-text cloud.
npt_con.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Counterclaim

In [None]:
# Word cloud over the Counterclaim segments, same settings as the all-text cloud.
npt_cou.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

## Rebuttal

In [None]:
# Word cloud over the Rebuttal segments, same settings as the all-text cloud.
npt_reb.wordcloud(
    max_words=100,
    max_font_size=100,
    colormap='Greys',
    stopwords=stop_words
)

# Distribution of word count
## all text

In [None]:
# Word-count distribution over the full essay texts.
npt_all_text.word_distribution(title='All text')

## Lead

In [None]:
# Word-count distribution over the Lead segments.
npt_lea.word_distribution(title='Lead')

## Position

In [None]:
# Word-count distribution over the Position segments.
npt_pos.word_distribution(title='Position')

## Evidence

In [None]:
# Word-count distribution over the Evidence segments.
npt_evi.word_distribution(title='Evidence')

## Concluding Statement

In [None]:
# Word-count distribution over the Concluding Statement segments.
npt_con.word_distribution(title='Concluding Statement')

## Counterclaim

In [None]:
# Word-count distribution over the Counterclaim segments.
npt_cou.word_distribution(title='Counterclaim')

## Rebuttal

In [None]:
# Word-count distribution over the Rebuttal segments.
npt_reb.word_distribution(title='Rebuttal')