## Context

In this notebook, I inspect the text of each discourse type. To do this, I group the data by `id` and `discourse_type`, then join the texts that share the same id and type.

```
discourse_data = train_data.groupby(
    ['id', 'discourse_type']
)['discourse_text'].agg(lambda s: "\n\n".join(s)).unstack(level=1)
```

After that, I select the desired sequence of blocks.

```
# List of selected (sorted) columns
cols_list = ["Lead", "Position",
             "Claim", "Counterclaim", "Rebuttal",
             "Evidence", "Concluding Statement"]

discourse_data = discourse_data.loc[:, cols_list]
```

To analyze `discourse_data`, I use the following modules:

* **missingno** (matrix, bar, heatmap)
* **pyLDAvis** (used CountVectorizer, TfidfVectorizer, LatentDirichletAllocation)

## Clarification

Importing the pyLDAvis module produces errors, so it is imported later, in section "4. Topic-Term Visualization", rather than together with the other imports.

# 1. Import & Def & Set & Load

In [None]:
import os
import re

import numpy as np
import pandas as pd

import missingno as msno

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# import warnings
# warnings.filterwarnings('ignore')

In [None]:
def load_id(
    text_id: str,
    base_dir: str = '/kaggle/input/feedback-prize-2021/',
) -> str:
    """Return the raw text stored for ``text_id``.

    The text is read from ``<base_dir>/train/<text_id>.txt``.

    Args:
        text_id: Identifier of the text (the file name without ``.txt``).
        base_dir: Root directory of the competition data.

    Returns:
        The whole file content, or an empty string when the file does
        not exist.
    """
    # os.path.join avoids the doubled separator produced by gluing a
    # directory that already ends in '/' to '/train/...'.
    file_path = os.path.join(base_dir, 'train', f'{text_id}.txt')

    # Best-effort lookup: an unknown id yields '' rather than an exception.
    if not os.path.isfile(file_path):
        return ''

    # Explicit encoding so the result does not depend on the host locale.
    with open(file_path, encoding='utf-8') as f:
        return f.read()
    

def get_id(data: pd.DataFrame, text_id: str) -> pd.DataFrame:
    """Return the discourse columns of the rows whose id contains ``text_id``.

    Args:
        data: Frame with an ``id`` column and string column names.
        text_id: Text looked for inside ``data['id']`` (substring match).

    Returns:
        The matching rows, restricted to the columns whose name contains
        ``'discourse'``.
    """
    discourse_cols = [col for col in data.columns if 'discourse' in col]

    # regex=False: treat the id literally, so characters such as '.' or '('
    # cannot change what is matched.
    # na=False: rows with a missing id are simply not selected instead of
    # putting NaN into the boolean mask (which .loc rejects).
    id_mask = data['id'].str.contains(text_id, regex=False, na=False)

    return data.loc[id_mask, discourse_cols]


def del_long_spaces(string: str) -> str:
    """ Collapse each run of consecutive spaces into a single space. """
    space_run = re.compile(' +')
    return space_run.sub(' ', string)


def get_describe_length(data: pd.DataFrame, long_spaces: bool = True) -> pd.DataFrame:
    """Describe the character length of every text column.

    Args:
        data: Frame whose columns hold strings (missing values allowed).
        long_spaces: When false, runs of spaces are collapsed with
            ``del_long_spaces`` before the lengths are measured.

    Returns:
        A frame with one column per input column and one row per
        ``describe`` statistic (including the 5/15/30/50/70/85/95
        percentiles), with every value cast to ``int``.
    """
    if not long_spaces:
        # Work on a copy so the caller's frame keeps its original text.
        data = data.copy()

        for col in data.columns:
            # na_action='ignore' leaves missing values untouched.
            data[col] = data[col].map(del_long_spaces, na_action='ignore')

    percentiles_list = [.05, .15, .3, .5, .7, .85, .95]
    result = {
        col: data[col].str.len().describe(percentiles=percentiles_list)
        for col in data.columns
    }

    # The int cast drops the fractional part of mean/std/percentiles.
    return pd.DataFrame(result).astype(int)


def get_pylda_data(data: pd.Series, config: dict, use_idf: bool = False) -> tuple:
    """Fit a vectorizer and an LDA model on a series of texts.

    Args:
        data: Texts to model; missing values are dropped first.
        config: Mapping with optional ``'VEC'`` and ``'LDA'`` entries, each a
            dict of keyword arguments for the vectorizer and for
            ``LatentDirichletAllocation`` respectively.
        use_idf: Use ``TfidfVectorizer`` when true, ``CountVectorizer``
            otherwise.

    Returns:
        ``(lda, dtm, vectorizer)``: the fitted LDA model, the document-term
        matrix it was fitted on, and the fitted vectorizer.
    """
    data = data.dropna()

    # A missing (or None) section falls back to the estimator defaults
    # instead of failing on ``**None``.
    vec_config = config.get('VEC') or {}
    lda_config = config.get('LDA') or {}

    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer
    vectorizer = vectorizer_cls(**vec_config)

    dtm = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(**lda_config)
    lda.fit(dtm)

    return lda, dtm, vectorizer

In [None]:
# Notebook display settings: truncate long cell text at 40 characters and
# show floats with 12 decimal places.
pd.set_option('display.max_colwidth', 40)
pd.set_option("display.precision", 12)

In [None]:
# Read the competition's train.csv into a DataFrame.
train_data_path = "../input/feedback-prize-2021/train.csv"
train_data = pd.read_csv(train_data_path)

In [None]:
# (rows, columns) of the loaded table.
train_data.shape

In [None]:
# Preview the first rows of the loaded table.
train_data.head()

# 2. Create discourse_data

In [None]:
# Show the discourse columns for one example id.
get_id(train_data, '423A1CA112E2')

In [None]:
# Number of distinct ids.
train_data['id'].nunique()

In [None]:
# Number of distinct discourse types.
train_data['discourse_type'].nunique()

In [None]:
%%time
# Build a wide table: one row per id, one column per discourse type.
# Texts sharing the same (id, discourse_type) pair are concatenated with a
# blank line between them; unstack(level=1) turns the types into columns.
discourse_data = train_data.groupby(
    ['id', 'discourse_type']
)['discourse_text'].agg(lambda s: "\n\n".join(s)).unstack(level=1)

In [None]:
# Column summary with memory usage measured deeply (string contents included).
discourse_data.info(memory_usage='deep')

In [None]:
# List of selected (sorted) columns
cols_list = ["Lead", "Position",
             "Claim", "Counterclaim", "Rebuttal",
             "Evidence", "Concluding Statement"]

# Keep exactly these columns, in this order.
discourse_data = discourse_data.loc[:, cols_list]
# Bare name as the last expression so the notebook displays the frame.
discourse_data

In [None]:
# get_id(train_data, 'FFFF80B8CC2F')

In [None]:
# print(load_id('FFFF80B8CC2F'))

# 3. Check discourse types/text

## 3.1. Missing values

In [None]:
# Matrix view of which cells of discourse_data are present or missing.
# The trailing ';' suppresses the cell's text output.
msno.matrix(discourse_data, figsize=(12,8), fontsize=8,
            color=(0.27, 0.52, 1.0)
);

In [None]:
# Bar chart of how many values are present in each discourse-type column.
msno.bar(discourse_data, figsize=(12,5), fontsize=8,
         color=(0.27, 0.52, 1.0)
);

In [None]:
# Heatmap of how missingness in one column correlates with missingness in
# the others.
msno.heatmap(discourse_data, figsize=(12,8), fontsize=10
);

## 3.2. Statistics information

In [None]:
# describe() summary of every column, transposed so each discourse type is a row.
discourse_data.describe().T

In [None]:
# Character-length statistics per discourse type (long spaces kept, the default).
get_describe_length(discourse_data).T

# 4. Topic-Term Visualization

In [None]:
# No-op stand-in for warnings.warn: accepts any arguments and does nothing.
def warn(*args, **kwargs):
    pass

import warnings
# Replace warnings.warn process-wide so that calls made through it are
# silenced. The patch is applied before the pyLDAvis imports below, so the
# statement order in this cell matters.
warnings.warn = warn

# NOTE(review): newer pyLDAvis releases reportedly renamed the
# `pyLDAvis.sklearn` submodule to `pyLDAvis.lda_model` -- confirm against the
# installed version.
import pyLDAvis
import pyLDAvis.sklearn
# Render pyLDAvis visualizations inline in the notebook.
pyLDAvis.enable_notebook()

In [None]:
# Keyword arguments consumed by get_pylda_data:
#   'VEC' -> CountVectorizer / TfidfVectorizer
#   'LDA' -> LatentDirichletAllocation
config = {
    'VEC': {
        # Strip accents using unicode normalization.
        'strip_accents': 'unicode',
        # Drop scikit-learn's built-in English stop words.
        'stop_words': 'english',
        'lowercase': True,
        # Tokens are whole words made of at least three ASCII letters.
        'token_pattern': r'\b[a-zA-Z]{3,}\b',
        # Ignore terms found in more than half of the documents ...
        'max_df': 0.5,
        # ... or in fewer than 10 documents.
        'min_df': 10        
    },
    'LDA': {
        # Number of topics.
        'n_components': 10,
        'max_iter': 20,
        # Fixed seed for repeatable results.
        'random_state': 42
    }
}

## 4.1. "Lead" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Lead"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

In [None]:
%%time
# Refit CountVectorizer + LDA on the selected column and display it with
# pyLDAvis, this time requesting mds='tsne' for the topic layout.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    ), mds='tsne'
)

## 4.2. "Position" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Position"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

## 4.3. "Claim" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Claim"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

## 4.4. "Counterclaim" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Counterclaim"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

## 4.5. "Rebuttal" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Rebuttal"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

## 4.6. "Evidence" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Evidence"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)

## 4.7. "Concluding Statement" type of discourse

In [None]:
# Column of discourse_data analysed in this section.
discourse_type = "Concluding Statement"

In [None]:
%%time
# Fit CountVectorizer + LDA on the selected column and pass the resulting
# (lda, dtm, vectorizer) triple to pyLDAvis for display.
pyLDAvis.sklearn.prepare(*get_pylda_data(
        discourse_data[discourse_type],
        config
    )
)