In [None]:
# Install pyecharts, which the import cell below depends on.
!pip install  pyecharts

In [None]:
import numpy as np 
import pandas as pd 
import os
from copy import deepcopy, copy
from pyecharts.charts import *
from pyecharts.components import Table
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
import random
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn "mako" color palette; the distribution plots below pick colors from it by index.
palette = sns.color_palette("mako")

## Task Definition

The dataset presented here contains argumentative essays written by U.S. students in grades 6-12. These essays were annotated by expert raters for discourse elements commonly found in argumentative writing:

* Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* Position - an opinion or conclusion on the main question
* Claim - a claim that supports the position
* Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
* Rebuttal - a claim that refutes a counterclaim
* Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
* Concluding Statement - a concluding statement that restates the claims

Your task is to predict the quality rating of each discourse element. Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

* Ineffective
* Adequate
* Effective

**Annotation Scheme**

[Argumentation Annotation Scheme and Descriptions.](https://docs.google.com/document/d/1G51Ulb0i-nKCRQSs4p4ujauy4wjAJOae/)

The annotation scheme adopts seven elements as the building blocks of the argumentation framework. Table 1 shows the definitions of the argumentation elements and their examples.

Table 1
Definitions and Examples of Argumentation Elements

|Elements|Definitions|Examples|
|----|----|----|
|Lead|The introduction begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis.|“What would you choose, thousands of screaming fans beckoning you to perform your guitar solo or a quiet shelf in a library with only a couple hundred pages detailing your life. It is the sad choice between being a celebrity on one hand and a hero on the other.”|
|Position|An opinion or conclusion on the main question|" In my opinion, every individual has an obligation to think seriously about important matters, although this might be difficult."|
|Claim|A claim that supports the position.|"The next reason why I agree that every individual has an obligation to think seriously about important matters is that this simple task can help each person get ahead in life and be successful."|
|Counterclaim|A claim that refutes another claim or gives an opposing reason to the position.|"Some may argue that obligating every individual to think seriously is not necessary and even annoying as some people may choose to just follow the great thinkers of the nation."|
|Rebuttal|A claim that refutes a counterclaim.|"Even though people can follow others' steps without thinking seriously in some situations, the ability to think critically for themselves is a very important survival skill."|
|Evidence|Ideas or examples that support claims, counterclaims, or rebuttals.|"For instance, the presidential debate is currently going on. In order to choose the right candidate, voters need to research all sides of both candidates and think seriously to make a wise decision for the good of the whole nation."|
|Concluding Statement|A concluding statement that restates the claims.|"To sum up, thinking seriously is important in making decisions because each decision has an outcome that affects lives. It is also important because if you think seriously it can help you succeed."|

## Load Data

In [None]:
# Root folder of the competition data; every CSV below is read from it.
INPUT_DIR = '../input/feedback-prize-effectiveness'

# Training table, test table and the sample submission, loaded in that order.
train, test, sub = (
    pd.read_csv(f'{INPUT_DIR}/train.csv'),
    pd.read_csv(f'{INPUT_DIR}/test.csv'),
    pd.read_csv(f'{INPUT_DIR}/sample_submission.csv'),
)

In [None]:
# Report the (rows, columns) shape of each loaded table, one per line.
print(
    f"train.shape {train.shape}",
    f"test.shape {test.shape}",
    f"sub.shape {sub.shape}",
    sep="\n",
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Number of distinct values in each column of the training table.
train.nunique()

In [None]:
# Merge the full essay text into the data frames.
# Credits to https://www.kaggle.com/code/vad13irt/fpe-exploratory-data-analysis
def read_file(path):
    """Return the full text content of the file at ``path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        path: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    with open(path, "r", encoding="utf-8") as file:
        return file.read()

def preprocess_data_frame(data_frame, directory):
    """Return a copy of ``data_frame`` with essay path and essay text columns.

    Two columns are added to the copy; the input frame is left untouched:

    * ``essay_path`` - ``<directory>/<essay_id>.txt`` for each row.
    * ``essay_text`` - the content of that file, as returned by ``read_file``.

    Args:
        data_frame: DataFrame with an ``essay_id`` column.
        directory: Folder containing one ``<essay_id>.txt`` file per essay.

    Returns:
        A new DataFrame with the two extra columns.
    """
    result = data_frame.copy()
    result["essay_path"] = result["essay_id"].apply(
        lambda essay_id: os.path.join(directory, f"{essay_id}.txt")
    )

    # Rows that share an essay_id share a path, so read each distinct file
    # once and broadcast the text instead of re-reading it for every row.
    text_by_path = {path: read_file(path) for path in result["essay_path"].unique()}
    result["essay_text"] = result["essay_path"].map(text_by_path)

    return result
# The essay .txt files live in per-split folders under the competition input
# directory; derive both folders from INPUT_DIR instead of repeating the path.
train_directory = f"{INPUT_DIR}/train"
test_directory = f"{INPUT_DIR}/test"

# Replace both tables with copies that carry the essay path and essay text.
train = preprocess_data_frame(train, train_directory)
test = preprocess_data_frame(test, test_directory)

In [None]:
# Preview the training table again, now including the essay_path and essay_text columns.
train.head()

## check1: discourse_type

In [None]:
# Count the rows of the training table for each discourse type.
df_value_counts = train['discourse_type'].value_counts().reset_index()
df_value_counts.columns = ['unique_values', 'counts']
x_data = df_value_counts['unique_values'].tolist()
y_data = df_value_counts['counts'].tolist()

# Bar chart of those counts, using the 'walden' theme.
bar = Bar(init_opts=opts.InitOpts(theme='walden'))
bar.add_xaxis(x_data)
bar.add_yaxis('', y_data)

bar.render_notebook()

## check2: discourse_effectiveness

In [None]:
# Row count for each discourse_effectiveness label.
train['discourse_effectiveness'].value_counts()

In [None]:
# Count the rows of the training table for each effectiveness label.
df_value_counts = train['discourse_effectiveness'].value_counts().reset_index()
df_value_counts.columns = ['unique_values', 'counts']
x_data = df_value_counts['unique_values'].tolist()
y_data = df_value_counts['counts'].tolist()

# Bar chart of those counts, using the 'chalk' theme.
bar = Bar(init_opts=opts.InitOpts(theme='chalk'))
bar.add_xaxis(x_data)
bar.add_yaxis('', y_data)

bar.render_notebook()

In [None]:
# Pie chart built from the current x_data / y_data label-count pairs.
pie = Pie(init_opts=opts.InitOpts(theme='light'))
pie.add('', [[label, count] for label, count in zip(x_data, y_data)])

pie.render_notebook()

## check3: text length

Credits to: https://www.kaggle.com/code/akashadesai/exploratory-data-analysis-getting-started?scriptVersionId=96520091&cellId=22

In [None]:
# Length in whitespace-separated tokens of the full essay and of the
# individual discourse element on each row.
train["essay_text_length"] = train["essay_text"].map(
    lambda essay: len(essay.split())
)
train["discourse_text_length"] = train["discourse_text"].map(
    lambda segment: len(segment.split())
)

In [None]:
# Kernel density estimate of the essay_text_length column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(
    data=train,
    x="essay_text_length",
    fill=True,
    alpha=1,
    color=palette[-1],
    ec="#000",
    zorder=2,
    ax=ax,
)
ax.set_title("Essay text length distribution")
fig.show()

In [None]:
# Kernel density estimate of the discourse_text_length column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(
    data=train,
    x="discourse_text_length",
    fill=True,
    alpha=1,
    color=palette[2],
    ec="#000",
    zorder=2,
    ax=ax,
)
ax.set_title("Discourse text length distribution")
fig.show()

Another view: histplot

In [None]:
# Histogram of the essay_text_length column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=train,
    x="essay_text_length",
    fill=True,
    alpha=1,
    color=palette[-1],
    ec="#000",
    zorder=2,
    ax=ax,
)
ax.set_title("Essay text length distribution")
fig.show()

In [None]:
# Histogram of the discourse_text_length column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=train,
    x="discourse_text_length",
    fill=True,
    alpha=1,
    color=palette[2],
    ec="#000",
    zorder=2,
    ax=ax,
)
ax.set_title("Discourse text length distribution")
fig.show()