In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import spacy
import string
import re
from wordcloud import WordCloud
import os

## Look at the train.csv Data Frame

In [None]:
# Only these four columns are used in this notebook, so let the CSV parser skip
# every other column instead of loading it and discarding it afterwards.
TRAIN_COLUMNS = ['discourse_start', 'discourse_end', 'discourse_text', 'discourse_type']
df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv', usecols=TRAIN_COLUMNS)
# `usecols` keeps the file's own column order; re-select to get the order listed above.
df = df[TRAIN_COLUMNS]
df.head()

Shape of the Data Frame:

In [None]:
# Displays the (number of rows, number of columns) tuple of `df`.
df.shape

Unique target values:

In [None]:
# Distinct target labels, in the order they first appear in the frame.
df['discourse_type'].unique().tolist()

## What Do the Target Variables Mean?
Basically, there are 7 types of discourse:
- **Lead**: intro part to hook readers' attention; 
- **Position**: demonstrate you understand the other side's viewpoint, but you explain your own stance; 
- **Evidence**: provide the readers with facts/data to prove the argument is strong;
- **Claim**: explain the overall thesis on the subject. The main argument is made in this part;
- **Concluding Statement**: draw conclusion;
- **Counterclaim**: the opposite perspective;
- **Rebuttal**: evidence that disagrees with the counterclaim.

These definitions follow the [Purdue OWL (Online Writing Lab)](https://owl.purdue.edu/owl/general_writing/academic_writing/establishing_arguments/organizing_your_argument.html).

## Target Variables Distribution

In [None]:
# Take labels and counts from the SAME Series. `unique()` returns labels in order
# of first appearance while `value_counts()` sorts by frequency, so pairing the
# two positionally can attach a bar to the wrong discourse type.
type_counts = df['discourse_type'].value_counts()

sns.set_theme(style="whitegrid")
plt.figure(figsize=(8,6))
sns.barplot(y=type_counts.index.tolist(),
            x=type_counts.values)
plt.xlabel('Count of Discourse Type')
plt.title('Discourse Type Distribution')
plt.show()


## Position of Each Target Variable

Where does each discourse element start in a student's article? Where does it end? In this section, we plot each element's position within an article.

In [None]:
# Horizontal box plot: one box per discourse type, start offset on the x-axis.
fig = px.box(
    data_frame=df,
    x='discourse_start',
    y='discourse_type',
    title='Where Each Type Element Starts',
)
fig.show()

In [None]:
# Horizontal box plot: one box per discourse type, end offset on the x-axis.
fig = px.box(
    data_frame=df,
    x='discourse_end',
    y='discourse_type',
    title='Where Each Type Element Ends',
)
fig.show()

In [None]:
# Plot a fixed-seed 1% sample of the rows so the scatter stays readable and the
# same points are drawn on every run.
sampled_rows = df.sample(frac=0.01, random_state=20)
fig = px.scatter(
    sampled_rows,
    x='discourse_start',
    y='discourse_end',
    color='discourse_type',
    opacity=0.3,
    title='Correlation between Discourse Start and End',
)
fig.show()

In conclusion, lead statements usually start at the beginning of an article, whereas concluding statements are usually at the very end of an article.

Usually, when a sentence starts early, it also ends early, so I am not surprised that there is an obvious linear correlation between the start and end points for each type. However, "Evidence" seems more scattered, and "Lead" is usually very short.

## How Long Is Each Type of Element?

In [None]:
# Word count per discourse element (split on any whitespace). The vectorised
# string accessor replaces a per-row Python lambda and yields NaN for a missing
# text instead of raising.
df['txt_len'] = df['discourse_text'].str.split().str.len()

In [None]:
# Horizontal box plot: one box per discourse type, word count on the x-axis.
fig = px.box(
    data_frame=df,
    x='txt_len',
    y='discourse_type',
    title='Length of Each Type of Element',
)
fig.show()

## Remove Stopwords, Tokenization and Lemmatization

First, add a few customized stopwords:

In [None]:
# Load the small English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load('en_core_web_sm')

# NOTE: this is the pipeline's own default stop-word set, not a copy, so adding
# to `stopwords` later also changes `nlp.Defaults.stop_words`.
stopwords = nlp.Defaults.stop_words
default_stopword_count = len(stopwords)
print(f'There are {default_stopword_count} default stop-words before customization.')

In [None]:
# Extra stop-words added on top of spaCy's defaults for this corpus.
customized_stopwords = [
    'student', 'people', 'make', 'school', 'teacher',
    'be', 'electoral', 'college', 'think',
]

# Register every extra word in the shared stop-word set ...
stopwords.update(customized_stopwords)
# ... and flag its vocabulary entry so `token.is_stop` agrees with the set.
for token in customized_stopwords:
    nlp.vocab[token].is_stop = True

print(f'There are {len(stopwords)} stop-words after customization.')

In [None]:
# Create a text-cleaning function for a single sentence
def clean_text(sentence):
    """Lower-case, strip noise, drop stop-words and lemmatize one piece of text.

    Steps:
      1. Lower-case the input.
      2. Replace the standalone word "nan", @username mentions, URLs and every
         character that is not an ASCII letter, digit, space or tab with a
         space, then collapse runs of whitespace to a single space.
      3. Run the spaCy pipeline `nlp` and keep the lemma of every token whose
         surface form and lemma are both absent from `stopwords`.

    Parameters
    ----------
    sentence : str
        Raw text. Any non-string value (e.g. a float NaN from pandas) is
        treated as empty text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces ('' if nothing remains).
    """
    # Missing values reach `.map` as floats; there is nothing to clean in them.
    if not isinstance(sentence, str):
        return ''

    sentence = sentence.lower()
    # Raw string so the regex escapes are not parsed as (invalid) Python string
    # escapes. `\bnan\b` only removes the whole word "nan", so words that merely
    # contain it (e.g. "finance") are left intact.
    sentence = ' '.join(
        re.sub(r"(\bnan\b)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", sentence).split()
    )

    # Remove stop-words and obtain the lemma. The lemma is checked as well as the
    # surface form so that inflected forms of a stop-word (e.g. "students" for
    # "student") are removed rather than reappearing as that lemma.
    text = [
        word.lemma_
        for word in nlp(sentence)
        if word.text not in stopwords and word.lemma_ not in stopwords
    ]
    return ' '.join(text).strip()

In [None]:
# Clean every discourse text and store the result in a new column next to the raw text.
cleaned_series = df['discourse_text'].apply(clean_text)
df['text_cleaned'] = cleaned_series

df.head()

## Top 10 Uni-grams of the tf-idf of the Text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a function returning the top words
def get_top_tf_idf_words(df = df, col = "text", use_idf = True, ngram_range =(1, 1), top_n= 10):
    """Return the `top_n` highest TF-IDF entries found in a text column.

    A `TfidfVectorizer` is fitted on `df[col]`, the same texts are transformed,
    and the non-zero cells of the resulting sparse matrix are ranked by value.
    The ranking is over individual (document, term) cells, so the same term can
    appear more than once in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the corpus. The default is the module-level `df` as it
        was when this `def` was executed (Python evaluates defaults once).
    col : str
        Name of the text column to vectorize.
    use_idf : bool
        Forwarded to `TfidfVectorizer(use_idf=...)`.
    ngram_range : tuple of (int, int)
        Forwarded to `TfidfVectorizer(ngram_range=...)`.
    top_n : int
        Number of highest-valued cells to return.

    Returns
    -------
    pandas.DataFrame
        Columns `feature` (the term) and `tf_idf` (its value), sorted from the
        highest value downwards.
    """
    # No stop-word filtering is configured on the vectorizer itself.
    tf_idf = TfidfVectorizer(ngram_range = ngram_range, use_idf = use_idf)

    # Fit once, then transform; the original also computed a second, unused
    # matrix via fit_transform.
    tf_idf.fit(df[col])

    # `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on versions that predate `get_feature_names_out`.
    if hasattr(tf_idf, "get_feature_names_out"):
        feature_names = np.asarray(tf_idf.get_feature_names_out())
    else:
        feature_names = np.asarray(tf_idf.get_feature_names())

    # Generate the tf-idf matrix
    tf_idf_sparse_matrix = tf_idf.transform(df[col])

    # Rank the stored (non-zero) values and keep the indices of the top_n largest,
    # highest first.
    sorted_idx = np.argsort(tf_idf_sparse_matrix.data)[:-(top_n+1):-1]

    # `.indices` holds the column (= feature) index of each stored value.
    # Return the feature names and corresponding tf_idf values in a df
    return pd.DataFrame(
    {'feature': feature_names[tf_idf_sparse_matrix.indices[sorted_idx]],
     'tf_idf': tf_idf_sparse_matrix.data[sorted_idx],
    })

TOP_N = 10

# One top-terms frame per discourse type. The labels on the right are listed in
# the same order as the target names on the left.
(
    df_text_lead,
    df_text_posi,
    df_text_evid,
    df_text_clai,
    df_text_cclu,
    df_text_cntr,
    df_text_rebt,
) = (
    get_top_tf_idf_words(
        df=df[df['discourse_type'] == discourse_label],
        col="discourse_text",
        top_n=TOP_N,
    )
    for discourse_label in (
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    )
)

x = range(0, TOP_N)

# (panel title, top-terms frame) pairs, drawn top to bottom in this order.
tf_idf_panels = [
    ('Lead', df_text_lead),
    ('Position', df_text_posi),
    ('Evidence', df_text_evid),
    ('Claim', df_text_clai),
    ('Concluding Statement', df_text_cclu),
    ('Counterclaim', df_text_cntr),
    ('Rebuttal', df_text_rebt),
]

fig, ax = plt.subplots(len(tf_idf_panels), 1, figsize = (10, 25))
# The frames were built with the default ngram_range of (1, 1), i.e. unigrams;
# the previous title incorrectly said "Bigrams".
fig.suptitle(f'Top {TOP_N} Unigrams of the TF-IDF', fontsize= 18)

# One identical panel per discourse type: tf-idf value per ranked term, with the
# term itself as the (vertical) tick label.
for panel_ax, (panel_title, top_terms) in zip(ax, tf_idf_panels):
    panel_ax.plot(x, top_terms.tf_idf, 'bo')
    panel_ax.set_title(panel_title, fontsize= 14)
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(top_terms.feature, rotation='vertical', fontsize=10)

# Extra vertical spacing so the vertical tick labels do not run into the next title.
fig.subplots_adjust(hspace=1.5)
plt.show()

## Word Clouds for each Type
Word clouds before cleaning the text:

In [None]:
elem = df.discourse_type.unique().tolist()

# Size the grid from the number of discourse types instead of hard-coding 7
# panels on a 4x2 grid, so the cell keeps working if the label set changes.
raw_cloud_cols = 2
raw_cloud_rows = -(-len(elem) // raw_cloud_cols)  # ceiling division

plt.figure(figsize=(15,10))
for panel_no, discourse_label in enumerate(elem, start=1):
    plt.subplot(raw_cloud_rows, raw_cloud_cols, panel_no)
    # All raw texts of this type, lower-cased and joined into one document.
    raw_corpus = ' '.join(
        df.loc[df.discourse_type == discourse_label, 'discourse_text'].str.lower()
    )
    plt.imshow(WordCloud().generate(raw_corpus), interpolation='bilinear')
    plt.title(discourse_label)
    plt.axis('off')
plt.suptitle('Wordcloud for Each Category before Clean')
plt.tight_layout()
plt.show()

Word clouds after cleaning the text:

In [None]:
# Size the grid from the number of discourse types in `elem` instead of
# hard-coding 7 panels on a 4x2 grid.
clean_cloud_cols = 2
clean_cloud_rows = -(-len(elem) // clean_cloud_cols)  # ceiling division

plt.figure(figsize=(15,10))
for panel_no, discourse_label in enumerate(elem, start=1):
    plt.subplot(clean_cloud_rows, clean_cloud_cols, panel_no)
    # All cleaned texts of this type joined into one document.
    cleaned_corpus = ' '.join(df.loc[df.discourse_type == discourse_label, 'text_cleaned'])
    plt.imshow(WordCloud().generate(cleaned_corpus), interpolation='bilinear')
    plt.title(discourse_label)
    plt.axis('off')
plt.suptitle('Wordcloud for Each Category After Clean')
plt.tight_layout()
plt.show()