In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.core.display import display, HTML
import ipywidgets as widgets
from IPython.display import display,clear_output
from ipywidgets import Output, Button
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import networkx as nx # create network
import plotly.express as px
import plotly.graph_objects as go
# make sure the code in plotly is able to run properly
from plotly.offline import plot, iplot, init_notebook_mode
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
init_notebook_mode(connected=True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
# remove unnecessary warnings in the output
pd.options.mode.chained_assignment = None  # default='warn'

# Introduction

This notebook aims at creating **interactive visualizations** for readers to grasp the data easily at the first glance.

It includes:

* Paint the discourse type with different colors
* Number of discourse elements per type in each essay
* A histogram showing the length of discourse type
* Average word length in each element per discourse type
* Lexical uniqueness per discourse type
* The correlation between essay length and its number of discourse elements
* Relative median position of each discourse type in essays
* The chance of discourse type coming next to each other
* Does the text contain "I think"?
* Top 100 signal tokens in each discourse type
* Does discourse type favor specific punctuation?
* Text clustering visualization
* What is the next step?

# Paint the discourse type with different colors

Why do we need it? If we open a text file **007ACE74B050.txt** as an example -

In [None]:
# Print the raw text of one essay to show that the file itself carries no
# discourse-type annotation.
# The encoding is spelled out so the read does not depend on the platform default.
# NOTE(review): assumes the essay files are UTF-8 - confirm against the dataset.
with open('../input/feedback-prize-2021/train/007ACE74B050.txt', encoding='utf-8') as f:
    contents = f.read()
print(contents)

**The text itself doesn't show the discourse type on sentences.**

So let's use some colors to make each part more outstanding.

Firstly, let's give each discourse type a color, as shown below:


<h3 style="color:rgb(255, 102, 204);">Lead</h3>
<h3 style="color:rgb(0, 0, 102);">Position</h3>
<h3 style="color:rgb(51, 102, 255);">Claim</h3>
<h3 style="color:rgb(153, 102, 51);">Counterclaim</h3>
<h3 style="color:rgb(102, 204, 255);">Rebuttal</h3>
<h3 style="color:rgb(0, 0, 0);">Evidence</h3>
<h3 style="color:rgb(51, 51, 153);">Concluding Statement</h3>

Then, taking the same passage as an example, let's see what it looks like after we've painted it.

In [None]:
# Read the annotated training table that every later cell works on.
train = pd.read_csv("/kaggle/input/feedback-prize-2021/train.csv")

# Rows of the example essay 007ACE74B050 (swap the id to look at another essay).
sample = train[train['id'] == '007ACE74B050']

In [None]:
# iterate each discourse type and print it in the color as we wished
# RGB colour of each discourse type (same values as the legend shown above).
DISCOURSE_COLORS = {
    'Lead': '(255, 102, 204)',
    'Position': '(0, 0, 102)',
    'Claim': '(51, 102, 255)',
    'Counterclaim': '(153, 102, 51)',
    'Rebuttal': '(102, 204, 255)',
    'Evidence': '(0, 0, 0)',
    'Concluding Statement': '(51, 51, 153)',
}
# Fallback for a discourse type that is not in the table above.
DEFAULT_DISCOURSE_COLOR = '(0, 0, 0)'


def color_article(id):
    """Display every discourse element of one essay, coloured by its type.

    Parameters
    ----------
    id : str
        Essay id, matched against the ``id`` column of the global ``train``
        frame.

    Each matching row is rendered as one HTML paragraph containing the
    element text followed by the discourse type in parentheses. An id with
    no matching rows displays nothing.
    """
    # Local import: the notebook's import cell does not bring in `html`.
    from html import escape

    sample = train.loc[train['id'] == id]
    for text, discourse in zip(sample['discourse_text'], sample['discourse_type']):
        # A lookup with a default replaces the if/elif chain, which left
        # `color` unbound (or stale from the previous row) for unknown types.
        color = DISCOURSE_COLORS.get(discourse, DEFAULT_DISCOURSE_COLOR)
        # Escape the essay text so characters such as '<' or '&' are shown
        # literally instead of being interpreted as markup.
        sample_html_text = (
            '<p style="color:rgb' + color + ';">'
            + escape(str(text)) + '  (' + escape(str(discourse)) + ') '
            + '</p>'
        )
        display(HTML(sample_html_text))
color_article("007ACE74B050")

**I also added the type name at the end of each segment. I hope it makes each one clearer to you.**

### Choose whatever article you are interested

So what if you're interested in any other article? Or just intend to play around with the data by randomly clicking an id?

Sure. Here I've created a dropdown list where you can choose an article id, and once you click the **Start** button, the screen will present the article coloured by discourse type.

**Please be aware that, for now, the interactive dropdown list works only in edit mode** - you can edit/copy the notebook and test it yourself. Since the dataset has around 15k articles in total, it might take 3~5 seconds to load the data.

Hope you like it!

In [None]:
# unique id of each article
ids = train['id'].unique()

# Widgets of the essay picker: a dropdown with every essay id, a button, and
# an output area. They carry picker-specific names because the click handler
# resolves these globals at click time; with generic names such as `output`,
# a later cell that reuses the name would redirect this picker's rendering.
essay_output = Output()
essay_start = Button(description="Start")
plays_widget = widgets.Dropdown(
    options=list(ids),
    value=ids[0],
    description='Essay id:',
    disabled=False,
)

def show_selected_essay(b):
    """Button callback: clear the output area, then render the chosen essay in colour."""
    with essay_output:
        clear_output()
        color_article(plays_widget.value)

essay_start.on_click(show_selected_essay)

# once the start button is clicked, the corresponding article is displayed
display(plays_widget, essay_start, essay_output)

# Number of discourse elements per type in each essay

When I was a student at primary or middle school, my writing teacher advised us to write only one Lead per article, so that the content would not distract the reader.

Then what about our datasets? 

* Do students usually write one Lead as I did?

* How many evidences do they usually write in an essay?

* Do they usually include at least one Counterclaim or Rebuttal, just as they include a Lead?

Let's figure it out!

In [None]:
# Number of distinct discourse elements for every (essay, discourse type) pair.
discourse_type_number = (
    train.groupby(['id', 'discourse_type'])['discourse_id']
    .nunique()
    .reset_index()
)

# Box plot of those counts: one box per discourse type, hovering shows the essay id.
fig = go.Figure()
discourse = train['discourse_type'].unique()

for d in discourse:
    dataset = discourse_type_number[discourse_type_number['discourse_type'] == d]
    box_trace = go.Box(
        name=d,
        y=dataset['discourse_id'],
        text=dataset['id'],
        hovertemplate='<b>id: %{text} - <br>Nr of discourse elements - %{y:.1f}</b>',
    )
    fig.add_trace(box_trace)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(
    showlegend=False,
    title='Nr of discouse per types in an essay <sup><br>Claims and Evidence have the highest medians at 3 elements in an essay</br></sup>',
    yaxis_title="Nr. discourse elements",
)
fig.show()

# Average word length in each element per discourse type

Do people usually use longer words when they state Claims or Position?

From this chart, it looks like Claims and Positions both have the highest means (6.5 letters per word on average); however, Claims have a wider range of values, indicating that some Claim elements contain extremely long words.

In [None]:
# avgerage word length = sum of word length / number of words in an element
def average_word_length(text):
    """Return the mean number of characters per whitespace-separated word.

    Parameters
    ----------
    text : str
        Text to measure; it is split on any run of whitespace.

    Returns
    -------
    float
        Total characters in all words divided by the number of words,
        rounded to 2 decimals. NaN when the text contains no words (empty
        or whitespace-only), because the average is undefined there; this
        previously raised ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return float('nan')
    total_chars = sum(len(word) for word in words)
    return round(total_chars / len(words), 2)

# Store the average word length of every discourse element in a new column.
train['avg_word_length'] = train['discourse_text'].map(average_word_length)

# Box plot of the average word length: one box per discourse type,
# hovering a point shows the discourse_id it belongs to.
fig = go.Figure()
discourse = train['discourse_type'].unique()

for d in discourse:
    dataset = train[train['discourse_type'] == d]
    box_trace = go.Box(
        name=d,
        y=dataset['avg_word_length'],
        text=dataset['discourse_id'],
        hovertemplate='<b>discourse_id: %{text} - <br>Average word length - %{y:.1f}</b>',
    )
    fig.add_trace(box_trace)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(
    showlegend=False,
    title='Average word length in each discourse element per type<sup><br>* Average word length = Sum of each word length in an element / Number of words in an element</br></sup>',
    yaxis_title="Avg. word length",
)
fig.show()

# Lexical uniqueness per discourse type

I came up with a term that I probably made up myself - lexical uniqueness, which indicates the variability of word usage in a discourse element. In other words, if an element uses the same words repeatedly, its lexical uniqueness will be low.

To calculate it, I use this equation below

> Lexical uniqueness = number of unique words * 100 / number of total words

From the chart, **Evidence** seems to have the **lowest** lexical uniqueness on average. 

Why is that?

In [None]:
# nr of unique words / number of total words
def unique_word_share(text):
    """Return the percentage of distinct words among all words of ``text``.

    Words are the whitespace-separated tokens of ``text``; they are compared
    case-insensitively, so "The" and "the" count as one word.

    Parameters
    ----------
    text : str
        Text to measure.

    Returns
    -------
    float
        ``distinct words * 100 / total words`` rounded to 2 decimals. NaN
        when the text contains no words (empty or whitespace-only), because
        the share is undefined there; this previously raised
        ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return float('nan')
    unique_word_count = len({w.lower() for w in words})
    return round(unique_word_count * 100 / len(words), 2)

# Lexical uniqueness (in %) of every discourse element.
train['uniqueness'] = train['discourse_text'].map(unique_word_share)

# Box plot of the lexical uniqueness: one box per discourse type,
# hovering a point shows the discourse_id it belongs to.
fig = go.Figure()
discourse = train['discourse_type'].unique()

for d in discourse:
    dataset = train[train['discourse_type'] == d]
    box_trace = go.Box(
        name=d,
        y=dataset['uniqueness'],
        text=dataset['discourse_id'],
        hovertemplate='<b>discourse_id: %{text} - <br> lexical uniqueness - %{y:.1f}%</b>',
    )
    fig.add_trace(box_trace)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(
    showlegend=False,
    title='Lexical uniqueness in each discourse element per type<sup><br>* lexical uniqueness = Number of unique words in an element * 100 / Number of words in an element</br></sup>',
    yaxis_title="Lexical uniqueness (%)",
)
fig.show()

# The correlation between essay length and its number of discourse elements

Does a long essay usually have more discourse elements than a shorter one?



In [None]:
# Last entry of each element's space-separated `predictionstring`
# (presumably the index of the element's last word - verify against the data
# description). Kept as a string; it is converted to float where needed.
train['last_position'] = train['predictionstring'].apply(lambda x: x.split(' ')[-1])

# Largest last position within each essay, repeated on every row of that essay.
# It serves as the essay-length proxy below.
train['final_position'] = train['last_position'].astype(float).groupby(train['id']).transform('max')

# One row per essay: number of distinct discourse elements and essay length.
# Both values come from a single groupby over just the two columns needed, so
# they are aligned by essay id rather than by row position, and no mean is
# taken over string columns (which raises in pandas >= 2.0).
# `final_position` is constant within an essay, so its mean is that value.
train_discourse_aggr = (
    train.groupby('id')
    .agg(
        discourse_id=('discourse_id', 'nunique'),
        length_essay=('final_position', 'mean'),
    )
    .reset_index()
)

In [None]:
# Correlation between essay length and the number of discourse elements
# (the author reports ~0.6, i.e. strongly positive).
# Only the two numeric columns are passed: `id` holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
train_discourse_aggr[['discourse_id', 'length_essay']].corr()

In [None]:
# Scatter plot: one marker per essay, essay length on x and number of
# discourse elements on y; hovering shows the essay id.
essay_points = go.Scatter(
    x=train_discourse_aggr['length_essay'],
    y=train_discourse_aggr['discourse_id'],
    text=train_discourse_aggr['id'],
    mode='markers',
    opacity=0.85,
    name="",
    hovertemplate="Essay id: %{text} <br>Essay length: %{x} <br>Nr of discourse: %{y}",
)
fig = go.Figure(data=essay_points)

fig.update_xaxes(showgrid=False)
fig.update_layout(
    showlegend=False,
    title='A strong positive correlation between the length of an essay and the number of discourse elements <sup><br>The correlation coefficient is 0.6</sup>',
    xaxis_title="Essay length",
    yaxis_title="Nr of discourse elements",
)

fig.show()

# A histogram showing the length of discourse type

In [None]:
# Length of each discourse element in words. Splitting on any whitespace run
# (as the word-length and uniqueness helpers do) keeps double spaces and line
# breaks from distorting the count.
train['discourse_text_length'] = train['discourse_text'].apply(lambda x: len(x.split()))

# One histogram trace per discourse type, drawn on top of each other.
fig = go.Figure()
discourse = train['discourse_type'].unique()

for d in discourse:
    fig.add_trace(go.Histogram(
        x=train.loc[train['discourse_type'] == d, 'discourse_text_length'],
        name=d,
    ))

# Overlay the histograms instead of stacking them
fig.update_layout(
    title = "Histogram of discourse element length per type <br><sup>Evidence and Concluding Statement have longer tails than the others. </sup>",
    barmode='overlay')
# Reduce opacity so overlapping histograms remain visible
fig.update_traces(opacity=0.75)
fig.show()

# Relative median position of each discourse type in essays

Where is each discourse type usually placed in an essay? Here I take the median of both the start and the end positions of the elements per discourse type, and plot them as a dumbbell chart.

In [None]:
# First and last entry of each element's space-separated `predictionstring`.
train['first_position'] = train['predictionstring'].apply(lambda x: x.split(' ')[0])
train['last_position'] = train['predictionstring'].apply(lambda x: x.split(' ')[-1])

# Express both positions as a percentage of the essay's final position.
# `final_position` must already exist on `train`; it is computed in the
# essay-length cell earlier in the notebook.
train['start_perc'] = round(train['first_position'].astype(float) * 100 / train['final_position'].astype(float),0)
train['last_perc'] = round(train['last_position'].astype(float) * 100/ train['final_position'].astype(float),0)

# Median start/end percentage per discourse type. Only the two percentage
# columns are aggregated: a median over the string columns of `train` raises
# in pandas >= 2.0, and nothing else is plotted.
train_type_position_aggr = (
    train.groupby('discourse_type')[['start_perc', 'last_perc']]
    .median()
    .reset_index()
    .sort_values(by='start_perc', ascending=False)
)

# Dumbbell chart: a grey connector per discourse type, plus one marker trace
# for the median start and one for the median end.
fig=go.Figure()
n_types = train_type_position_aggr.shape[0]

for i in range(n_types):
    fig.add_shape(
            type='line',
            x0=train_type_position_aggr['start_perc'].iloc[i],
            y0=train_type_position_aggr['discourse_type'].iloc[i],
            x1=train_type_position_aggr['last_perc'].iloc[i],
            y1=train_type_position_aggr['discourse_type'].iloc[i],
            line_color="#cccccc"
        )

# Each marker trace already covers every discourse type, so it is added once,
# after the loop, rather than once per loop iteration.
fig.add_trace(go.Scatter(x=train_type_position_aggr["start_perc"],
                         y=train_type_position_aggr["discourse_type"],
                         hovertemplate='<b>%{y} usually starts at %{x:.1f}% of the essay</b>',
                         mode='markers',
                         name = "",
                         marker=dict(size=[10] * n_types, color=["#DEBAE6"] * n_types),
                        ))

fig.add_trace(go.Scatter(x=train_type_position_aggr["last_perc"],
                         y=train_type_position_aggr["discourse_type"],
                         hovertemplate='<b>%{y} usually ends at %{x:.1f}% of the essay</b>',
                         mode='markers',
                         name = "",
                         marker=dict(size=[10] * n_types, color=["#C54DFD"] * n_types),
                        ))

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.update_layout(showlegend = False,
                  title = 'Position of the discourse type in essays <sup><br>The position is calculated as the median value of normalized percentile of each element</br></sup>',
                  xaxis_title="% of the essay")
fig.show()

# The chance of discourse type coming next to each other

From the chart below, we can see in most cases -

* Lead always sits at the beginning of the essay. 
* Position follows a Lead or a Concluding Statement.
* Claims come after Positions.
* Evidence follows either a Claim or a Rebuttal.

In [None]:
# Discourse type of the following row within the same essay (NaN on the
# essay's last row).
# NOTE(review): this relies on the rows of `train` being stored in reading
# order within each essay - confirm against the data.
train['next_discourse_type'] = train.groupby(['id'])['discourse_type'].shift(-1)

# select relevant columns
discourse_network = train[['id','discourse_type','next_discourse_type']]

# For every (type, next type) pair: the number of distinct essays in which
# that transition appears. Rows without a next type are dropped by groupby.
discourse_network_aggr = discourse_network.groupby(['discourse_type','next_discourse_type']).nunique().reset_index()
discourse_network_aggr = discourse_network_aggr.sort_values(by=['discourse_type','id'], ascending = False)

# Share of each next type among all transitions leaving a given type.
discourse_network_aggr['sum'] = discourse_network_aggr['id'].groupby(discourse_network_aggr['discourse_type']).transform('sum')
discourse_network_aggr['share'] = round(discourse_network_aggr['id']*100/discourse_network_aggr['sum'],1)

# Stacked bar chart: one bar per discourse type, one segment per next type.
discourse=discourse_network_aggr['discourse_type'].unique()

fig = go.Figure()

for d in discourse:
    followers = discourse_network_aggr.loc[discourse_network_aggr['next_discourse_type'] == d]
    fig.add_trace(
        go.Bar(name=d,
               # Use the types present in these rows as x. Passing the full
               # `discourse` list pairs shares with the wrong bars whenever
               # some type is never followed by `d`.
               x=followers['discourse_type'],
               y=followers['share'],
               text = d,
               hovertemplate = "Share (%): %{y}"
              )
    )

# Keep the bars in the same order as `discourse`, whichever trace is drawn first.
fig.update_xaxes(categoryorder='array', categoryarray=list(discourse))

# Change the bar mode
fig.update_layout(
    title = 'What is the discouse type coming next?',
    xaxis_title="Discourse type",
    yaxis_title="next discourse element (%)",
    barmode='stack')
fig.show()

# Does the text contain "I think"?

When I read an essay and come across a phrase such as 'I think', I know without any hesitation that the author is about to state their position. Still, curiosity drives me to check whether that is the case in our dataset.

I check the discourse elements one by one to see whether each contains the phrase 'I think'.

The outcome is no surprise - 12.7% of the elements of type **Position** do, and the next one is **Concluding Statement**, with 8.5%.

In [None]:
def contain_phrase(phrase):
    """Plot, per discourse type, the share of elements whose text contains ``phrase``.

    Parameters
    ----------
    phrase : str
        Substring to look for in ``train['discourse_text']``. It is matched
        literally (not as a regular expression) and case-sensitively.

    The chart has one bar per discourse type that has at least one matching
    element, sorted by share in descending order, with the highest bar
    highlighted. The flags are computed on a temporary frame, so ``train``
    is left unmodified.
    """
    # Flag every element that contains the phrase; missing text counts as
    # "does not contain".
    flagged = train[['discourse_type', 'discourse_id']].assign(
        if_contain=train['discourse_text'].str.contains(phrase, regex=False, na=False)
    )

    # get the share of the elements that contain target phrase
    train_if_contain_aggr = (
        flagged.groupby(['discourse_type', 'if_contain'])['discourse_id']
        .nunique()
        .reset_index()
    )
    train_if_contain_aggr['sum'] = train_if_contain_aggr['discourse_id'].groupby(train_if_contain_aggr['discourse_type']).transform('sum')
    train_if_contain_aggr['share'] = round(train_if_contain_aggr['discourse_id'] * 100 / train_if_contain_aggr['sum'],1)
    train_if_contain_aggr = train_if_contain_aggr.loc[train_if_contain_aggr['if_contain']].sort_values(by = 'share', ascending = False)

    # One colour per bar actually drawn (types without a match have no bar);
    # the first, i.e. highest, bar is highlighted.
    colors = ['lightslategray'] * len(train_if_contain_aggr)
    if colors:
        colors[0] = "#DEBAE6"

    fig = go.Figure(data=[go.Bar(
        x=train_if_contain_aggr['discourse_type'],
        y=train_if_contain_aggr['share'],
        name = "",
        hovertemplate='<b>%{y:.1f}% of elements in %{x} does</b>',
        marker_color=colors # marker color can be a single color value or an iterable
    )])
    fig.update_layout(title_text='How much % of the elements contain the phrase "{}" per discourse type?'.format(phrase),
                     yaxis_title="% of the elements")
    fig.show()
contain_phrase("I think")

Here you can input whatever string you want to check how much each discourse type contains.

Please be aware that for now the string is case sensitive, which means if you input 'For example' or 'for example', it will return different outcomes.

**Running the notebook is required, so you may copy/fork the notebook and play with it on your own.**

In [None]:
# Input box, button and output area for the phrase check.
# These widgets carry phrase-specific names on purpose: the essay picker
# earlier in the notebook uses `output`, `start` and `click_start`, and its
# click handler resolves `output` at click time. Rebinding those names here
# would send the essay picker's rendering into this cell's output area.
phrase_output = Output()
phrase_start = Button(description="Start")


phrase_widget = widgets.Textarea(
    placeholder='Type something',
    description='Input:',
    disabled=False
)

def show_phrase_share(b):
    """Button callback: clear the output area, then plot the share for the typed phrase."""
    with phrase_output:
        clear_output()
        contain_phrase(phrase_widget.value)

phrase_start.on_click(show_phrase_share)

# once the start button is clicked, the chart for the typed phrase is displayed
display(phrase_widget, phrase_start, phrase_output)

# Top 100 signal tokens in each discourse type

Ok, if we see 'I think', we are pretty confident that the Position of the author will show up later on.

Then what about when we see the word 'example'? That looks easy as well - the author is most likely presenting Evidence!

But what about 'may', 'however', or a single question mark '?'?

I want to know whether we have data-driven signal words for a specific discourse type, i.e. if a word occurs with a much larger probability in discourse type X than in any other, I would regard it as a signal for discourse type X.

Below is my code for computing this -

In [None]:
# Tokenize the text of each discourse type and count how often every
# lower-cased token occurs within that type.
unique_discourse = train['discourse_type'].unique()
discourse_freq = []
for d in unique_discourse:
    discourse_text = train.loc[train['discourse_type'] == d, 'discourse_text']
    # Join with a space: joining with '' glues the last word of one element
    # to the first word of the next, creating tokens that exist in no essay.
    word_freq = FreqDist(word.lower() for word in word_tokenize(' '.join(discourse_text)))
    table = pd.DataFrame(list(word_freq.items()), columns=['word', 'frequency'])
    table['type'] = d
    table['freq_rank'] = table['frequency'].rank(ascending=False)
    discourse_freq.append(table)

# Concatenate once, after the loop, with a fresh unique index.
total_table = pd.concat(discourse_freq, ignore_index=True)

# Occurrence probability of a token within its type:
# token frequency / total token frequency of that type.
total_table['sum'] = total_table.groupby('type')['frequency'].transform('sum')
total_table['occur_probability'] = total_table['frequency'] / total_table['sum']

# Mean of that probability over the types in which the token occurs
# (types where the token never occurs contribute no row, so they are not
# counted as zero).
total_table['mean'] = total_table.groupby('word')['occur_probability'].transform('mean')

# Relative deviation (in %) of the per-type probability from the token's mean.
# A large positive value means the token is over-represented in that type and
# can serve as a signal for it.
total_table['change_over_mean'] = round((total_table['occur_probability'] - total_table['mean']) * 100 / total_table['mean'],0)

e.g. the word 'example' occurs in Evidence at a probability of 0.001428%, and that value is 245% above its average occurrence probability across discourse types.
Based on that, if we see 'example' in a piece of text, our first reasonable guess of the text's discourse type should be 'Evidence'.

In [None]:
# Per-type probability and deviation from the mean for the token 'example'.
total_table.loc[
    total_table['word'] == 'example',
    ['word', 'type', 'occur_probability', 'change_over_mean'],
]

In [None]:
# Keep, for each discourse type, the tokens whose change_over_mean is within
# the 100 highest distinct values (dense rank, so ties share a rank).
total_table['type_rank'] = total_table.groupby("type")["change_over_mean"].rank(method="dense", ascending=False)
total_table_100 = total_table.loc[total_table['type_rank']<=100]

# For every kept token, its highest change_over_mean over the kept rows ...
total_table_max = total_table_100.groupby('word', as_index=False)['change_over_mean'].max()

# ... joined back to the full table to recover the type (and frequency /
# probability) in which the token reaches that value.
data_aggr = pd.merge(
    total_table_max,
    total_table,
    how="inner",
    on=['word', 'change_over_mean'],
    sort=True,
)


data_aggr = data_aggr.rename(columns={'frequency': "Frequency",
                                      'occur_probability': "Probability of occurance"})
# Scatter chart with one facet per discourse type; each point is a token.
fig = px.scatter(data_aggr,
                 x="Frequency",
                 y="Probability of occurance",
                 size = 'Probability of occurance',
                 text = 'word',
                 facet_col="type",
                 hover_data=['word']
                )

fig.update_traces(textposition='top center')

fig.update_layout(
    title_text='Top 100 discourse signal token <sup><br> If a token occurs at a larger probability in the discourse type X than any other one, we would regard this token as a signal for discourse X.</sup>'
)

fig.show()

# Does discourse type favor specific punctuation?

From the Top 100 signal tokens chart above, I see one interesting fact that in Lead type, the most outstanding token is the question mark.

Inspired by this, let's check whether each discourse type has a favored punctuation mark.

In [None]:
punctuations = ['.',',',':','-','?','!',';','"']

def count_punt(text, p):
    """Return how many times the substring ``p`` occurs in ``text``."""
    return text.count(p)

# Per-element count of every punctuation mark, kept in its own frame so that
# `train` does not gain columns named '.', ',' and so on.
# `args` must be a tuple: `(p)` is just the string `p`, which only worked
# because every mark is a single character.
punct_counts = pd.DataFrame(
    {p: train['discourse_text'].apply(count_punt, args=(p,)) for p in punctuations}
)

# Total count of every mark per discourse type. Only the punctuation columns
# are summed, instead of every column of `train` (text columns included).
train_punct = punct_counts.groupby(train['discourse_type']).sum().reset_index()
train_punct['sum'] = train_punct[punctuations].sum(axis=1)

# Share (in %) of every mark among all counted marks of the type.
p_share = []
for p in punctuations:
    share_column = p + " percentage"
    train_punct[share_column] = round(train_punct[p] * 100 / train_punct['sum'],1)
    p_share.append(share_column)

fig = px.bar(train_punct, x="discourse_type", y=p_share, title="% of punctuation occurring in each discourse type")
fig.show()

# Text clustering visualization

As inspired by the interesting work - [how to visualize text data](https://www.kaggle.com/subinium/how-to-visualize-text-dataset) from SUBIN AN, I created some cluster visualization for our dataset.

In [None]:
tfidf = TfidfVectorizer()

# Work on a random subset of at most 10000 elements. The seed is fixed so the
# same subset, and therefore the same picture, comes out on every run.
train_sample = train.sample(min(10000, len(train)), random_state=0)
docs = tfidf.fit_transform(train_sample['discourse_text'])

# Integer label of each discourse type, used as the target of the supervised
# UMAP fit and as the index into the colour palette.
type_to_label = {
    "Lead": 0,
    "Position": 1,
    "Counterclaim": 2,
    'Claim': 3,
    'Evidence': 4,
    'Concluding Statement': 5,
    'Rebuttal': 6,
}

# A type missing from the mapping gets -1 rather than silently becoming 0 (Lead).
# NOTE(review): umap-learn documents -1 as the "unlabelled" target value - confirm.
train_sample['discourse_type_transform'] = (
    train_sample['discourse_type'].map(type_to_label).fillna(-1).astype(int)
)
target = train_sample['discourse_type_transform']

umap = UMAP(random_state=0)
dr = umap.fit_transform(docs, target)

# for visualization: palette entry i belongs to label i, grey marks unknown types
dark_palette = ['green','red','yellow','blue','purple','pink','orange']

color = [dark_palette[i] if i >= 0 else 'lightgrey' for i in target]


fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
ax.axis('off')

ax.scatter(x=dr[:,0], y=dr[:,1], s=10, alpha=0.25, c=color)
ax.set_title('TFIDF Dimension Reduction', loc='left', fontsize=20, fontweight='bold')

fig.tight_layout()
plt.show()

# What is the next step?

This notebook is an ongoing work. 

Do you have any other ideas? Just let me know in the comments if you do, or whether you like the notebook (or not).

Thanks for reading!