# Introduction to nlplot

nlplot is a package for visualizing natural-language text data.
In this kernel, we show how to use nlplot to analyze tweets.


## "nlplot" Description
nlplot makes it easy to visualize natural language data, enabling quicker analysis.

You can draw the following graphs:

1. N-gram bar chart
2. N-gram tree Map
3. Histogram of the word count
4. wordcloud
5. co-occurrence networks
6. sunburst chart
7. pyLDAvis

(Tested in English and Japanese)

## Contributions Welcome!
### GitHub → https://github.com/takapy0210/nlplot


# Install

In [None]:
# Install the nlplot package with pip (notebook shell escape).
! pip install nlplot

# import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nlplot
from plotly.subplots import make_subplots
import plotly.express as px

# Relax pandas' display limits so wide frames and long tweet texts print in
# full, and render floats with three decimal places.
pd.options.display.max_columns = 300
pd.options.display.max_rows = 300
pd.options.display.max_colwidth = 5000
pd.set_option('display.float_format', '{:.3f}'.format)

# Data Loading

In [None]:
# Load train.csv of the tweet-sentiment-extraction dataset from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')

In [None]:
# Sample 10,000 rows for visualization; random_state is fixed so the same
# rows are drawn on every run.
train = train.sample(n=10000, random_state=0)

In [None]:
# Convert text to lowercase.
# The vectorized .str accessor leaves missing values (NaN) untouched, whereas
# calling x.lower() on each element raises AttributeError when a tweet is NaN.
train['text'] = train['text'].str.lower()

In [None]:
# Preview the first rows and the (rows, columns) shape of the sampled data.
display(train.head(), train.shape)

In [None]:
# Count the tweets per sentiment label and draw the counts as a bar chart
# with the value printed above each bar.
df = train.groupby('sentiment').size().reset_index(name='count')
fig = (
    px.bar(df, x='sentiment', y='count', text='count')
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    .update_layout(
        title='sentiment counts',
        xaxis_title='sentiment',
        width=700,
        height=500,
    )
)
fig.show()

# Using nlplot

Create one NLPlot instance for all the data and one for each sentiment (positive/neutral/negative).

In [None]:
# initialize
# One plotter over the whole sample, then one per sentiment label; every
# plotter reads its documents from the 'text' column.
npt = nlplot.NLPlot(train, target_col='text')
npt_negative = nlplot.NLPlot(train[train['sentiment'] == 'negative'], target_col='text')
npt_neutral = nlplot.NLPlot(train[train['sentiment'] == 'neutral'], target_col='text')
npt_positive = nlplot.NLPlot(train[train['sentiment'] == 'positive'], target_col='text')

nlplot can also compute a stopword list from the data.

In [None]:
# Ask the all-data plotter for a stopword list (top_n=30, min_freq=0) and print it.
# The same list is passed to the plots below.
# NOTE(review): presumably top_n selects the 30 most frequent words — confirm against the nlplot docs.
stopwords = npt.get_stopword(top_n=30, min_freq=0)
print(stopwords)

## N-gram bar chart

In [None]:
# uni-gram bar chart for the whole sample (ngram=1, top_n=50), with the
# computed stopwords passed in.
npt.bar_ngram(
    title='uni-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=1,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

In [None]:
# bi-gram bar chart for the whole sample (ngram=2, top_n=50), with the
# computed stopwords passed in.
npt.bar_ngram(
    title='bi-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=2,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

In [None]:
# tri-gram bar chart for the whole sample (ngram=3, top_n=50), with the
# computed stopwords passed in.
npt.bar_ngram(
    title='tri-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=3,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

### Comparison of each sentiment

In [None]:
# positive/neutral/negative
# The three uni-gram charts use exactly the same options, so the options are
# declared once and unpacked into each call. The returned figures are kept
# so their traces can be reused in a combined subplot.
_unigram_options = dict(
    title='uni-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=1,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

fig_unigram_positive = npt_positive.bar_ngram(**_unigram_options)
fig_unigram_neutral = npt_neutral.bar_ngram(**_unigram_options)
fig_unigram_negative = npt_negative.bar_ngram(**_unigram_options)

In [None]:
# subplot
# Take the first trace of each sentiment's uni-gram figure and lay the three
# out side by side, one column per sentiment.
trace1, trace2, trace3 = (
    sentiment_fig['data'][0]
    for sentiment_fig in (fig_unigram_positive, fig_unigram_neutral, fig_unigram_negative)
)

fig = make_subplots(rows=1, cols=3, subplot_titles=('positive', 'neutral', 'negative'), shared_xaxes=False)
fig.update_layout(height=1100, width=1000, title_text='unigram positive vs neutral vs negative')

for col, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.update_xaxes(title_text='word count', row=1, col=col)
    fig.add_trace(trace, row=1, col=col)

fig.show()

In [None]:
# positive/neutral/negative
# The three bi-gram charts use exactly the same options, so the options are
# declared once and unpacked into each call. The returned figures are kept
# so their traces can be reused in a combined subplot.
_bigram_options = dict(
    title='bi-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=2,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

fig_bigram_positive = npt_positive.bar_ngram(**_bigram_options)
fig_bigram_neutral = npt_neutral.bar_ngram(**_bigram_options)
fig_bigram_negative = npt_negative.bar_ngram(**_bigram_options)

In [None]:
# subplot
# Take the first trace of each sentiment's bi-gram figure and lay the three
# out side by side, one column per sentiment.
trace1, trace2, trace3 = (
    sentiment_fig['data'][0]
    for sentiment_fig in (fig_bigram_positive, fig_bigram_neutral, fig_bigram_negative)
)

fig = make_subplots(rows=1, cols=3, subplot_titles=('positive', 'neutral', 'negative'), shared_xaxes=False)
fig.update_layout(height=1100, width=1000, title_text='bigram positive vs neutral vs negative')

for col, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.update_xaxes(title_text='word count', row=1, col=col)
    fig.add_trace(trace, row=1, col=col)

fig.show()

## Tree Map

In [None]:
# Uni-gram (ngram=1) tree map over the whole sample, with the computed stopwords passed in.
npt.treemap(
    title='All sentiment Tree of Most Common Words',
    ngram=1,
    stopwords=stopwords,
)

In [None]:
# Uni-gram (ngram=1) tree map for the positive tweets only.
npt_positive.treemap(
    title='Positive Tree of Most Common Words',
    ngram=1,
    stopwords=stopwords,
)

In [None]:
# Uni-gram (ngram=1) tree map for the neutral tweets only.
npt_neutral.treemap(
    title='Neutral Tree of Most Common Words',
    ngram=1,
    stopwords=stopwords,
)

In [None]:
# Uni-gram (ngram=1) tree map for the negative tweets only.
npt_negative.treemap(
    title='Negative Tree of Most Common Words',
    ngram=1,
    stopwords=stopwords,
)

## Histogram of the word count

In [None]:
# Word-count histogram over the whole sample.
npt.word_distribution(
    title='number of words distribution'
)

### positive/neutral/negative

In [None]:
# Build one word-count figure per sentiment (default options) and keep the
# returned figures so their traces can be combined in a single subplot.
fig_wd_positive = npt_positive.word_distribution()
fig_wd_neutral = npt_neutral.word_distribution()
fig_wd_negative = npt_negative.word_distribution()

In [None]:
# Stack the first trace of each sentiment's word-count figure vertically,
# one row per sentiment, on a shared x axis.
trace1, trace2, trace3 = (
    sentiment_fig['data'][0]
    for sentiment_fig in (fig_wd_positive, fig_wd_neutral, fig_wd_negative)
)

fig = make_subplots(rows=3, cols=1, subplot_titles=('positive', 'neutral', 'negative'), shared_xaxes=True)
fig.update_layout(height=1200, width=900, title_text='words distribution positive vs neutral vs negative')

for row, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.add_trace(trace, row=row, col=1)

fig.show()

# Word cloud

In [None]:
# All sentiment: word cloud over the whole sample, with the computed
# stopwords and the 'tab20_r' colormap passed in.
npt.wordcloud(
    stopwords=stopwords,
    colormap='tab20_r',
)

### positive/neutral/negative

In [None]:
# positive: word cloud for the positive tweets only.
npt_positive.wordcloud(
    stopwords=stopwords,
    colormap='tab20_r',
)

In [None]:
# neutral: word cloud for the neutral tweets only.
npt_neutral.wordcloud(
    stopwords=stopwords,
    colormap='tab20_r',
)

In [None]:
# negative: word cloud for the negative tweets only.
npt_negative.wordcloud(
    stopwords=stopwords,
    colormap='tab20_r',
)

# Co-occurrence network

In [None]:
# Build the co-occurrence graph for each plotter, passing the same stopwords.
# The all-data plotter uses a higher min_edge_frequency (25) than the
# per-sentiment plotters (10).
npt.build_graph(stopwords=stopwords, min_edge_frequency=25)
npt_positive.build_graph(stopwords=stopwords, min_edge_frequency=10)
npt_neutral.build_graph(stopwords=stopwords, min_edge_frequency=10)
npt_negative.build_graph(stopwords=stopwords, min_edge_frequency=10)

In [None]:
# graph data: preview the first rows of the all-data plotter's node and edge tables.
# NOTE(review): node_df / edge_df are presumably populated by build_graph — confirm.
display(
    npt.node_df.head(),
    npt.edge_df.head(),
)

In [None]:
# all data: co-occurrence network plot for the whole sample.
npt.co_network(
    title='All sentiment Co-occurrence network',
    color_palette='hls',
    width=1000,
    height=1200,
)

### positive/neutral/negative

In [None]:
# Co-occurrence network plot for the positive tweets only.
npt_positive.co_network(
    title='Positive Co-occurrence network',
    color_palette='hls',
    width=1000,
    height=1200,
)

In [None]:
# Co-occurrence network plot for the neutral tweets only.
npt_neutral.co_network(
    title='Neutral Co-occurrence network',
    color_palette='hls',
    width=1000,
    height=1200,
)

In [None]:
# Co-occurrence network plot for the negative tweets only.
npt_negative.co_network(
    title='Negative Co-occurrence network',
    color_palette='hls',
    width=1000,
    height=1200,
)

# sunburst chart

In [None]:
# Sunburst chart for the whole sample, using the 'Oryel' continuous color scale.
npt.sunburst(
    title='All sentiment sunburst chart',
    colorscale=True,
    color_continuous_scale='Oryel',
    width=1000,
    height=800,
)

### positive/neutral/negative

In [None]:
# Sunburst chart for the positive tweets only.
npt_positive.sunburst(
    title='Positive sunburst chart',
    colorscale=True,
    color_continuous_scale='Oryel',
    width=1000,
    height=800,
)

In [None]:
# Sunburst chart for the neutral tweets only.
npt_neutral.sunburst(
    title='Neutral sunburst chart',
    colorscale=True,
    color_continuous_scale='Oryel',
    width=1000,
    height=800,
)

In [None]:
# Sunburst chart for the negative tweets only.
npt_negative.sunburst(
    title='Negative sunburst chart',
    colorscale=True,
    color_continuous_scale='Oryel',
    width=1000,
    height=800,
)

# pyLDAvis

In [None]:
# pyLDAvis topic visualization over the whole sample (num_topics=3, passes=5).
# NOTE(review): save=False presumably skips writing the result to a file — confirm against the nlplot docs.
npt.ldavis(num_topics=3, passes=5, save=False)