# Citation

Much of the code and examples are copied/modified from:

> Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.
>

- https://github.com/blueprints-for-text-analytics-python/blueprints-text
- https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch01/First_Insights.ipynb

---

# Setup

In [None]:
cd ../..

In [None]:
%run "source/config/notebook_settings.py"

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
from source.library.utilities import Timer, get_logger
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity

In [None]:
# Load the processed Reddit dataset from the pickled DataFrame; the read itself
# runs inside the project's Timer context manager labelled "Loading Data".
path = 'artifacts/data/processed/reddit.pkl'
with Timer("Loading Data"):
    df = pd.read_pickle(path)

---

# Exploratory Data Analysis

This section provides a basic exploration of the text and dataset.

## Dataset Summary

In [None]:
# Show the first row as a quick look at the available columns.
df.head(1)

### Numeric Summary

In [None]:
# Numeric-column summary via the project's `hlp` helper (output layout is defined there).
hlp.pandas.numeric_summary(df)

### Non-Numeric

In [None]:
# Non-numeric-column summary via the project's `hlp` helper (output layout is defined there).
hlp.pandas.non_numeric_summary(df)

---

### Examples

In [None]:
# First 1,000 characters of the first row's `post` value.
df['post'].iloc[0][0:1000]

In [None]:
# First row's `partial_lemmas`, joined with '|' so token boundaries stay visible; cut to 1,000 characters.
'|'.join(df['partial_lemmas'].iloc[0])[0:1000]

In [None]:
# First row's `bi_grams`, joined with '|' so token boundaries stay visible; cut to 1,000 characters.
'|'.join(df['bi_grams'].iloc[0])[0:1000]

In [None]:
# First row's `noun_phrases`, joined with '|' so phrase boundaries stay visible; cut to 1,000 characters.
'|'.join(df['noun_phrases'].iloc[0])[0:1000]

## Explore Non-Text Columns

#### Impurity

In [None]:
# Horizontal box plot of the `impurity` column. The y tick labels are blanked
# because there is only a single box, and the trailing ';' keeps the notebook
# from echoing the return value.
ax = df['impurity'].plot.box(vert=False, figsize=(10, 1))
ax.set(
    title="Distribution of Post Impurity",
    xlabel="Impurity",
    yticklabels=[],
);

In [None]:
# Rows with the highest impurity first, with `post` and `post_clean` side by side for comparison.
df[['impurity', 'post', 'post_clean']].sort_values('impurity', ascending=False).head()

In [None]:
# Number of rows per `language` value, most frequent first.
df['language'].value_counts(ascending=False)

In [None]:
# Number of rows per `subreddit` value, most frequent first.
df['subreddit'].value_counts(ascending=False)

Explore idiosyncrasies of various columns, e.g. same speaker represented multiple ways.

---

## Explore Text Column

### Top Words Used

In [None]:
# Tokens excluded from the counts. `remove_tokens` is reused by later cells
# (word-frequency counts and the uni-gram / noun TF-IDF calls).
remove_tokens = {'_number_', 'car'}
# First ten rows of the token-count table for the `partial_lemmas` column.
count_tokens(df['partial_lemmas'], remove_tokens=remove_tokens).head(10)

---

### Distribution of Text Length

In [None]:
# Horizontal box plot of post length in characters. The single box needs no
# y tick label, and the trailing ';' suppresses the echoed return value.
ax = df['post_length'].plot.box(vert=False, figsize=(10, 1))
ax.set(
    title="Distribution of Post Length",
    xlabel="# of Characters",
    yticklabels=[],
);

In [None]:
# 60-bin histogram of post length in characters; the trailing ';' suppresses
# the echoed return value.
ax = df['post_length'].plot.hist(bins=60, figsize=(10, 2))
ax.set(
    title="Distribution of Post Length",
    xlabel="# of Characters",
);

In [None]:
# seaborn (`sns`) is used by this cell and by the per-subreddit catplots that follow.
import seaborn as sns
# Same post-length distribution as a 60-bin histogram with a KDE overlay;
# the trailing ';' suppresses the echoed return value.
sns.displot(df['post_length'], bins=60, kde=True, height=3, aspect=3);

In [None]:
# Compare post length across five selected subreddits, drawing the same data
# first as box plots and then as violin plots.
where = df['subreddit'].isin(['Lexus', 'mercedes_benz', 'Audi', 'Volvo', 'BMW'])
for plot_kind in ('box', 'violin'):
    g = sns.catplot(data=df[where], x="subreddit", y="post_length", kind=plot_kind)
    # catplot creates its own figure, so size and resolution are set afterwards
    g.fig.set_size_inches(6, 3)
    g.fig.set_dpi(100)

### Word Frequency

In [None]:
# Token counts for the `partial_lemmas` column, excluding `remove_tokens`
# (defined in the "Top Words Used" cell); used for the word cloud below.
counts_df = count_tokens(df['partial_lemmas'], remove_tokens=remove_tokens)

In [None]:
def plot_wordcloud(frequency_dict, title=None):
    """Draw a word cloud from a token -> weight mapping on a new figure.

    Args:
        frequency_dict: Mapping of token to numeric weight (for example a raw
            count or a TF-IDF score). It is handed unchanged to
            ``WordCloud.generate_from_frequencies``.
        title: Optional title for the plot. No title is drawn when ``None``
            (the default), which matches the previous behaviour.

    Returns:
        None. The cloud is rendered onto a newly created matplotlib figure.
    """
    cloud = wordcloud.WordCloud(
        background_color='white',
        colormap='tab20b',
        # Canvas size: the standard figure dimensions scaled by 100.
        width=round(hlp.plot.STANDARD_WIDTH * 100),
        height=round(hlp.plot.STANDARD_HEIGHT * 100),
        max_words=200,
        max_font_size=150,
        random_state=42,  # fixed seed so repeated runs produce the same layout
    )
    cloud.generate_from_frequencies(frequency_dict)

    _, ax = plt.subplots(figsize=(hlp.plot.STANDARD_WIDTH, hlp.plot.STANDARD_HEIGHT))
    ax.imshow(cloud, interpolation='bilinear')
    if title is not None:
        ax.set_title(title)
    # Hide ticks and frame; act on this axes explicitly rather than on
    # pyplot's implicit "current axes".
    ax.axis('off')

In [None]:
# Word cloud weighted by the `frequency` column of counts_df.
plot_wordcloud(counts_df.to_dict()['frequency']);

### TF-IDF

In [None]:
# Corpus-level TF-IDF for the uni-gram lemmas (no segmentation), leaving out
# the tokens in `remove_tokens`. Threshold semantics live in the project helper.
tf_idf_lemmas = tf_idf(df=df, tokens_column='partial_lemmas', segment_columns=None,
                       min_frequency_corpus=20, min_frequency_document=20,
                       remove_tokens=remove_tokens)
tf_idf_lemmas.head()

In [None]:
# Multi-word tokens to exclude; reused by the bi-gram and noun-phrase TF-IDF cells.
remove_tokens_bi_grams = {'_number_ year', '_number_ _number_', 'hey guy'}
# Corpus-level TF-IDF for bi-grams (no segmentation).
tf_idf_bi_grams = tf_idf(df=df, tokens_column='bi_grams', segment_columns=None,
                         min_frequency_corpus=20, min_frequency_document=20,
                         remove_tokens=remove_tokens_bi_grams)
tf_idf_bi_grams.head()

In [None]:
# Corpus-level TF-IDF for the `nouns` tokens (no segmentation), leaving out
# the tokens in `remove_tokens`.
tf_idf_nouns = tf_idf(df=df, tokens_column='nouns', segment_columns=None,
                      min_frequency_corpus=20, min_frequency_document=20,
                      remove_tokens=remove_tokens)
tf_idf_nouns.head()

In [None]:
# Corpus-level TF-IDF for noun phrases (no segmentation). Phrases are
# multi-word tokens, so the bi-gram removal set is applied here.
tf_idf_noun_phrases = tf_idf(df=df, tokens_column='noun_phrases', segment_columns=None,
                             min_frequency_corpus=20, min_frequency_document=20,
                             remove_tokens=remove_tokens_bi_grams)
tf_idf_noun_phrases.head()

---

In [None]:
# Horizontal bars for the first 30 uni-gram rows; the y axis is flipped so
# the first row appears at the top.
ax = tf_idf_lemmas.head(30)[['tf-idf']].plot.barh(width=0.99)
ax.set(title="TF-IDF of Uni-Grams", xlabel="TF-IDF")
ax.invert_yaxis();

In [None]:
# Horizontal bars for the first 30 bi-gram rows; the y axis is flipped so
# the first row appears at the top.
ax = tf_idf_bi_grams.head(30)[['tf-idf']].plot.barh(width=0.99)
ax.set(title="TF-IDF of Bi-Grams", xlabel="TF-IDF")
ax.invert_yaxis();

In [None]:
# Horizontal bars for the first 30 noun rows; the y axis is flipped so the
# first row appears at the top. The title names the data actually plotted
# (it previously said "Bi-Grams", copied from the cell above).
ax = tf_idf_nouns.head(30)[['tf-idf']].plot(kind='barh', width=0.99)
ax.set_title("TF-IDF of Nouns")
ax.set_xlabel("TF-IDF")
ax.invert_yaxis();

In [None]:
# Horizontal bars for the first 30 noun-phrase rows; the y axis is flipped so
# the first row appears at the top. The title names the data actually plotted
# (it previously said "Bi-Grams", copied from an earlier cell).
ax = tf_idf_noun_phrases.head(30)[['tf-idf']].plot(kind='barh', width=0.99)
ax.set_title("TF-IDF of Noun Phrases")
ax.set_xlabel("TF-IDF")
ax.invert_yaxis();

In [None]:
# Word cloud of uni-grams weighted by the `tf-idf` column of the corpus-level table.
plot_wordcloud(tf_idf_lemmas.to_dict()['tf-idf']);

In [None]:
# Word cloud of bi-grams weighted by the `tf-idf` column of the corpus-level table.
plot_wordcloud(tf_idf_bi_grams.to_dict()['tf-idf']);

#### By Subreddit

In [None]:
# Lower-cased subreddit names, added to the removal set in the per-subreddit
# uni-gram and noun TF-IDF cells below.
remove_tokens_subreddit = set(df['subreddit'].str.lower().unique())
remove_tokens_subreddit

In [None]:
# Per-subreddit TF-IDF for uni-gram lemmas; the subreddit names are removed
# along with the general `remove_tokens` set.
tf_idf_lemmas_per_sub = tf_idf(df=df, tokens_column='partial_lemmas', segment_columns='subreddit',
                               min_frequency_corpus=10, min_frequency_document=10,
                               remove_tokens=remove_tokens | remove_tokens_subreddit)
tf_idf_lemmas_per_sub.head(5)

In [None]:
# Per-subreddit TF-IDF for bi-grams, excluding the bi-gram removal set.
tf_idf_bigrams_per_sub = tf_idf(df=df, tokens_column='bi_grams', segment_columns='subreddit',
                                min_frequency_corpus=10, min_frequency_document=10,
                                remove_tokens=remove_tokens_bi_grams)
tf_idf_bigrams_per_sub.head(5)

In [None]:
# Per-subreddit TF-IDF for nouns; the subreddit names are removed along with
# the general `remove_tokens` set.
tf_idf_nouns_per_sub = tf_idf(df=df, tokens_column='nouns', segment_columns='subreddit',
                              min_frequency_corpus=10, min_frequency_document=10,
                              remove_tokens=remove_tokens | remove_tokens_subreddit)
tf_idf_nouns_per_sub.head(5)

In [None]:
# Per-subreddit TF-IDF for noun phrases, excluding the bi-gram removal set.
tf_idf_nounphrases_per_sub = tf_idf(df=df, tokens_column='noun_phrases', segment_columns='subreddit',
                                    min_frequency_corpus=10, min_frequency_document=10,
                                    remove_tokens=remove_tokens_bi_grams)
tf_idf_nounphrases_per_sub.head(5)

---

In [None]:
# Keep only the Lexus and Volvo rows of the per-subreddit uni-gram table and
# move the index into ordinary columns so it can be used for plotting.
tokens_to_show = (
    tf_idf_lemmas_per_sub
    .query("subreddit in ['Lexus', 'Volvo']")
    .reset_index()
)
tokens_to_show.head()

In [None]:
# Grouped bar chart of the first 20 rows per subreddit, ordered by TF-IDF.
# NOTE(review): head(20) is the "top 20" only if tf_idf() returns rows already
# sorted by score within each subreddit — confirm against the helper.
px.bar(
    data_frame=(
        tokens_to_show
        .groupby(['subreddit'])
        .head(20)
        .sort_values('tf-idf', ascending=True)
    ),
    x='tf-idf', y='token', color='subreddit', barmode='group',
    title="Top 20 Lemmas for Volvo & Lexus",
)

In [None]:
# Keep only the Lexus and Volvo rows of the per-subreddit bi-gram table and
# move the index into ordinary columns so it can be used for plotting.
tokens_to_show = (
    tf_idf_bigrams_per_sub
    .query("subreddit in ['Lexus', 'Volvo']")
    .reset_index()
)
tokens_to_show.head()

In [None]:
# Grouped bar chart of the first 20 rows per subreddit, ordered by TF-IDF.
# NOTE(review): head(20) is the "top 20" only if tf_idf() returns rows already
# sorted by score within each subreddit — confirm against the helper.
px.bar(
    data_frame=(
        tokens_to_show
        .groupby(['subreddit'])
        .head(20)
        .sort_values('tf-idf', ascending=True)
    ),
    x='tf-idf', y='token', color='subreddit', barmode='group',
    title="Top 20 Bi-Grams for Volvo & Lexus",
)

---

In [None]:
# Contexts around the keyword 'think' in posts from the Lexus subreddit (helper defaults for everything else).
get_context_from_keyword(df.query("subreddit == 'Lexus'")['post'], keyword='think')

In [None]:
# Contexts around the keyword 'think' in posts from the Volvo subreddit (helper defaults for everything else).
get_context_from_keyword(df.query("subreddit == 'Volvo'")['post'], keyword='think')

##### Lexus

In [None]:
# Build a token -> TF-IDF mapping for the Lexus subreddit and draw it as a word cloud.
tokens_to_show = (
    tf_idf_lemmas_per_sub
    .query("subreddit == 'Lexus'")
    .reset_index()
    .set_index('token')['tf-idf']
    .to_dict()
)
plot_wordcloud(tokens_to_show);

##### Volvo

In [None]:
# Build a token -> TF-IDF mapping for the Volvo subreddit and draw it as a word cloud.
tokens_to_show = (
    tf_idf_lemmas_per_sub
    .query("subreddit == 'Volvo'")
    .reset_index()
    .set_index('token')['tf-idf']
    .to_dict()
)
plot_wordcloud(tokens_to_show);

### Keywords in Context

In [None]:
# Print keyword-in-context samples for 'replac' across all posts.
# NOTE(review): 'replac' looks like a deliberate stem (replace / replacement /
# replacing) — confirm how the helper matches keywords.
contexts = get_context_from_keyword(
    documents=df['post'], keyword='replac', window_width=50,
    num_samples=20, random_seed=42,
)
for snippet in contexts:
    print(snippet)

---