# Citation

Much of the code and many of the examples are copied or modified from:

> *Blueprints for Text Analytics Using Python* by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), ISBN 978-1-492-07408-3.

- https://github.com/blueprints-for-text-analytics-python/blueprints-text
- https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch01/First_Insights.ipynb

---

# Setup

In [None]:
%run "/code/source/config/notebook_settings.py"

In [None]:
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, \
    count_keywords_by, impurity

In [None]:
# Location of the pickled, pre-processed dataset.
path = 'artifacts/data/processed/un-general-debates-blueprint.pkl'
with Timer("Loading Data"):
    # Only the read itself is timed.
    df = pd.read_pickle(path)

---

# Exploratory Data Analysis

This section provides a basic exploration of the text and dataset.

## Dataset Summary

In [None]:
hlp.pandas.numeric_summary(df)

In [None]:
hlp.pandas.non_numeric_summary(df)

In [None]:
df.head(2)

---

### Impurity

In [None]:
df['text'].apply(impurity).max()

### Examples

In [None]:
df['text'].iloc[0][0:1000]

In [None]:
'|'.join(df['tokens'].iloc[0])[0:1000]

In [None]:
'|'.join(df['bi_grams'].iloc[0])[0:1000]

## Explore Non-Text Columns

Explore idiosyncrasies of various columns, e.g., the same speaker being represented in multiple ways.

In [None]:
df[df['speaker'].str.contains('Bush')]['speaker'].value_counts()

---

## Explore Text Column

### Top Words Used

In [None]:
count_tokens(df['tokens']).head(20)

---

### Distribution of Text Length

In [None]:
# Horizontal box plot of text length; the y tick labels are blanked out.
ax = df['text_length'].plot(kind='box', vert=False, figsize=(10, 1))
ax.set(title="Distribution of Text Length", xlabel="# of Characters")
# Trailing semicolon keeps the returned list out of the cell output.
ax.set_yticklabels([]);

In [None]:
# Histogram of text length using 60 bins.
ax = df['text_length'].plot(kind='hist', bins=60, figsize=(10, 2))
# Trailing semicolon keeps the returned list out of the cell output.
ax.set(title="Distribution of Text Length", xlabel="# of Characters");

In [None]:
# seaborn is imported here, at first use, under the alias `sns`.
import seaborn as sns
# Histogram (60 bins) with a KDE overlay (kde=True); the trailing semicolon
# suppresses the textual repr of the returned object in the notebook output.
sns.displot(df['text_length'], bins=60, kde=True, height=3, aspect=3);

In [None]:
# Compare text length across five selected countries, first as a box plot
# and then as a violin plot; both figures get the same size and DPI.
where = df['country'].isin(['USA', 'FRA', 'GBR', 'CHN', 'RUS'])
for plot_kind in ('box', 'violin'):
    g = sns.catplot(data=df[where], x="country", y="text_length", kind=plot_kind)
    g.fig.set_size_inches(6, 3)
    g.fig.set_dpi(100)

In [None]:
# The chart below counts rows per year. That only equals the number of
# countries per year if no (year, country) pair occurs more than once,
# which this assertion verifies.
assert not df[['year', 'country']].duplicated().any()
df.groupby('year').size().plot(title="Number of Countries");

In [None]:
# Mean text length per year, with the y-axis pinned to the range 0-30,000.
(
    df
    .groupby('year')
    .agg({'text_length': 'mean'})
    .plot(title="Avg. Speech Length", ylim=(0, 30000))
);

### Word Frequency

In [None]:
counts_df = count_tokens(df['tokens'])

In [None]:
counts_df.head()

In [None]:
def plot_wordcloud(frequency_dict):
    """Draw a word cloud built from a mapping of words to weights.

    Args:
        frequency_dict: mapping that is passed unchanged to
            `WordCloud.generate_from_frequencies`.

    The cloud is rendered on a new matplotlib figure with the axes hidden;
    nothing is returned.
    """
    # The pixel canvas is the standard figure size (in inches) times 100.
    canvas_width = round(hlp.plot.STANDARD_WIDTH * 100)
    canvas_height = round(hlp.plot.STANDARD_HEIGHT * 100)

    cloud = wordcloud.WordCloud(
        width=canvas_width,
        height=canvas_height,
        background_color='white',
        colormap='tab20b',
        max_words=200,
        max_font_size=150,
        random_state=42,  # fixed seed
    )
    cloud.generate_from_frequencies(frequency_dict)

    _, ax = plt.subplots(figsize=(hlp.plot.STANDARD_WIDTH, hlp.plot.STANDARD_HEIGHT))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')

In [None]:
plot_wordcloud(counts_df.to_dict()['frequency']);

### TF-IDF

In [None]:
# TF-IDF table for uni-grams over the whole corpus (no segment columns).
tf_idf_df = tf_idf(df=df, tokens_column='tokens', segment_columns=None,
                   min_frequency_corpus=20, min_frequency_document=20)

In [None]:
# Horizontal bar chart of the 30 rows with the highest 'frequency'.
top_by_frequency = tf_idf_df.sort_values(by='frequency', ascending=False).head(30)
ax = top_by_frequency[['frequency']].plot(kind='barh', width=0.99)
ax.set(title="Frequency of Uni-Grams", xlabel="Frequency")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

In [None]:
# Horizontal bar chart of the first 30 rows of the TF-IDF table
# (presumably already ordered by tf-idf inside tf_idf() - confirm).
ax = tf_idf_df[['tf-idf']].head(30).plot(kind='barh', width=0.99)
ax.set(title="TF-IDF of Uni-Grams", xlabel="TF-IDF")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

In [None]:
plot_wordcloud(tf_idf_df.to_dict()['tf-idf']);

#### Per Year - 1970

In [None]:
# TF-IDF table for uni-grams, segmented by 'year'.
tf_idf_per_year = tf_idf(df=df, tokens_column='tokens', segment_columns='year',
                         min_frequency_corpus=10, min_frequency_document=10)

In [None]:
# Restrict the per-year table to 1970 and remove the hand-picked stop words.
stop_words = ['twenty-fifth', 'twenty-five', 'twenty', 'fifth']
tokens_to_show = (
    tf_idf_per_year
    .query('year == 1970')
    .reset_index()
    .loc[lambda frame: ~frame['token'].isin(stop_words)]
)

In [None]:
# Horizontal bar chart of the first 30 remaining tokens, labelled by token.
ax = tokens_to_show.set_index('token')[['tf-idf']].head(30).plot(kind='barh', width=0.99)
ax.set(title="TF-IDF of Uni-Grams - 1970", xlabel="TF-IDF")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

In [None]:
# Collapse the frame into a {token: tf-idf} dict for plot_wordcloud.
tokens_to_show = tokens_to_show.set_index('token')['tf-idf'].to_dict()

In [None]:
plot_wordcloud(tokens_to_show);

#### Per Year - 2015

In [None]:
# Restrict the per-year table to 2015 and remove the hand-picked stop words.
stop_words = ['seventieth']
tokens_to_show = (
    tf_idf_per_year
    .query('year == 2015')
    .reset_index()
    .loc[lambda frame: ~frame['token'].isin(stop_words)]
)

In [None]:
# Horizontal bar chart of the first 30 remaining tokens, labelled by token.
ax = tokens_to_show.set_index('token')[['tf-idf']].head(30).plot(kind='barh', width=0.99)
ax.set(title="TF-IDF of Uni-Grams - 2015", xlabel="TF-IDF")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

In [None]:
# Collapse the frame into a {token: tf-idf} dict for plot_wordcloud.
tokens_to_show = tokens_to_show.set_index('token')['tf-idf'].to_dict()

In [None]:
plot_wordcloud(tokens_to_show);

### Keywords in Context

In [None]:
# Print keyword-in-context snippets for 'sdgs' taken from the 2015 texts.
contexts = get_context_from_keyword(
    documents=df.loc[df['year'] == 2015, 'text'],
    keyword='sdgs',
    window_width=50,
    random_seed=42,
)
for snippet in contexts:
    print(snippet)

In [None]:
# Print keyword-in-context snippets for 'sids' taken from the 2015 texts.
contexts = get_context_from_keyword(
    documents=df.loc[df['year'] == 2015, 'text'],
    keyword='sids',
    window_width=50,
    random_seed=42,
)
for snippet in contexts:
    print(snippet)

In [None]:
# Print keyword-in-context snippets for 'pv' taken from the 2015 texts.
contexts = get_context_from_keyword(
    documents=df.loc[df['year'] == 2015, 'text'],
    keyword='pv',
    window_width=50,
    random_seed=42,
)
for snippet in contexts:
    print(snippet)

---

## Bi-Grams

In [None]:
# TF-IDF table for bi-grams over the whole corpus (no segment columns).
# Note: this rebinds tf_idf_df, replacing the uni-gram table.
tf_idf_df = tf_idf(df=df, tokens_column='bi_grams', segment_columns=None,
                   min_frequency_corpus=20, min_frequency_document=20)

In [None]:
# Horizontal bar chart of the 30 rows with the highest 'frequency'.
top_by_frequency = tf_idf_df.sort_values(by='frequency', ascending=False).head(30)
ax = top_by_frequency[['frequency']].plot(kind='barh', width=0.99)
ax.set(title="Frequency of Bi-Grams", xlabel="Frequency")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

In [None]:
# Horizontal bar chart of the first 30 rows of the bi-gram TF-IDF table.
ax = tf_idf_df[['tf-idf']].head(30).plot(kind='barh', width=0.99)
ax.set(title="TF-IDF of Bi-Grams", xlabel="TF-IDF")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis();

#### By Year

In [None]:
# TF-IDF table for bi-grams, segmented by 'year'.
# Note: this rebinds tf_idf_per_year, replacing the uni-gram table.
tf_idf_per_year = tf_idf(df=df, tokens_column='bi_grams', segment_columns='year',
                         min_frequency_corpus=3, min_frequency_document=3)

In [None]:
# Word cloud of 1970 bi-grams weighted by tf-idf, minus the hand-picked stop words.
stop_words = ['twenty-fifth anniversary', 'twenty-five years', 'twenty years', 'twenty fifth']
tokens_to_show = tf_idf_per_year.query('year == 1970').reset_index()
keep = ~tokens_to_show['token'].isin(stop_words)
tokens_to_show = tokens_to_show.loc[keep].set_index('token')['tf-idf'].to_dict()
plot_wordcloud(tokens_to_show);

In [None]:
# Word cloud of 2015 bi-grams weighted by tf-idf, minus the hand-picked stop words.
stop_words = ['seventieth anniversary']
tokens_to_show = tf_idf_per_year.query('year == 2015').reset_index()
keep = ~tokens_to_show['token'].isin(stop_words)
tokens_to_show = tokens_to_show.loc[keep].set_index('token')['tf-idf'].to_dict()
plot_wordcloud(tokens_to_show);

In [None]:
# Across all years (no year filter), keep the bi-grams that contain
# 'climate' and are not in the stop-word list, then display the result.
stop_words = ['seventieth anniversary']
tokens_to_show = tf_idf_per_year.reset_index()
not_stop_word = ~tokens_to_show['token'].isin(stop_words)
mentions_climate = tokens_to_show['token'].str.contains('climate')
tokens_to_show = tokens_to_show[not_stop_word & mentions_climate]
tokens_to_show

In [None]:
# The frame at this point spans every year, so the same bi-gram appears once
# per year. Building the dict via set_index('token').to_dict() would silently
# keep only the last row for each repeated token. Aggregate explicitly
# instead: each bi-gram is weighted by its highest yearly tf-idf.
tokens_to_show = tokens_to_show.groupby('token')['tf-idf'].max().to_dict()
plot_wordcloud(tokens_to_show);

---

# Count Keywords Over Time

In [None]:
# Count the selected uni-gram keywords per year and preview the result.
keyword_count_over_time = count_keywords_by(
    df=df, by='year', tokens='tokens',
    keywords=['nuclear', 'terrorism', 'climate', 'freedom'],
)
keyword_count_over_time.head()

In [None]:
# Plot the keyword counts as a line chart.
ax = keyword_count_over_time.plot(kind='line')
ax.set_title("Keyword count over time")
# Trailing semicolon suppresses the Text(...) repr in the cell output,
# matching the other plotting cells in this notebook.
ax.set_ylabel("# of Occurrences");

In [None]:
# Count the selected bi-gram keywords per year and preview the result.
keyword_count_over_time = count_keywords_by(
    df=df, by='year', tokens='bi_grams',
    keywords=['climate change', 'human rights', 'middle east'],
)
keyword_count_over_time.head()

In [None]:
# Plot the keyword counts as a line chart.
ax = keyword_count_over_time.plot(kind='line')
ax.set_title("Keyword count over time")
# Trailing semicolon suppresses the Text(...) repr in the cell output,
# matching the other plotting cells in this notebook.
ax.set_ylabel("# of Occurrences");

In [None]:
# Print keyword-in-context snippets for 'human rights' from texts dated 1980 or earlier.
contexts = get_context_from_keyword(
    documents=df.loc[df['year'] <= 1980, 'text'],
    keyword='human rights',
    window_width=50,
    random_seed=42,
)
for snippet in contexts:
    print(snippet)

---

In [None]:
# Keywords to track over time.
keywords = [
    'terrorism', 'terrorist', 'nuclear',
    'war', 'oil', 'syria',
    'syrian', 'refugees', 'migration',
    'peacekeeping', 'humanitarian', 'climate',
    'change', 'sustainable', 'sdgs',
]

# Count the keywords per year and preview the result.
freq_df = count_keywords_by(df, by='year', tokens='tokens', keywords=keywords)
freq_df.head()

In [None]:
df.groupby('year')['num_tokens'].sum().head()

In [None]:
# compute relative frequencies based on total number of tokens per year
freq_df = freq_df.div(df.groupby('year')['num_tokens'].sum(), axis=0)
# apply square root as sublinear filter for better contrast
freq_df = freq_df.apply(np.sqrt)
freq_df.head()

In [None]:
# Heatmap of the scaled keyword frequencies. freq_df is transposed before
# plotting (presumably so keywords become rows and years columns - confirm).
plt.figure(figsize=(10, 3))
# NOTE(review): sns.set(font_scale=1) is called with the same value both
# before and after the plot; the second call looks redundant - confirm
# before removing.
sns.set(font_scale=1)
sns.heatmap(
    data=freq_df.T, 
    xticklabels=True, yticklabels=True, cbar=False, cmap="Reds"
)
sns.set(font_scale=1);

---