In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Import packages
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import scripts
from wordcloud import WordCloud

# Locations of the build data: the data root and the folder holding the
# pickled data frames that this notebook reads (relative paths).
path_data = "../../../bld/data/"
path_df = path_data + "df"

from stargazer.stargazer import Stargazer

In [None]:
##########################
# Analysis
##########################
"""
Outcomes already included:
    1. Frequencies of lemmas & occurrences of some topics
    2. Complexity
    3. Sentiment Analysis
Possible Other Outcomes:
    1. Ngrams
    2. Variables from Topic Modelling

Q: Does Cb Communication change during populist governments?

Analysis:
    - Descriptives
    - reg outcome ~ political variable

"""

##########################
### Frequencies
##########################

# Load the speech-level dataset from the pickled data frame folder.
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project's own pipeline.
df_sp = pd.read_pickle(os.path.join(path_df, "merged_final_ind.pickle"))

In [None]:
# Project paths (relative to the notebook's working directory).
# NOTE(review): path_data repeats the assignment made in the first cell, and
# path_bld holds the same value as path_df defined there.
path_data = "../../../bld/data/"
path_bld = path_data + "df"
# Output folders: figures are saved to path_plots, LaTeX tables to path_tables.
path_plots = "../../../bld/plots"
path_tables = "../../../bld/tables"

In [None]:
def make_folders(paths=None):
    """Create the output folders if they do not exist yet.

    Args:
        paths: Iterable of directory paths to create. When omitted, the
            notebook's plot and table folders (``path_plots`` and
            ``path_tables``) are used.

    Missing parent directories are created as well, and folders that already
    exist are left untouched.
    """
    if paths is None:
        paths = [path_plots, path_tables]
    for path in paths:
        # makedirs also builds missing parents (os.mkdir raises
        # FileNotFoundError when a parent folder is absent) and exist_ok
        # removes the check-then-create race of the exists()/mkdir() pair.
        os.makedirs(path, exist_ok=True)


# Make sure the plot and table output folders exist before anything is saved.
make_folders()

In [None]:
# Using every speech takes too long, so draw a reproducible random sample of
# speeches (capped at the number of speeches available, so small datasets do
# not make sample() raise).
# "lemma_sep" is later handed to gensim as one token collection per speech, so
# list/tuple entries are joined token by token; calling str() on a list would
# leak brackets, commas and quote characters into the word-cloud text.
# Any other entry type is converted with str() exactly as before.
text = " ".join(
    " ".join(str(token) for token in doc) if isinstance(doc, (list, tuple)) else str(doc)
    for doc in df_sp["lemma_sep"].sample(n=min(200, len(df_sp)), random_state=1)
)

In [None]:
# Generate a word cloud image from the sampled speech text.
wordcloud = WordCloud(background_color="white").generate(text)

# Display the generated image with matplotlib (plt is imported in the first cell).
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# plt.show has to be called; `test = plt.show` only bound the function object
# to a name and never executed it.
plt.show()
# TODO: what are those asterisks?

In [None]:
def make_labels_dict():
    """Return the mapping from measure column names to display labels.

    Returns:
        dict: keys are the measure column names, values are the
        human-readable labels used as plot titles.
    """
    return {
        "growth_count": "Growth References",
        "inflation_count": "Inflation References",
        "inequality_count": "Inequality References",
        "climate_count": "Climate References",
        "num_words": "Number of Words",
        "flesch_ease": "Flesch Reading Ease Index",
        "ari": "Automated Readability Index",
        "gunning_fog": "Gunning Fog Index",
        "sent_subj": "Subjectivity Sentiments",
        "sent_pol": "Polarization Sentiments",
    }

In [None]:
# Column-name -> display-label mapping and the plain list of display labels.
# Both come from make_labels_dict() so the labels are defined in one place.
# The previous version zipped the labels against `plot_data.columns`, but
# `plot_data` does not exist yet when this cell runs on a fresh kernel
# (NameError), and the module-level `plot_data` created further down only
# holds the "CBIE" column.
labels_dict = make_labels_dict()
clean_labels = list(labels_dict.values())

In [None]:
def make_historical_plots(data):
    """Plot the yearly mean of each speech measure in a 5x2 grid of line plots.

    Args:
        data: DataFrame with a "year" column and the ten measure columns
            selected below.

    Returns:
        The matplotlib Figure holding one panel per measure.
    """
    measures = [
        "growth_count",
        "inflation_count",
        "inequality_count",
        "climate_count",
        "num_words",
        "flesch_ease",
        "ari",
        "gunning_fog",
        "sent_subj",
        "sent_pol",
    ]
    yearly_means = data[["year", *measures]].groupby("year").agg("mean")

    titles = make_labels_dict()
    fig, axes = plt.subplots(5, 2, figsize=(8, 8))
    # One panel per measure: years on the x axis, yearly mean on the y axis.
    for panel, measure in zip(axes.flatten(), yearly_means.columns):
        panel.set_title(titles[measure])
        panel.plot(yearly_means.index, yearly_means[measure])

    fig.tight_layout()
    return fig

In [None]:
# Draw the yearly averages for the speech-level dataset.
make_historical_plots(df_sp)

In [None]:
def make_crosssectional_plots(
    data,
    countries=(
        "United States",
        "Germany",
        "ECB",
        "China",
        "United Kingdom",
        "Japan",
        "France",
    ),
):
    """Plot the per-country mean of each speech measure as sorted bar charts.

    Args:
        data: DataFrame with a "country" column and the ten measure columns
            selected below.
        countries: Values of "country" to include in the comparison.

    Returns:
        The matplotlib Figure holding one bar chart per measure (5x2 grid).
    """
    measures = [
        "growth_count",
        "inflation_count",
        "inequality_count",
        "climate_count",
        "num_words",
        "flesch_ease",
        "ari",
        "gunning_fog",
        "sent_subj",
        "sent_pol",
    ]
    # Filter on the frame that was passed in. The earlier version built the
    # mask from the global df_sp, so any other input was filtered with a
    # foreign (possibly misaligned) mask.
    country_means = (
        data.loc[data["country"].isin(list(countries)), ["country", *measures]]
        .groupby("country")
        .agg("mean")
    )

    titles = make_labels_dict()
    fig, axes = plt.subplots(5, 2, figsize=(8, 16))

    for panel, measure in zip(axes.flatten(), country_means.columns):
        # Bars are ordered by the value of the measure shown in this panel.
        ordered = country_means[measure].sort_values()
        positions = list(range(len(ordered)))
        panel.bar(positions, ordered.to_numpy())
        # Fix the tick positions before labelling them, so every label is
        # attached to a known bar.
        panel.set_xticks(positions)
        panel.set_xticklabels(
            ordered.index,
            rotation=60,
            ha="right",
            rotation_mode="anchor",
        )
        panel.set_title(titles[measure])

    fig.tight_layout()
    return fig

In [None]:
# Quick look at the country column of the speech-level data.
df_sp["country"]

In [None]:
def make_politics_plots(data):
    """Compare the mean of each speech measure across populism categories.

    Rows are labelled "Populist Left" when ``left == 1``, otherwise
    "Populist Right" when ``right == 1``, otherwise "Not Populist". Rows whose
    country is "ECB" are excluded before averaging.

    Args:
        data: DataFrame with "left", "right", "country" and the ten measure
            columns selected below.

    Returns:
        The matplotlib Figure holding one bar chart per measure (5x2 grid).
    """
    measures = [
        "growth_count",
        "inflation_count",
        "inequality_count",
        "climate_count",
        "num_words",
        "flesch_ease",
        "ari",
        "gunning_fog",
        "sent_subj",
        "sent_pol",
    ]
    government_type = np.where(
        data["left"] == 1,
        "Populist Left",
        np.where(data["right"] == 1, "Populist Right", "Not Populist"),
    )
    not_ecb = ~data["country"].isin(["ECB"])

    group_means = (
        data[["left", "right", "country", *measures]]
        .assign(pop=government_type)
        .loc[not_ecb]
        .drop(["country", "left", "right"], axis=1)
        .groupby("pop")
        .agg("mean")
    )

    titles = make_labels_dict()
    fig, axes = plt.subplots(5, 2, figsize=(6, 12))

    for panel, measure in zip(axes.flatten(), group_means.columns):
        panel.bar(group_means.index, group_means[measure])
        panel.set_xticklabels(
            group_means.index,
            rotation=60,
            ha="right",
            rotation_mode="anchor",
        )
        panel.set_title(titles[measure])

    fig.tight_layout()
    return fig

In [None]:
def make_cbi_pol_plot(data):
    """Bar-plot the mean CBIE value per populism category via pandas plotting.

    Rows are labelled "Populist Left" when ``left == 1``, otherwise
    "Populist Right" when ``right == 1``, otherwise "Not Populist".

    NOTE(review): the next cell defines another function with this same name,
    which replaces this one as soon as that cell runs. Consider deleting or
    renaming one of the two.

    Args:
        data: DataFrame with "CBIE", "left" and "right" columns.

    Returns:
        The object returned by ``DataFrame.plot.bar()``.
    """
    # Derive the labels from the frame that was passed in. The earlier version
    # read the global df_sp here, so the labels did not follow `data`.
    populist = np.where(
        data["left"] == 1,
        "Populist Left",
        np.where(data["right"] == 1, "Populist Right", "Not Populist"),
    )
    return (
        data[["CBIE", "left", "right"]]
        .assign(populist=populist)[["populist", "CBIE"]]
        .groupby("populist")
        .agg("mean")
        .plot.bar()
    )

In [None]:
def make_cbi_pol_plot(data):
    """Bar-plot the mean CBIE value per populism category.

    Rows are labelled "Populist Left" when ``left == 1``, otherwise
    "Populist Right" when ``right == 1``, otherwise "Not Populist".

    Args:
        data: DataFrame with "CBIE", "left" and "right" columns.

    Returns:
        The matplotlib Figure containing the bar chart.
    """
    populism_label = np.where(
        data["left"] == 1,
        "Populist Left",
        np.where(data["right"] == 1, "Populist Right", "Not Populist"),
    )
    labelled = data[["CBIE", "left", "right"]].assign(populist=populism_label)
    plot_data = labelled[["populist", "CBIE"]].groupby("populist").agg("mean")

    fig = plt.figure()
    # The group labels live in the index after the groupby.
    plt.bar(plot_data.index, plot_data["CBIE"], width=0.3)

    return fig


# Save the chart for the speech-level data into the plots folder.
make_cbi_pol_plot(df_sp).savefig(
    os.path.join(path_plots, "politics_cbi_plot.png"),
)

In [None]:
# Binary independence flag: 1 where CBIE >= 0.4, otherwise 0. Missing CBIE
# values also end up as 0, because the comparison is False for NaN.
# NOTE(review): df_sp_cbi is not created in any cell shown in this notebook —
# confirm where it comes from; on a fresh kernel this cell raises NameError.
df_sp_cbi["cbi_bin"] = np.where(df_sp_cbi["CBIE"] >= 0.4, 1, 0)

In [None]:
# Mean CBIE per populism category for the speech-level data, drawn directly
# at module level.
plot_data = (
    df_sp[["CBIE", "left", "right"]]
    .assign(
        populist=np.where(
            df_sp["left"] == 1,
            "Populist Left",
            np.where(df_sp["right"] == 1, "Populist Right", "Not Populist"),
        ),
    )
    .loc[:, ["populist", "CBIE"]]
    .groupby("populist")
    .mean()
)
# NOTE: `fig` receives the return value of plt.bar, not a Figure object.
fig = plt.bar(plot_data.index, plot_data["CBIE"], width=0.3)

In [None]:
def make_cbi_speech_plot(data):
    """Compare the mean of each speech measure by the binary CBIE flag.

    Rows whose country is "ECB" are excluded. The groups ``cbi_bin == 0`` and
    ``cbi_bin == 1`` are shown as "Dep. CB" and "Indep. CB".

    Args:
        data: DataFrame that, after ``reset_index()``, contains "country",
            "cbi_bin" and the ten measure columns selected below.

    Returns:
        The matplotlib Figure holding one bar chart per measure (5x2 grid).
        Earlier versions returned nothing, unlike the other plot helpers.
    """
    measures = [
        "growth_count",
        "inflation_count",
        "inequality_count",
        "climate_count",
        "num_words",
        "flesch_ease",
        "ari",
        "gunning_fog",
        "sent_subj",
        "sent_pol",
    ]
    frame = data.reset_index()
    # "country" is only needed for the ECB filter. It must not reach the
    # aggregation: it is not one of the measures, has no entry in the label
    # mapping, and taking the mean of a text column fails on recent pandas.
    group_means = (
        frame.loc[~frame["country"].isin(["ECB"]), ["cbi_bin", *measures]]
        .groupby("cbi_bin")
        .agg("mean")
    )
    group_means.index = group_means.index.map({0: "Dep. CB", 1: "Indep. CB"})

    # make_labels() does not exist in this notebook; the label mapping is
    # provided by make_labels_dict().
    titles = make_labels_dict()
    fig, axes = plt.subplots(5, 2, figsize=(6, 12))

    for panel, measure in zip(axes.flatten(), group_means.columns):
        positions = list(range(len(group_means)))
        panel.bar(positions, group_means[measure].to_numpy())
        # Fix the tick positions before labelling them.
        panel.set_xticks(positions)
        panel.set_xticklabels(
            group_means.index,
            rotation=60,
            ha="right",
            rotation_mode="anchor",
        )
        panel.set_title(titles[measure])

    fig.tight_layout()
    return fig

## Regressions

In [None]:
def run_regressions(data):
    """Regress each speech measure on CBIE, populism and their interactions.

    For every dependent variable the OLS model
    ``depvar ~ CBIE*left + CBIE*right`` is fitted with standard errors
    clustered by country. The four topic-count models are rendered as a LaTeX
    table and written to ``regressions_part1.tex`` inside ``path_tables``.

    Args:
        data: DataFrame that, after ``reset_index()``, contains the dependent
            variables listed below plus "CBIE", "left", "right" and "country".

    Returns:
        dict mapping each dependent-variable name to its fitted results
        object, so later cells can build further tables from it.
    """
    # `sm` was used without being imported anywhere in the notebook.
    import statsmodels.api as sm

    depvars = [
        "growth_count",
        "inflation_count",
        "inequality_count",
        "climate_count",
        "num_words",
        "flesch_ease",
        "flesch_grade",
        "ari",
        "gunning_fog",
        "sent_subj",
        "sent_pol",
    ]

    # Keep the rows that are complete in every model variable, so the cluster
    # labels line up one-to-one with the rows used in each fit. The earlier
    # code discarded the dropna() result and then read an undefined
    # `regression_df`.
    regression_df = data.reset_index().dropna(
        subset=[*depvars, "CBIE", "left", "right", "country"],
    )
    # Integer cluster codes, one code per country.
    country_codes = pd.factorize(regression_df["country"])[0]

    # A dict replaces the exec() loop: names assigned through exec() inside a
    # function never become local variables, and the doubled braces sat in a
    # plain (non-f) string, so cov_kwds was a set containing a dict.
    models = {
        depvar: sm.OLS.from_formula(
            f"{depvar} ~ CBIE*left + CBIE*right",
            data=regression_df,
        ).fit(cov_type="cluster", cov_kwds={"groups": country_codes})
        for depvar in depvars
    }

    stargazer = Stargazer(
        [
            models["growth_count"],
            models["inflation_count"],
            models["climate_count"],
            models["inequality_count"],
        ],
    )

    stargazer.title("Political Determinants of CB Topics")
    stargazer.covariate_order(
        ["CBIE", "left", "CBIE:left", "right", "CBIE:right", "Intercept"],
    )
    stargazer.dependent_variable_name("Dependent variables:")
    # One column label per model.
    stargazer.custom_columns(
        ["Growth Count", "Inflation Count", "Climate Count", "Inequality Count"],
        [1, 1, 1, 1],
    )
    # The undefined name `latex` is replaced by the rendered table.
    with open(os.path.join(path_tables, "regressions_part1.tex"), "w") as f:
        f.write(stargazer.render_latex())

    return models

In [None]:
# Regression table for the four topic-count outcomes.
# NOTE(review): the model_* names used here are not assigned at module level
# by any cell shown in this notebook — confirm where they are meant to come
# from before relying on this cell.
stargazer = Stargazer(
    [
        model_growth_count,
        model_inflation_count,
        model_climate_count,
        model_inequality_count,
    ],
)

stargazer.title("Political Determinants of CB Topics")
stargazer.covariate_order(
    ["CBIE", "left", "CBIE:left", "right", "CBIE:right", "Intercept"],
)
stargazer.dependent_variable_name("Dependent variables:")
# Every label spans an equal share of the models (one model each here).
stargazer.custom_columns(
    ["Growth Count", "Inflation Count", "Climate Count", "Inequality Count"],
    [len(stargazer.models) // 4] * 4,
)
stargazer

In [None]:
# Display labels from the fifth entry onwards (everything after the four
# "... References" topic labels).
clean_labels[4:]

In [None]:
# Column names from the fifth entry onwards (list() over the dict yields its
# keys, not the display labels).
list(make_labels_dict())[4:]

In [None]:
# Regression table for the six communication-style outcomes.
# NOTE(review): the model_* names used here are not assigned at module level
# by any cell shown in this notebook — confirm where they are meant to come
# from before relying on this cell.
stargazer = Stargazer(
    [
        model_num_words,
        model_flesch_ease,
        model_ari,
        model_gunning_fog,
        model_sent_subj,
        model_sent_pol,
    ],
)

stargazer.title("Political Determinants of CB Communication Style")
stargazer.covariate_order(
    ["CBIE", "left", "CBIE:left", "right", "CBIE:right", "Intercept"],
)
stargazer.dependent_variable_name("Dependent variables:")
# Column headers are the display labels from the fifth entry onwards; every
# label spans an equal share of the models (one model each here).
stargazer.custom_columns(
    clean_labels[4:],
    [len(stargazer.models) // 6] * 6,
)
stargazer

## Topic Modelling

In [None]:
from gensim import corpora

# One entry per speech, taken from the "lemma_sep" column.
texts = df_sp["lemma_sep"].to_list()
# Create the dictionary from the speeches.
id2word = corpora.Dictionary(texts)
# Create the corpus: term-document frequencies (bag of words) per speech.
corpus = [id2word.doc2bow(doc) for doc in texts]

In [None]:
from pprint import pprint

import gensim

# Number of topics to estimate.
num_topics = 4
# Build the LDA model. random_state seeds the model's random number
# generator; without it every run starts from a different random
# initialisation, so the topics shown below changed from run to run.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=1,
)
# Print the keywords of each of the num_topics topics.
pprint(lda_model.print_topics())
# Topic distribution of every document in the corpus.
doc_lda = lda_model[corpus]

In [None]:
import matplotlib.colors as mcolors
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud

# One colour per topic; more colors: 'mcolors.XKCD_COLORS'
cols = list(mcolors.TABLEAU_COLORS.values())

stop_words = stopwords.words("english")
cloud = WordCloud(
    stopwords=stop_words,
    background_color="white",
    width=2500,
    height=1800,
    max_words=10,
    colormap="tab10",
    # Reads the loop variable `i` at call time, so each topic is drawn in its
    # own colour while the loop below is running.
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Make this panel the current axes, then draw the topic's word cloud on it.
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title(f"Topic {i}", fontdict={"size": 16})
    ax.axis("off")


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis("off")
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim

# Prepare the interactive topic-model visualisation and display it as the
# cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
# TODO: save this to html