In [None]:
%load_ext lab_black

In [None]:
# Loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from dateutil import parser
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
import matplotlib.pyplot as plt
import random
import unidecode
import emoji
import json
import gc
import os
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
%matplotlib inline

# Load spaCy's "it_core_news_lg" Italian pipeline. In the visible part of this
# notebook it is only used as the source of the default stop-word set
# (sp.Defaults.stop_words).
sp = spacy.load("it_core_news_lg")

In [None]:
# Dataset selection: every (page type, social network, country) combination is
# joined with "_" and read from "<DATA_PATH><type>_<social>_<country>.csv".
TYPE_PAGE_LIST = ["politicians"]
TYPE_SOCIAL_LIST = ["facebook"]  # ["facebook", "instagram"]
COUNTRY_PAGE_LIST = ["italy"]

# Prefix concatenated directly with the CSV file name, so the trailing slash matters.
DATA_PATH = "Data/"

# Presumably per-network column dtype maps (FB / IG) -- TODO confirm. Both are
# empty and not referenced in the visible part of this notebook.
COLUMNS_TYPES_FB = {}
COLUMNS_TYPES_IG = {}

In [None]:
# Politician page name -> party page name, both matched against the "Page Name"
# column of the posts data.
# An empty string means no party page is paired with the politician: the
# per-politician filtering loop then selects only the politician's own page and
# builds the output key from the politician's name alone.
politicians_party_map = {
    "Matteo Salvini": "Lega - Salvini Premier",
    "Luigi Di Maio": "MoVimento 5 Stelle",
    "Silvio Berlusconi": "Forza Italia",
    "Nicola Zingaretti": "Partito Democratico",
    "Emma Bonino": "Più Europa",
    "Giorgia Meloni": "Fratelli d'Italia",
    "Matteo Renzi": "Italia Viva",
    "Nicola Fratoianni": "",
    "Giuseppe Conte": "",
}

## Read Data

In [None]:
# Month abbreviation -> calendar month number, split by year. Together they
# cover the analysed window: December 2019 through August 2020.
MONTHS_2019 = {"dec": 12}
MONTHS_2020 = dict(
    zip(("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug"), range(1, 9))
)
# Dataset name -> full DataFrame as read from disk.
df_map = {}
# Month abbreviation -> {dataset name -> DataFrame restricted to that month}.
# Keys follow the chronological order of the two month maps above.
df_months_map = {month: {} for month in (*MONTHS_2019, *MONTHS_2020)}

In [None]:
# Read one CSV per (page type, social network, country) combination and store
# it in df_map under the key "<type>_<social>_<country>".
for type_page in TYPE_PAGE_LIST:
    for type_social in TYPE_SOCIAL_LIST:
        for country_page in COUNTRY_PAGE_LIST:
            name_df = "_".join((type_page, type_social, country_page))
            posts = pd.read_csv(DATA_PATH + name_df + ".csv", header=0)
            # Turn the "Created" strings into datetimes with dateutil's parser.
            posts["Created"] = posts["Created"].apply(parser.parse)
            # The post text lives in "Description" for Instagram and in
            # "Message" otherwise; missing text becomes an empty string.
            text_column = "Description" if type_social == "instagram" else "Message"
            posts[text_column] = posts[text_column].fillna(value="")
            df_map[name_df] = posts

In [None]:
# df_map["politicians_facebook_italy"].head()

### Create a map of DataFrames per month

In [None]:
# Slice every dataset into one independent copy per month of the study window.
december_key = list(MONTHS_2019.keys())[0]
for name_df, df in df_map.items():
    created = df["Created"]

    # December 2019 has fixed bounds.
    in_december = (created >= "2019-12-01") & (created < "2020-01-01")
    df_months_map[december_key][name_df] = df[in_december].copy()

    # 2020 months: half-open interval [first day of month, first day of next month).
    # The "+ 1" upper bound is only valid because MONTHS_2020 stops before December.
    for month, month_number in MONTHS_2020.items():
        lower_bound = "2020-{}-01".format(month_number)
        upper_bound = "2020-{}-01".format(month_number + 1)
        in_month = (created >= lower_bound) & (created < upper_bound)
        df_months_map[month][name_df] = df[in_month].copy()

In [None]:
# The per-month frames stored in df_months_map were created with .copy(), so the
# full frames are no longer needed: drop the name and ask the collector to run.
del df_map
gc.collect()

### Creating Text Blocks

In [None]:
# Stop-word set used by clean_text() and the count vectorizer: spaCy's Italian
# defaults plus the project's own list.
# A copy is taken so the in-place union below does not mutate the shared
# `sp.Defaults.stop_words` set that belongs to spaCy.
all_stopwords = set(sp.Defaults.stop_words)
# The file must contain a JSON object with a "stopwords" list. It is read
# explicitly as UTF-8 so accented Italian words do not depend on the platform's
# default encoding.
with open("stopwords_italian.json", encoding="utf-8") as json_file:
    italian_stopwords = json.load(json_file)
all_stopwords |= set(italian_stopwords["stopwords"])

In [None]:
def remove_emoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` no longer exists in emoji 2.0 and later, so the
    supported ``emoji.replace_emoji`` API is used when the installed package
    provides it. The regexp-based path is kept as a fallback for older
    installations.

    Parameters
    ----------
    text : str
        Text that may contain emoji.

    Returns
    -------
    str
        The text without emoji characters.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    return emoji.get_emoji_regexp().sub("", text)

In [None]:
def grey_color_func(
    word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random light-grey HSL colour string.

    The signature matches what a WordCloud ``color_func`` is called with, but
    every argument (including ``random_state``) is ignored: the lightness is
    drawn from the module-level ``random`` generator.

    Returns
    -------
    str
        ``"hsl(0, 0%, L%)"`` with an integer lightness ``L`` between 60 and 100
        inclusive.
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness

In [None]:
# Month -> {page key -> concatenated cleaned text}. defaultdict(str) lets text be
# appended with += without initialising each page key first.
month_text_blocks = {
    month: defaultdict(str)
    for month in ("dec", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug")
}
# Month -> {page key -> DataFrame holding that page's posts}.
df_specific_party_map = {
    month: {}
    for month in ("dec", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug")
}

In [None]:
def clean_text(text):
    """Normalise one post message for the word clouds and the topic model.

    Steps, in order: remove emoji, lower-case and trim, drop hashtags (including
    the "#tag:" / "# tag :" label variants) and legaonline.it links, remove the
    punctuation characters , . ! ? #, delete a few boilerplate substrings,
    collapse whitespace, tokenise with NLTK's ``word_tokenize`` and discard
    every token found in ``all_stopwords``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = remove_emoji(text)
    text = text.lower().strip()

    # Raw string literals: the regex escapes are not valid *string* escapes, and
    # the non-raw originals trigger an invalid-escape warning on recent Python.
    # The compiled patterns are unchanged and are applied in the same order.
    for hashtag_pattern in (r"#\S+:", r"# \S+ :", r"#\S+ :", r"# \S+:", r"#\S+"):
        text = re.sub(hashtag_pattern, "", text)
    text = re.sub(r"legaonline.it\S+", "", text)
    text = re.sub(r"[,\.!?#]", "", text)
    text = re.sub(r"\s+", " ", text)

    # Plain substring replacements, so they also hit longer words that contain
    # the fragment. The former "legaonline.it/iostoconsalvini" replacement was
    # dropped: every "." has already been stripped above, so it could never match.
    for fragment, replacement in (
        ("http", ""),
        ("www", ""),
        ("shortener", ""),
        ("ref", ""),
        ("matteo salvini", "salvini"),
        ("user", ""),
    ):
        text = text.replace(fragment, replacement)
    text = re.sub(r"\s+", " ", text)

    tokens_without_sw = [
        token for token in word_tokenize(text) if token not in all_stopwords
    ]
    return " ".join(tokens_without_sw)

In [None]:
# For every month and dataset, extract the posts of each politician (plus the
# paired party page, when there is one), clean the messages and append them to
# that month's text block.
for month, df_map_month in df_months_map.items():
    for df_name, df_data in df_map_month.items():
        for pol, party in politicians_party_map.items():
            # Pages to select: the politician alone, or politician and party.
            page_names = [pol, party] if len(party) > 0 else [pol]
            # Key: "<politician>" or "<politician>__<party>", lower-cased with
            # " - " and spaces turned into underscores.
            complete_name = "__".join(
                page.lower().replace(" - ", " ").replace(" ", "_")
                for page in page_names
            )

            page_posts = df_data.loc[df_data["Page Name"].isin(page_names)].copy()
            page_posts["Message"] = page_posts["Message"].apply(clean_text)

            df_specific_party_map[month][complete_name] = page_posts
            month_text_blocks[month][complete_name] += " ".join(
                list(page_posts["Message"].values)
            )

In [None]:
gc.collect()

### Generate and save word clouds

In [None]:
# Render one word cloud per (month, page key) and save it as
# Output/lda_wordcloud_by_month/wordclouds/<month>/<page key>.png
for month, map_texts in month_text_blocks.items():
    month_dir = "Output/lda_wordcloud_by_month/wordclouds/" + month
    # makedirs also creates missing parent folders and tolerates an existing one
    # (os.mkdir fails when "Output/..." does not exist yet).
    os.makedirs(month_dir, exist_ok=True)

    for pol_party, text_data in map_texts.items():
        if not text_data.strip():
            # WordCloud.generate raises ValueError when there are no words,
            # which happens for pages without posts in this month.
            print("Skipping word cloud for {} ({}): no text".format(pol_party, month))
            continue

        # Optional WordCloud tuning knobs: max_font_size, max_words,
        # background_color, colormap, min_font_size, margin, random_state.
        wordcloud = WordCloud(width=750, height=500).generate(text_data)

        plt.figure()
        # Replace the "__" politician/party separator first; replacing "_" first
        # would leave nothing for the second replace and produce a double space.
        plt.title(
            pol_party.replace("__", " ").replace("_", " ").capitalize(), fontsize=20
        )
        # For a greyscale variant use:
        # wordcloud.recolor(color_func=grey_color_func, random_state=3)
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.savefig(month_dir + "/" + pol_party + ".png", dpi=300)
        # Close the figure so that figures do not pile up across iterations.
        plt.close("all")

## LDA Analysis

In [None]:
sns.set_style("whitegrid")
# Helper function
def plot_10_most_common_words(count_data, count_vectorizer, month, name):
    """Show a bar chart of the ten most frequent vocabulary terms.

    Parameters
    ----------
    count_data : sparse matrix
        Document-term count matrix, as returned by
        ``count_vectorizer.fit_transform``.
    count_vectorizer : CountVectorizer
        The fitted vectorizer that produced ``count_data``; supplies the terms.
    month : str
        Month key, used in the plot title.
    name : str
        Page key, used in the plot title (underscores become spaces).
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever the installed version offers.
    feature_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()

    # Column sums of the document-term matrix = total occurrences of each term.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    top_terms = sorted(
        zip(words, total_counts), key=lambda pair: pair[1], reverse=True
    )[0:10]
    words = [term for term, _ in top_terms]
    counts = [count for _, count in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15 / 1.6180))
    plt.subplot(
        title="10 most common words - {} - {}".format(
            month.replace("_", " ").capitalize(), name.replace("_", " ").capitalize()
        )
    )
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # x / y must be passed by keyword: recent seaborn releases reject positional data.
    sns.barplot(x=x_pos, y=counts, palette="husl")
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel("words")
    plt.ylabel("counts")
    plt.show()

In [None]:
# Per-month containers for the LDA pipeline, each shaped month -> {page key -> value}:
#   vectorized_data_map  -> document-term matrix
#   count_vectorizer_map -> fitted CountVectorizer
#   lda_map              -> fitted LDA model
# The generator builds a fresh outer dict (with fresh inner dicts) for each name.
vectorized_data_map, count_vectorizer_map, lda_map = (
    {
        month: {}
        for month in ("dec", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug")
    }
    for _ in range(3)
)

### Removing Immigration words to avoid redundancy

In [None]:
# Immigration-related Italian terms that clean_text_immigration() deletes from
# the messages before the count vectorizer and the LDA model see them.
words_immigration = [
    "immigrati",
    "immigrate",
    "immigrato",
    "migranti",
    "migrante",
    "immigrazione",
    "sbarchi",
    "sbarco",
    "clandestini",
    "sbarcati",
    "sbarcato",
    "clandestino",
    "barconi",
]

In [None]:
def clean_text_immigration(text):
    """Delete every occurrence of the ``words_immigration`` terms from ``text``.

    The terms are removed one after another, in list order, as plain substrings
    (no word-boundary check), and nothing is inserted in their place.

    Parameters
    ----------
    text : str
        Message text.

    Returns
    -------
    str
        The text with all listed terms removed.
    """
    stripped = text
    for term in words_immigration:
        stripped = stripped.replace(term, "")
    return stripped

In [None]:
# Remove the immigration vocabulary from the cleaned "Message" column of every
# per-page frame. The frames are updated in place.
for month, dicts in df_specific_party_map.items():
    for name, df in dicts.items():
        df["Message"] = df["Message"].map(clean_text_immigration)

### Initialise the count vectorizer with the Italian stop words - Fit and transform the processed text

In [None]:
# scikit-learn >= 1.2 validates ``stop_words`` and accepts only "english", a list
# or None, so the stop-word set is converted to a list once, outside the loops.
stop_word_list = list(all_stopwords)

# Fit one CountVectorizer per (month, page key) and keep both the document-term
# matrix and the fitted vectorizer for the LDA step.
for month, dicts in df_specific_party_map.items():
    for name, df in dicts.items():
        count_vectorizer = CountVectorizer(stop_words=stop_word_list)
        try:
            doc_term_matrix = count_vectorizer.fit_transform(df["Message"])
        except ValueError as err:
            # fit_transform raises ValueError when no vocabulary can be built
            # (no posts, or only stop words). Skip this page rather than abort
            # the whole run; later cells iterate over vectorized_data_map and
            # therefore never look the skipped key up.
            print("Skipping {} / {}: {}".format(month, name, err))
            continue
        vectorized_data_map[month][name] = doc_term_matrix
        count_vectorizer_map[month][name] = count_vectorizer
        # To inspect the vocabulary, call:
        # plot_10_most_common_words(doc_term_matrix, count_vectorizer, month, name)

### Create and fit the LDA model

In [None]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    For each row of ``model.components_`` this prints a "Topic #<idx>:" header,
    the length of the topic vector (the vocabulary size) and the
    ``n_top_words`` terms with the largest weights, in descending order.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (topics x vocabulary).
    count_vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    n_top_words : int
        Number of words to print per topic.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever the vectorizer offers.
    feature_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(len(topic))
        # argsort is ascending; the reversed slice takes the n_top_words largest.
        print(" ".join([words[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]))


# Tweak the two parameters below
number_topics = 4
number_words = 6
# LDA fitting is stochastic; a fixed seed makes the topics identical across runs.
LDA_RANDOM_STATE = 0

# Fit one LDA model per (month, page key) on its document-term matrix, keep it
# in lda_map and print its top words.
print("Topics found via LDA:")
for month, dicts in vectorized_data_map.items():
    print("∞" * 75)
    print("∞" * 75)
    print("MONTH: {}".format(month))
    print("∞" * 75)
    for name, doc_term_matrix in dicts.items():
        print("NAME: {}".format(name))
        lda = LDA(
            n_components=number_topics, n_jobs=-1, random_state=LDA_RANDOM_STATE
        )
        # Only the fitted model is needed; the document-topic matrix that
        # fit_transform would also return was never used.
        lda.fit(doc_term_matrix)
        lda_map[month][name] = lda
        # Print the topics found by the LDA model
        print_topics(lda, count_vectorizer_map[month][name], number_words)
        print("=" * 75)

In [None]:
%%time
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; import whichever this installation provides.
try:
    from pyLDAvis import lda_model as sklearn_lda
except ImportError:
    from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis
import os

# Export one interactive pyLDAvis HTML page per (month, page key) to
# Output/lda_wordcloud_by_month/lda/<month>/ldavis_<month>_<key>_<topics>_<words>.html
for month, dicts in vectorized_data_map.items():
    month_dir = "Output/lda_wordcloud_by_month/lda/" + month
    # makedirs also creates missing parent folders and tolerates an existing one.
    os.makedirs(month_dir, exist_ok=True)
    for name, df in dicts.items():
        # Note: `df` is the document-term matrix of this page, not a DataFrame.
        file_stem = month_dir + "/ldavis_{}_{}_{}_{}".format(
            month, name, number_topics, number_words
        )
        # Only needed by the optional pickle dump below.
        LDAvis_data_filepath = file_stem + ".pickle"
        LDAvis_prepared = sklearn_lda.prepare(
            lda_map[month][name], df, count_vectorizer_map[month][name]
        )
        # with open(LDAvis_data_filepath, 'wb') as f:
        #     pickle.dump(LDAvis_prepared, f)
        pyLDAvis.save_html(LDAvis_prepared, file_stem + ".html")
    
# load the pre-prepared pyLDAvis data from disk
#with open(LDAvis_data_filepath) as f:
#    LDAvis_prepared = pickle.load(f)