In [None]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import json

In [None]:
# Load the topic-prediction dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('fwb-australia-topic-prediction.csv')

In [None]:
# Inspect the raw value in the row labelled 0 (the first row under read_csv's default index).
df.preprocessed_list_with_bi_tri[0]

In [None]:
# Each cell of this column holds a list of words written out as a string;
# pull every single-quoted \w+ token back out into a real Python list.
# Rows that are already lists are passed through unchanged, so this cell can
# be re-run without restarting the kernel (re.findall would otherwise raise
# a TypeError when handed a list).
df.preprocessed_list_with_bi_tri = df.preprocessed_list_with_bi_tri.apply(
    lambda x: x if isinstance(x, list) else re.findall(r"'(\w+)'", x)
)

In [None]:
# Display the frame to check the result of the parsing step.
df

In [None]:
# Convert the timestamp column to datetime; unit='ms' tells pandas to read
# the raw values as epoch timestamps in milliseconds.
df['publication_date'] = pd.to_datetime(df['publication_date'], unit='ms')

In [None]:
# Derive the calendar year that get_filtered_data_on_year filters on.
df['year'] = df.publication_date.dt.year

# Convert the word frequencies to JSON

## For 2022

In [None]:
def get_filtered_data_on_year(year, data=None, top_n=60):
    """Return the most frequent words and the rows for a two-year window.

    The window covers ``year`` and ``year - 1``.

    Parameters
    ----------
    year : int
        Last year of the window.
    data : pandas.DataFrame, optional
        Frame with a ``year`` column and a ``preprocessed_list_with_bi_tri``
        column whose cells are already parsed into lists of tokens.
        Defaults to the notebook-level ``df``.
    top_n : int or None, optional
        Number of most frequent words to keep (default 60). ``None`` keeps
        every word.

    Returns
    -------
    frequencies_df_year : pandas.DataFrame
        Indexed by word, with a single integer ``Frequency`` column sorted in
        descending order. Words with equal counts keep first-seen order.
        Empty (but still with a ``Frequency`` column) when the window has no
        rows.
    df_year : pandas.DataFrame
        Independent copy of the rows inside the window.
    """
    source = df if data is None else data

    # Copy the selection so that adding columns to it later neither warns
    # (SettingWithCopyWarning) nor depends on the state of the source frame.
    in_window = source['year'].isin([year, year - 1])
    df_year = source[in_window].copy()

    # Count tokens directly from the lists; unlike np.concatenate this also
    # works when no rows fall inside the window.
    word_frequencies_year = Counter(
        word
        for words in df_year['preprocessed_list_with_bi_tri']
        for word in words
    )

    # most_common sorts by descending count and truncates in one step.
    top_words = dict(word_frequencies_year.most_common(top_n))
    frequencies_df_year = pd.Series(top_words, dtype='int64').to_frame(name='Frequency')
    return frequencies_df_year, df_year

In [None]:
def get_json_data(frequencies_df):
    """Serialize a word-frequency frame to a JSON array string.

    Parameters
    ----------
    frequencies_df : pandas.DataFrame
        Indexed by word; the first column holds the count for that word.

    Returns
    -------
    str
        JSON text of the form
        ``[{"word": ..., "frequency": ...}, ...]`` in the frame's row order.
        Non-ASCII characters are written as-is rather than escaped.
    """
    # .iloc[0] reads the first column by position; plain row[0] relies on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    # int() turns the numpy integer into something json can serialize.
    records = [
        {"word": word, "frequency": int(row.iloc[0])}
        for word, row in frequencies_df.iterrows()
    ]
    return json.dumps(records, ensure_ascii=False)


In [None]:
# Word frequencies and matching rows for the 2022 window (the helper also includes 2021).
frequencies_df_2022, df_2022 = get_filtered_data_on_year(year=2022)

In [None]:
# Show the 2022 word frequencies as a JSON string (the cell output).
get_json_data(frequencies_df_2022)

## For 2019

In [None]:
# Word frequencies and matching rows for the 2019 window (the helper also includes 2018).
frequencies_df_2019, df_2019 = get_filtered_data_on_year(year=2019)

In [None]:
# Show the 2019 word frequencies as a JSON string (the cell output).
get_json_data(frequencies_df_2019)

# For the bar chart

## For 2022

In [None]:
def get_topic_names(year):
    """Add a ``renamed_topics`` column to the 2022 or 2019 subset.

    Reads the 'in' sheet of the topic-names workbook, builds a lookup from
    'topic <Topic Number>' to the research team's topic name, and maps the
    subset's ``corrected_topics`` column through that lookup.

    Parameters
    ----------
    year : int
        2022 selects the notebook-level ``df_2022``; 2019 selects ``df_2019``.

    Returns
    -------
    pandas.DataFrame or None
        The selected notebook-level frame, modified in place. Any other year
        returns None and leaves both frames untouched.
    """
    name_column = 'Topic Name (created by research team)'
    topic_table = pd.read_excel(
        '../../../../australia/Financial_wellbeing_topic_names.xlsx',
        sheet_name='in',
    )

    # Prefix the numbers so the lookup keys read 'topic <n>'; presumably this
    # is the form stored in corrected_topics -- TODO confirm against the data.
    topic_table['Topic Number'] = topic_table['Topic Number'].apply(
        lambda number: 'topic ' + str(number)
    )
    topics_renamed = dict(
        zip(topic_table['Topic Number'].values, topic_table[name_column].values)
    )

    # Pick the frame lazily so only the requested global has to exist.
    if year == 2022:
        target = df_2022
    elif year == 2019:
        target = df_2019
    else:
        return None

    target['renamed_topics'] = target.corrected_topics.map(topics_renamed)
    return target
    


In [None]:
def get_json_data_bar(df_value_counts):
    """Format value counts as date/value/label entries.

    Each entry gets a placeholder date: the first is 2021-03-01 and every
    following entry is one month later, rolling into the next year after
    December so that every date stays valid.

    Parameters
    ----------
    df_value_counts : pandas.Series
        Counts indexed by label, e.g. the result of ``value_counts()``.

    Returns
    -------
    list of set of str
        One single-element set per entry, in the Series' order, holding the
        text ``date: new Date("YYYY-MM-01"), value: <count>, label: "<label>"``.
        Labels are inserted verbatim; quotes inside a label are not escaped.
    """
    dates = []
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for position, (label, count) in enumerate(df_value_counts.items()):
        # position 0 is March (zero-based month index 2); divmod carries any
        # overflow past December into the year.
        years_ahead, month_index = divmod(position + 2, 12)
        stamp = f'{2021 + years_ahead}-{month_index + 1:02d}-01'
        dates.append({f'date: new Date("{stamp}"), value: {count}, label: "{label}"'})
    return dates

In [None]:
# Add the renamed_topics column to the 2022 subset.
df_2022 = get_topic_names(2022)

In [None]:
# Count rows per renamed topic; value_counts returns them in descending order of count.
df_2022_topic_counts = df_2022.renamed_topics.value_counts()

In [None]:
# Display the 2022 topic counts.
df_2022_topic_counts

## For 2019

In [None]:
# Add the renamed_topics column to the 2019 subset.
df_2019 = get_topic_names(2019)

In [None]:
# Count rows per renamed topic; value_counts returns them in descending order of count.
df_2019_topic_counts = df_2019.renamed_topics.value_counts()

In [None]:
# Display the 2019 topic counts.
df_2019_topic_counts

In [None]:
# Build the date/value/label entries for the 2019 topic counts.
get_json_data_bar(df_2019_topic_counts)