In [1]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import json

In [2]:
df = pd.read_csv('fwb-australia-topic-prediction.csv')

In [3]:
df.preprocessed_list_with_bi_tri[0]

"['form' 'compani' 'famili' 'much' 'land' 'hand' 'farm' 'children' 'part'\n 'succession_plan' 'final' 'decis' 'inevit' 'differ' 'everi' 'famili'\n 'work' 'issu' 'number' 'children' 'number' 'farm' 'children' 'amount'\n 'farm' 'debt' 'provis' 'made' 'parent' 'retir' 'coupl' 'farm' 'southern'\n 'success' 'plan' 'give' 'farm' 'son' 'run' 'properti' 'still' 'retain'\n 'control' 'land' 'start' 'think' 'plan' 'earli' 'prompt' 'main' 'account'\n 'suggest' 'could' 'form' 'compani' 'reduc' 'taxat' 'payment' 'would'\n 'turn' 'give' 'abil' 'fund' 'farm' 'superannu' 'seek' 'profession'\n 'advic' 'famili' 'compani' 'husband' 'wife' 'director' 'famili' 'trust'\n 'also' 'land' 'bought' 'either' 'compani' 'trust' 'name' 'requir' 'work'\n 'presumpt' 'least' 'son' 'would' 'return' 'home' 'sought' 'advic' 'earli'\n 'plan' 'allow' 'purchas' 'land' 'right' 'entiti' 'expand' 'busi' 'parent'\n 'retain' 'control' 'busi' 'time' 'offici' 'hand' 'son' 'like' 'will'\n 'happen' 'land' 'own' 'trust' 'compani' 'land

In [4]:
# Convert the list of words within a string 
# to a list of words
df.preprocessed_list_with_bi_tri = df.preprocessed_list_with_bi_tri.apply(lambda x: re.findall(r"'(\w+)'", x))

In [5]:
df

Unnamed: 0,text,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,corrected_topics
0,"Forming a company key for one family HOW, wh...",AUSNZ AUSTR,1120089600000,West Australian Newspapers Limited,"['forming', 'company', 'family', 'much', 'land...","[form, compani, famili, much, land, hand, farm...",320,Topic 20,topic 20
1,"There are many different types of homebuyers, ...",AUSNZ AUSTR,1493424000000,Fairfax Media Management Pty Limited,"['many', 'different', 'types', 'homebuyers', '...","[mani, differ, type, homebuy, understand, help...",68,Topic 40,topic 40
2,Off-the-plan contracts review The NSW Governme...,AUSNZ AUSTR,1517616000000,Fairfax Media Management Pty Limited,"['plan', 'contracts', 'review', 'government', ...","[plan, contract, review, govern, releas, discu...",59,Topic 36,topic 36
3,'Super must rise to 12%': MP Whitlam MP Stephe...,AUSNZ AUSTR,1570579200000,Fairfax Media Management Pty Limited,"['super', 'must', 'rise', 'whitlam', 'stephen'...","[super, must, rise, whitlam, stephen_jon, say,...",130,Topic 28,topic 28
4,Where there's a will TODAY I'd like to talk ab...,AUSNZ AUSTR,1558742400000,Fairfax Media Management Pty Limited,"['today', 'like', 'talk', 'death', 'attention'...","[today, like, talk, death, attent, heaven_sak,...",333,Topic 46,topic 59
...,...,...,...,...,...,...,...,...,...
119942,Super choice brings costs ONE in two small b...,AUSNZ AUSTR,1130889600000,Nationwide News Pty Ltd.,"['super', 'choice', 'brings', 'costs', 'small'...","[super, choic, bring, cost, small, busi, owner...",42,Topic 47,topic 45
119943,"Home loans guide NO doc, low doc, combinatio...",AUSNZ AUSTR,1155686400000,Nationwide News Pty Ltd.,"['home', 'loans', 'guide', 'combination', 'spl...","[home, loan, guid, combin, split, honeymoon, s...",82,Topic 50,topic 50
119944,Taxing times for families TAXPAYERS having the...,AUSNZ AUSTR,1216771200000,Nationwide News Pty. Ltd.,"['taxing', 'times', 'families', 'taxpayers', '...","[tax, time, famili, taxpay, return, prepar, aw...",112,Topic 36,topic 36
119945,Legal papers vital Legal papers vital W...,AUSNZ AUSTR,1188950400000,Nationwide News Pty Ltd.,"['legal', 'papers', 'vital', 'legal', 'papers'...","[legal, paper, vital, legal, paper, vital, cen...",158,Topic 43,topic 40


In [6]:
# Convert the timestamp column to datetime
df['publication_date'] = pd.to_datetime(df['publication_date'], unit='ms')

In [7]:
df['year'] = df.publication_date.dt.year

# Convert it to a json format

## For 2022

In [8]:
def get_filtered_data_on_year(year, since=False):
    if since:
        df_year = df[df.year >= year]
    else:
        df_year = df[(df.year == year) | (df.year == year - 1)]
    list_of_year_words = np.concatenate(df_year.preprocessed_list_with_bi_tri.values).tolist()
    # Calculate the frequencies of each word
    word_frequencies_year = Counter(list_of_year_words)
    # Convert the frequencies to a dataframe
    frequencies_df_year = pd.DataFrame.from_dict(word_frequencies_year, orient='index', columns=['Frequency'])
    # Sort the dataframe by frequency in descending order
    frequencies_df_year = frequencies_df_year.sort_values('Frequency', ascending=False)
    frequencies_df_year = frequencies_df_year.iloc[0:60]
    return frequencies_df_year, df_year

In [9]:
def get_json_data(frequencies_df):
    json_data_year = []
    for i,r in frequencies_df.iterrows():
        json_data = {"word": '', "frequency": 0}
        json_data.update({"word": i, "frequency": int(r[0])})
        json_data_year.append(json_data)
    # Serialize the JSON data
    json_string = json.dumps(json_data_year, ensure_ascii=False)

    # Print the JSON string
    return json_string


In [10]:
frequencies_df_2022, df_2022 = get_filtered_data_on_year(year=2022)

In [11]:
get_json_data(frequencies_df_2022)

'[{"word": "year", "frequency": 34715}, {"word": "cent", "frequency": 29763}, {"word": "super", "frequency": 28387}, {"word": "fund", "frequency": 28095}, {"word": "said", "frequency": 24910}, {"word": "invest", "frequency": 23789}, {"word": "servic", "frequency": 21934}, {"word": "rate", "frequency": 20250}, {"word": "home", "frequency": 20002}, {"word": "financi", "frequency": 19148}, {"word": "would", "frequency": 19125}, {"word": "retir", "frequency": 17647}, {"word": "say", "frequency": 16551}, {"word": "superannu", "frequency": 14572}, {"word": "time", "frequency": 14083}, {"word": "pension", "frequency": 14002}, {"word": "market", "frequency": 13932}, {"word": "incom", "frequency": 13922}, {"word": "price", "frequency": 13645}, {"word": "properti", "frequency": 13529}, {"word": "make", "frequency": 13399}, {"word": "also", "frequency": 13192}, {"word": "peopl", "frequency": 13112}, {"word": "govern", "frequency": 12673}, {"word": "australian", "frequency": 12400}, {"word": "need

## For 2019

In [12]:
frequencies_df_2019, df_2019 = get_filtered_data_on_year(year=2019)

In [13]:
get_json_data(frequencies_df_2019)

'[{"word": "year", "frequency": 30123}, {"word": "said", "frequency": 28437}, {"word": "cent", "frequency": 21770}, {"word": "home", "frequency": 21706}, {"word": "fund", "frequency": 19869}, {"word": "servic", "frequency": 19409}, {"word": "financi", "frequency": 17548}, {"word": "would", "frequency": 16770}, {"word": "invest", "frequency": 16461}, {"word": "bank", "frequency": 15796}, {"word": "retir", "frequency": 15709}, {"word": "super", "frequency": 15626}, {"word": "properti", "frequency": 14093}, {"word": "rate", "frequency": 13773}, {"word": "peopl", "frequency": 13347}, {"word": "pension", "frequency": 13054}, {"word": "time", "frequency": 12927}, {"word": "make", "frequency": 12406}, {"word": "loan", "frequency": 12205}, {"word": "also", "frequency": 11758}, {"word": "incom", "frequency": 11502}, {"word": "market", "frequency": 11302}, {"word": "need", "frequency": 11253}, {"word": "money", "frequency": 11155}, {"word": "australian", "frequency": 10745}, {"word": "price", "f

## Since 2007 (to 2022)

In [14]:
frequencies_df_2007, df_2007 = get_filtered_data_on_year(year=2007, since=True)

In [17]:
df_2007

Unnamed: 0,text,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,corrected_topics,year
1,"There are many different types of homebuyers, ...",AUSNZ AUSTR,2017-04-29,Fairfax Media Management Pty Limited,"['many', 'different', 'types', 'homebuyers', '...","[mani, differ, type, homebuy, understand, help...",68,Topic 40,topic 40,2017
2,Off-the-plan contracts review The NSW Governme...,AUSNZ AUSTR,2018-02-03,Fairfax Media Management Pty Limited,"['plan', 'contracts', 'review', 'government', ...","[plan, contract, review, govern, releas, discu...",59,Topic 36,topic 36,2018
3,'Super must rise to 12%': MP Whitlam MP Stephe...,AUSNZ AUSTR,2019-10-09,Fairfax Media Management Pty Limited,"['super', 'must', 'rise', 'whitlam', 'stephen'...","[super, must, rise, whitlam, stephen_jon, say,...",130,Topic 28,topic 28,2019
4,Where there's a will TODAY I'd like to talk ab...,AUSNZ AUSTR,2019-05-25,Fairfax Media Management Pty Limited,"['today', 'like', 'talk', 'death', 'attention'...","[today, like, talk, death, attent, heaven_sak,...",333,Topic 46,topic 59,2019
5,"Smaller, more affordable housing centre of $16...",AUSNZ AUSTR,2021-10-08,Fairfax Media Management Pty Limited,"['smaller', 'affordable', 'housing', 'centre',...","[smaller, afford, hous, centr, albion_park, pr...",316,Topic 40,topic 40,2021
...,...,...,...,...,...,...,...,...,...,...
119940,RIDLEYTON 13A MONMOUTH ST This three-bedroom c...,AUSNZ AUSTR,2017-05-17,News Ltd.,"['ridleyton', 'monmouth', 'three', 'bedroom', ...","[ridleyton, monmouth, three, bedroom, courtyar...",82,Topic 40,topic 40,2017
119941,How to save when selling WHEN IT COMES TO THE ...,AUSNZ AUSTR,2019-03-14,News Ltd.,"['save', 'selling', 'comes', 'costs', 'associa...","[save, sell, come, cost, associ, sell, home, a...",248,Topic 14,topic 14,2019
119944,Taxing times for families TAXPAYERS having the...,AUSNZ AUSTR,2008-07-23,Nationwide News Pty. Ltd.,"['taxing', 'times', 'families', 'taxpayers', '...","[tax, time, famili, taxpay, return, prepar, aw...",112,Topic 36,topic 36,2008
119945,Legal papers vital Legal papers vital W...,AUSNZ AUSTR,2007-09-05,Nationwide News Pty Ltd.,"['legal', 'papers', 'vital', 'legal', 'papers'...","[legal, paper, vital, legal, paper, vital, cen...",158,Topic 43,topic 40,2007


## Since 2016 (to 2022)

In [18]:
frequencies_df_2016, df_2016 = get_filtered_data_on_year(year=2016, since=True)

In [20]:
df_2016

Unnamed: 0,text,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,corrected_topics,year
1,"There are many different types of homebuyers, ...",AUSNZ AUSTR,2017-04-29 00:00:00,Fairfax Media Management Pty Limited,"['many', 'different', 'types', 'homebuyers', '...","[mani, differ, type, homebuy, understand, help...",68,Topic 40,topic 40,2017
2,Off-the-plan contracts review The NSW Governme...,AUSNZ AUSTR,2018-02-03 00:00:00,Fairfax Media Management Pty Limited,"['plan', 'contracts', 'review', 'government', ...","[plan, contract, review, govern, releas, discu...",59,Topic 36,topic 36,2018
3,'Super must rise to 12%': MP Whitlam MP Stephe...,AUSNZ AUSTR,2019-10-09 00:00:00,Fairfax Media Management Pty Limited,"['super', 'must', 'rise', 'whitlam', 'stephen'...","[super, must, rise, whitlam, stephen_jon, say,...",130,Topic 28,topic 28,2019
4,Where there's a will TODAY I'd like to talk ab...,AUSNZ AUSTR,2019-05-25 00:00:00,Fairfax Media Management Pty Limited,"['today', 'like', 'talk', 'death', 'attention'...","[today, like, talk, death, attent, heaven_sak,...",333,Topic 46,topic 59,2019
5,"Smaller, more affordable housing centre of $16...",AUSNZ AUSTR,2021-10-08 00:00:00,Fairfax Media Management Pty Limited,"['smaller', 'affordable', 'housing', 'centre',...","[smaller, afford, hous, centr, albion_park, pr...",316,Topic 40,topic 40,2021
...,...,...,...,...,...,...,...,...,...,...
119935,'Why punish them?': Call to allow retirees to ...,AUSNZ AUSTR,2020-02-18 22:21:30,Fairfax Media Management Pty Limited,"['punish', 'call', 'allow', 'retirees', 'work'...","[punish, call, allow, retire, work, without, p...",309,Topic 51,topic 47,2020
119936,6 things I've learnt since becoming a newbie s...,AUSNZ AUSTR,2021-07-24 21:23:02,Fairfax Media Management Pty Limited,"['things', 'learnt', 'since', 'becoming', 'new...","[thing, learnt, sinc, becom, newbi, sharemarke...",393,Topic 54,topic 54,2021
119937,High cost of electric vehicles make Australian...,AUSNZ AUSTR,2021-10-29 23:44:58,Fairfax Media Management Pty Limited,"['high', 'cost', 'electric', 'vehicles', 'make...","[high, cost, electric_vehicl, make, australian...",296,Topic 56,topic 56,2021
119940,RIDLEYTON 13A MONMOUTH ST This three-bedroom c...,AUSNZ AUSTR,2017-05-17 00:00:00,News Ltd.,"['ridleyton', 'monmouth', 'three', 'bedroom', ...","[ridleyton, monmouth, three, bedroom, courtyar...",82,Topic 40,topic 40,2017


# For the bar chart

## For 2022

In [21]:
def get_topic_names(year):
    topic_names_inc_exc = pd.read_excel('../../../../australia/Financial_wellbeing_topic_names.xlsx', sheet_name='in')
    # Retain the appropriate topics only
    imp_topics_df = topic_names_inc_exc
    imp_topics_df['Topic Number'] = imp_topics_df['Topic Number'].apply(lambda x: 'topic '+str(x))
    topics_renamed = dict(zip(imp_topics_df['Topic Number'].values, imp_topics_df['Topic Name (created by research team)'].values))
    if year == 2022:
        df_2022['renamed_topics'] = df_2022.corrected_topics.map(topics_renamed)
        return df_2022
    elif year == 2019:
        df_2019['renamed_topics'] = df_2019.corrected_topics.map(topics_renamed)
        return df_2019
    elif year == 2007:
        df_2007['renamed_topics'] = df_2007.corrected_topics.map(topics_renamed)
        return df_2007
    elif year == 2016:
        df_2016['renamed_topics'] = df_2016.corrected_topics.map(topics_renamed)
        return df_2016
    


In [22]:
def get_json_data_bar(df_value_counts):
    dates = []
    for num, (i, r) in enumerate(df_value_counts.iteritems()):
        dates.append({'date: ' + f'new Date("2021-0{num+3}-01"), value: {r}, label: "{i}"'})
    return dates

In [None]:
df_2022 = get_topic_names(2022)

In [None]:
df_2022_topic_counts = df_2022.renamed_topics.value_counts()

In [None]:
df_2022_topic_counts

## For 2019

In [None]:
df_2019 = get_topic_names(2019)

In [None]:
df_2019_topic_counts = df_2019.renamed_topics.value_counts()

In [None]:
df_2019_topic_counts

In [None]:
get_json_data_bar(df_2019_topic_counts)

## Since 2007 (to 2016)

In [23]:
df_2007 = get_topic_names(2007)
df_2007

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,corrected_topics,year,renamed_topics
1,"There are many different types of homebuyers, ...",AUSNZ AUSTR,2017-04-29,Fairfax Media Management Pty Limited,"['many', 'different', 'types', 'homebuyers', '...","[mani, differ, type, homebuy, understand, help...",68,Topic 40,topic 40,2017,Criminal law
2,Off-the-plan contracts review The NSW Governme...,AUSNZ AUSTR,2018-02-03,Fairfax Media Management Pty Limited,"['plan', 'contracts', 'review', 'government', ...","[plan, contract, review, govern, releas, discu...",59,Topic 36,topic 36,2018,Food and cooking
3,'Super must rise to 12%': MP Whitlam MP Stephe...,AUSNZ AUSTR,2019-10-09,Fairfax Media Management Pty Limited,"['super', 'must', 'rise', 'whitlam', 'stephen'...","[super, must, rise, whitlam, stephen_jon, say,...",130,Topic 28,topic 28,2019,Art markets
4,Where there's a will TODAY I'd like to talk ab...,AUSNZ AUSTR,2019-05-25,Fairfax Media Management Pty Limited,"['today', 'like', 'talk', 'death', 'attention'...","[today, like, talk, death, attent, heaven_sak,...",333,Topic 46,topic 59,2019,Stock warrants (options)
5,"Smaller, more affordable housing centre of $16...",AUSNZ AUSTR,2021-10-08,Fairfax Media Management Pty Limited,"['smaller', 'affordable', 'housing', 'centre',...","[smaller, afford, hous, centr, albion_park, pr...",316,Topic 40,topic 40,2021,Criminal law
...,...,...,...,...,...,...,...,...,...,...,...
119940,RIDLEYTON 13A MONMOUTH ST This three-bedroom c...,AUSNZ AUSTR,2017-05-17,News Ltd.,"['ridleyton', 'monmouth', 'three', 'bedroom', ...","[ridleyton, monmouth, three, bedroom, courtyar...",82,Topic 40,topic 40,2017,Criminal law
119941,How to save when selling WHEN IT COMES TO THE ...,AUSNZ AUSTR,2019-03-14,News Ltd.,"['save', 'selling', 'comes', 'costs', 'associa...","[save, sell, come, cost, associ, sell, home, a...",248,Topic 14,topic 14,2019,Australian stock market
119944,Taxing times for families TAXPAYERS having the...,AUSNZ AUSTR,2008-07-23,Nationwide News Pty. Ltd.,"['taxing', 'times', 'families', 'taxpayers', '...","[tax, time, famili, taxpay, return, prepar, aw...",112,Topic 36,topic 36,2008,Food and cooking
119945,Legal papers vital Legal papers vital W...,AUSNZ AUSTR,2007-09-05,Nationwide News Pty Ltd.,"['legal', 'papers', 'vital', 'legal', 'papers'...","[legal, paper, vital, legal, paper, vital, cen...",158,Topic 43,topic 40,2007,Criminal law


## Since 2016 (to 2022)

In [24]:
df_2016 = get_topic_names(2016)
df_2016

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0,text,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,corrected_topics,year,renamed_topics
1,"There are many different types of homebuyers, ...",AUSNZ AUSTR,2017-04-29 00:00:00,Fairfax Media Management Pty Limited,"['many', 'different', 'types', 'homebuyers', '...","[mani, differ, type, homebuy, understand, help...",68,Topic 40,topic 40,2017,Criminal law
2,Off-the-plan contracts review The NSW Governme...,AUSNZ AUSTR,2018-02-03 00:00:00,Fairfax Media Management Pty Limited,"['plan', 'contracts', 'review', 'government', ...","[plan, contract, review, govern, releas, discu...",59,Topic 36,topic 36,2018,Food and cooking
3,'Super must rise to 12%': MP Whitlam MP Stephe...,AUSNZ AUSTR,2019-10-09 00:00:00,Fairfax Media Management Pty Limited,"['super', 'must', 'rise', 'whitlam', 'stephen'...","[super, must, rise, whitlam, stephen_jon, say,...",130,Topic 28,topic 28,2019,Art markets
4,Where there's a will TODAY I'd like to talk ab...,AUSNZ AUSTR,2019-05-25 00:00:00,Fairfax Media Management Pty Limited,"['today', 'like', 'talk', 'death', 'attention'...","[today, like, talk, death, attent, heaven_sak,...",333,Topic 46,topic 59,2019,Stock warrants (options)
5,"Smaller, more affordable housing centre of $16...",AUSNZ AUSTR,2021-10-08 00:00:00,Fairfax Media Management Pty Limited,"['smaller', 'affordable', 'housing', 'centre',...","[smaller, afford, hous, centr, albion_park, pr...",316,Topic 40,topic 40,2021,Criminal law
...,...,...,...,...,...,...,...,...,...,...,...
119935,'Why punish them?': Call to allow retirees to ...,AUSNZ AUSTR,2020-02-18 22:21:30,Fairfax Media Management Pty Limited,"['punish', 'call', 'allow', 'retirees', 'work'...","[punish, call, allow, retire, work, without, p...",309,Topic 51,topic 47,2020,Life insurance
119936,6 things I've learnt since becoming a newbie s...,AUSNZ AUSTR,2021-07-24 21:23:02,Fairfax Media Management Pty Limited,"['things', 'learnt', 'since', 'becoming', 'new...","[thing, learnt, sinc, becom, newbi, sharemarke...",393,Topic 54,topic 54,2021,Food and nutrition
119937,High cost of electric vehicles make Australian...,AUSNZ AUSTR,2021-10-29 23:44:58,Fairfax Media Management Pty Limited,"['high', 'cost', 'electric', 'vehicles', 'make...","[high, cost, electric_vehicl, make, australian...",296,Topic 56,topic 56,2021,Billionaires
119940,RIDLEYTON 13A MONMOUTH ST This three-bedroom c...,AUSNZ AUSTR,2017-05-17 00:00:00,News Ltd.,"['ridleyton', 'monmouth', 'three', 'bedroom', ...","[ridleyton, monmouth, three, bedroom, courtyar...",82,Topic 40,topic 40,2017,Criminal law


In [25]:
df_2007.to_csv('since-2007.csv')

In [26]:
df_2016.to_csv('since-2016.csv')