In [1]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora
import gensim
from gensim.models.coherencemodel import CoherenceModel
import re
# from tabulate import tabulate

from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import os 
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import pickle

# import image module
from IPython.display import Image

In [2]:
# List stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# print(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\E116189\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\E116189\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\E116189\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
import random

random.seed(42)
print(random.random())

0.6394267984578837


### Helper functions

In [21]:
##########################################################################################################################
# Get the filtered data for each country
def get_country_filtered_data(df, country_name):
    df['region_of_origin'] = df.region_of_origin.apply(lambda x: str(x).strip())
    print('####################################')
    print('Statistics for the whole dataset')
    print('####################################')
    print(f'Total number of articles in the dataset (for all the countries): \033[1;32m{df.shape[0]:,}\033[0m')
    print(f'Total number of articles missing in the dataset (for all the countries): \033[1;31m{df.body.isna().sum():,}\033[0m')
    print(f'Percentage of missing articles: \033[1;31m{round((df.body.isna().sum()/df.shape[0])*100, 2):,}%\033[0m')
    print('\n\n')
    countries_dict = {'AUSTR': 'AUS', 'USA': 'USA', 'UK': 'UK', 'INDIA': 'IND', 'CAN': 'CAN'}
    df['country'] = df.region_of_origin.apply(lambda x: countries_dict[country_name] if re.search(country_name, x) else 'other')
    df_country = df[df.country == countries_dict[country_name]]
    df_country = df_country[['body', 'region_of_origin', 'publication_date', 'publisher_name']]
    df_country.body.drop_duplicates(inplace=True)

    print('####################################')
    print(f'Statistics for \033[1m{countries_dict[country_name]}\033[0m dataset')
    print('####################################')

    print(f'    Total number of articles retrieved for \033[1;32m{country_name}\033[0m, without any duplicates: \033[1;32m{df_country.shape[0]:,}\033[0m')
    print(f'    Total number of columns retrieved for \033[1;32m{country_name}\033[0m: \033[1;32m{df_country.shape[1]}\033[0m')
    print('-------------------------------------------------------------------------------')

    # Treating the missing articles in 'body' column
    missing_rows = df_country.body.isna().sum()
    print(f'    Total number of missing articles for \033[1;31m{country_name}\033[0m: \033[1;31m{missing_rows:,}\033[0m')
    print(f'    Percentage of missing articles for \033[1;31m{country_name}\033[0m: \033[1;31m{round((missing_rows/df_country.shape[0])*100, 2):,}%\033[0m')
    if missing_rows > 0:
        df_country.dropna(inplace=True)
        print('-------------------------------------------------------------------------------')
        print(f'    Total number of articles after treating the missing articles: \033[1;32m{df_country.shape[0]:,}\033[0m')
    return df_country
##########################################################################################################################


# Preprocessing functions
##########################################################################################################################
# Keeping longer words
def words_length(x):
    if (len(x) > 3 and x.isalpha()):
        return x
    else:
        return '....removeThispart....'
##########################################################################################################################


##########################################################################################################################
stopwords_re = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')

# Removal
def preprocess_data(df):
    df['preprocessed_sentence'] = df.body.apply(lambda x: re.sub(r'(?i)\bhttps?://[^\s/$.?#].[^\s]*\b', '', x))
    print('Done removing web links...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.map(lambda x: re.sub(r"[^a-zA-Z0-9\n\t]|\d+", " ", str(x)).lower())
    print('Done removing numbers, single letter words, new lines, tab spaces...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.apply(lambda x: stopwords_re.sub('', x))
    print('Done removing the stop words...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.apply(lambda x: re.sub(r" +", " ",x))
    print('Done removing extra spaces after removing the links, words, numbers...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.apply(lambda x: np.vectorize(words_length)(np.array(x.split(' '))) )
    print('Done selecting specific word lengths...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.apply(lambda x: re.sub(r"....removeThispart....", " ", " ".join(x)))
    print('Clean up done...')
    df['preprocessed_sentence'] = df.preprocessed_sentence.apply(lambda x: re.sub(r" +", " ", str(x)).strip().split(' '))
    print('Final clean up done...')

    # Output progress
    print(f"\nPreprocessing completed for {len(df)} articles!")

    return df
##########################################################################################################################



##########################################################################################################################
# Snowball stemming
#the stemmer requires a language parameter
snow_stemmer = SnowballStemmer(language='english')

def snowball_stemm(x):
    stemm = snow_stemmer.stem(x)

    return stemm
##########################################################################################################################


##########################################################################################################################
# Save the dictionary and bag of words
def save_dict_bow(dictionary, bow_corpus, country_name):
    dictionary.save(f'cc-bigrams-trigrams-{country_name}-australia.dict')

    with open(f'bow_corpus_bigrams-trigrams_{country_name}_australia.pickle', 'wb') as f:
        pickle.dump(bow_corpus, f)


# load the dictionary and bag of words locally
def load_dict_bow(country_name):
    dictionary = gensim.corpora.Dictionary.load(f'../../../countries/australia/for-students/cc-bigrams-trigrams-{country_name}-tfidf-australia.dict')

    with open(f'../../../countries/australia/for-students/bow_corpus_bigrams-trigrams_{country_name}-tfidf_australia.pickle', 'rb') as f:
        bow = pickle.load(f)
    return dictionary, bow
##########################################################################################################################


##########################################################################################################################
# Output writer
import csv

def output_writer(df, step_name):
# Open a CSV file for writing with UTF-8 encoding
    with open(f'{step_name}_output_australia.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\', delimiter='|')
        # Write the header row to the CSV file
        writer.writerow(df.columns)

        # Write the dataframe to the CSV file
        writer.writerows(df.values)
        # Write the dataframe to the CSV file
        # writer.writerows(df.values)
##########################################################################################################################

### Reading the data

In [5]:
# Read the data into a dataframe
df = pd.read_csv('../../../../../climate-change-with-dates.csv')

In [7]:
# Get the filtered data for Australia
# Use 'AUSTR' for Australia
# Use 'UK' for UK
# Use 'INDIA' for India
# Use 'USA' for USA
# Use 'CAND' for Canada

df_aus = get_country_filtered_data(df, "AUSTR")

####################################
Statistics for the whole dataset
####################################
Total number of articles in the dataset (for all the countries): [1;32m2,548,107[0m
Total number of articles missing in the dataset (for all the countries): [1;31m82,537[0m
Percentage of missing articles: [1;31m3.24%[0m



####################################
Statistics for [1mAUS[0m dataset
####################################
    Total number of articles retrieved for [1;32mAUSTR[0m, without any duplicates: [1;32m194,704[0m
    Total number of columns retrieved for [1;32mAUSTR[0m: [1;32m4[0m
-------------------------------------------------------------------------------
    Total number of missing articles for [1;31mAUSTR[0m: [1;31m6,673[0m
    Percentage of missing articles for [1;31mAUSTR[0m: [1;31m3.43%[0m
-------------------------------------------------------------------------------
    Total number of articles after treating the missing articles: [

In [8]:
df_aus

Unnamed: 0,body,region_of_origin,publication_date,publisher_name
1544,"""Dairy farmers are more than prepared to be pa...",AUSNZ AUSTR,1171497600000,West Australian Newspapers Limited
1545,"From simple shots of the night sky, replete wi...",AUSNZ AUSTR,1228348800000,West Australian Newspapers Limited
1546,They worked off a climate model that suggests ...,AUSNZ AUSTR,1288828800000,West Australian Newspapers Limited
1866,It can be more difficult to tell the hard stor...,AUSNZ AUSTR,1474243200000,Fairfax Media Management Pty Limited
1867,What are kids to think if our generation squib...,AUSNZ AUSTR,1489363200000,Fairfax Media Management Pty Limited
...,...,...,...,...
2546923,Five programs are planned for Whittlesea.\n\nT...,AUSNZ AUSTR,1269388800000,Nationwide News Pty Ltd.
2547014,The report also identified a rise in the natio...,AUSNZ AUSTR,1123027200000,Nationwide News Pty Ltd.
2547043,If only it was that simple. No one disputes th...,AUSNZ AUSTR,1617235200000,Yaffa Media Pty Ltd.
2547044,"""No one has invested money into streamlining t...",AUSNZ AUSTR,1629072000000,Yaffa Media Pty Ltd.


In [9]:
# Delete the bigger dataset to save on memory
del df

### Preprocessing the data

In [10]:
# Preprocess the data
df_aus = preprocess_data(df_aus)

Done removing web links...
Done removing numbers, single letter words, new lines, tab spaces...
Done removing the stop words...
Done removing extra spaces after removing the links, words, numbers...
Done selecting specific word lengths...
Clean up done...
Final clean up done...

Preprocessing completed for 184401 articles!


In [11]:
# df_aus['word_count'] = df_aus.preprocessed_sentence.apply(lambda x: len(x))
df_aus['preprocessed_sentence'].iloc[0]

['dairy',
 'farmers',
 'prepared',
 'part',
 'market',
 'water',
 'based',
 'sound',
 'plans',
 'provide',
 'water',
 'irrigators',
 'urban',
 'users',
 'environment',
 'burgess',
 'said',
 'community',
 'deserved',
 'well',
 'informed',
 'environmental',
 'issues',
 'crucial',
 'lost',
 'sight',
 'fact',
 'cent',
 'dairy',
 'farmers',
 'members',
 'natural',
 'resource',
 'management',
 'groups',
 'usually',
 'said',
 'dairy',
 'australia',
 'biggest',
 'food',
 'industry',
 'three',
 'billion',
 'dollars',
 'worth',
 'milk',
 'produced',
 'farmgate',
 'turned',
 'products',
 'worth',
 'nine',
 'billion',
 'dollars',
 'factory',
 'door']

In [12]:
df_aus.head()

Unnamed: 0,body,region_of_origin,publication_date,publisher_name,preprocessed_sentence
1544,"""Dairy farmers are more than prepared to be pa...",AUSNZ AUSTR,1171497600000,West Australian Newspapers Limited,"[dairy, farmers, prepared, part, market, water..."
1545,"From simple shots of the night sky, replete wi...",AUSNZ AUSTR,1228348800000,West Australian Newspapers Limited,"[simple, shots, night, replete, myriad, twinkl..."
1546,They worked off a climate model that suggests ...,AUSNZ AUSTR,1288828800000,West Australian Newspapers Limited,"[worked, climate, model, suggests, region, rec..."
1866,It can be more difficult to tell the hard stor...,AUSNZ AUSTR,1474243200000,Fairfax Media Management Pty Limited,"[difficult, tell, hard, stories, mental, healt..."
1867,What are kids to think if our generation squib...,AUSNZ AUSTR,1489363200000,Fairfax Media Management Pty Limited,"[kids, think, generation, squibs, chance, prev..."


### Adding Bigrams and Trigrams

In [13]:
# Bigrams
bigrams_phrases = gensim.models.Phrases(df_aus.preprocessed_sentence.tolist(), min_count=4, threshold=50)
print('Bigram phrases done...')

# Trigrams
trigrams_phrases = gensim.models.Phrases(bigrams_phrases[df_aus.preprocessed_sentence.tolist()], threshold=50)
print('Trigram phrases done...')

bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigrams_phrases)
print('Bigram & Trigram done...')

def make_bigrams(bodys):
    return(bigram[doc] for doc in bodys)

def make_trigrams(bodys):
    return(trigram[bigram[doc]] for doc in bodys)

data_bigrams = make_bigrams(df_aus.preprocessed_sentence.tolist())
data_bigrams_trigrams = make_trigrams(data_bigrams)



Bigram phrases done...
Trigram phrases done...
Bigram & Trigram done...


In [14]:
# data_bigrams_trigrams_list = list(data_bigrams_trigrams)

In [15]:
df_aus['preprocessed_list_with_bi_tri'] = list(data_bigrams_trigrams)
df_aus['preprocessed_len'] = df_aus.preprocessed_list_with_bi_tri.apply(lambda x: len(x))
df_aus = df_aus[df_aus.preprocessed_len > 0]
df_aus.shape
# Stemming each word in each row
df_aus['preprocessed_list_with_bi_tri'] = df_aus.preprocessed_list_with_bi_tri.apply(lambda x: np.vectorize(snowball_stemm)(np.array(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
from gensim.models import TfidfModel

id2word = corpora.Dictionary(df_aus.preprocessed_list_with_bi_tri.tolist())

corpus = [id2word.doc2bow(body) for body in df_aus.preprocessed_list_with_bi_tri.tolist()]
# print(corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []

for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]  

    #reassign
    corpus[i] = new_bow

In [17]:

print(f'Total number of articles: {len(corpus)}')

# Adding bigram and trigram list as a new column 
# df_aus['preprocessed_list_with_bi_tri'] = data_bigrams_trigrams_list

# Calculating the words in each row and removing lists with no words
# df_aus['preprocessed_len'] = df_aus.tfidf_sentence.apply(lambda x: len(x))
# df_aus = df_aus[df_aus.preprocessed_len > 0]
# df_aus.shape

Total number of articles: 184330


In [18]:
df_aus.head()

Unnamed: 0,body,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len
1544,"""Dairy farmers are more than prepared to be pa...",AUSNZ AUSTR,1171497600000,West Australian Newspapers Limited,"[dairy, farmers, prepared, part, market, water...","[dairy_farm, prepar, part, market, water, base...",55
1545,"From simple shots of the night sky, replete wi...",AUSNZ AUSTR,1228348800000,West Australian Newspapers Limited,"[simple, shots, night, replete, myriad, twinkl...","[simpl, shot, night, replet, myriad, twinkl, s...",307
1546,They worked off a climate model that suggests ...,AUSNZ AUSTR,1288828800000,West Australian Newspapers Limited,"[worked, climate, model, suggests, region, rec...","[work, climat, model, suggest, region, receiv,...",124
1866,It can be more difficult to tell the hard stor...,AUSNZ AUSTR,1474243200000,Fairfax Media Management Pty Limited,"[difficult, tell, hard, stories, mental, healt...","[difficult, tell, hard, stori, mental_health, ...",101
1867,What are kids to think if our generation squib...,AUSNZ AUSTR,1489363200000,Fairfax Media Management Pty Limited,"[kids, think, generation, squibs, chance, prev...","[kid, think_generation_squib, chanc, prevent, ...",90


In [19]:
# dictionary = corpora.Dictionary(df_aus['preprocessed_list_with_bi_tri'].tolist())

# bow_corpus = [dictionary.doc2bow(body) for body in df_aus['preprocessed_list_with_bi_tri'].tolist()]

In [22]:
# Save the dictionary to a file

save_dict_bow(id2word, corpus,'AUS-tfidf')

In [6]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(bow_corpus, num_topics=60, id2word = dictionary, passes=40, workers=15,random_state=42)

In [7]:
# save the model
ldamodel.save("lda_model_with_bigrams_and_trigrams-cc-aus-tfidf.model")

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, bow_corpus, dictionary)
vis

  from imp import reload
  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [22]:
dictionary, bow_corpus = load_dict_bow('AUS')

In [23]:
from gensim import  models
lda_model = models.ldamodel.LdaModel.load("../../../countries/australia/for-students/lda_model_with_bigrams_and_trigrams-cc-aus-tfidf.model")
# lda_model_id2w = models.ldamodel.LdaModel.load("lda_model.model.id2word")

In [24]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
vis

  from imp import reload
  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [25]:
# pyLDAvis.save_html(vis, 'lda_model_with_bigrams_and_trigrams_cc_aus-tfidf.html')


In [26]:
lambd = 0.2 # a specific relevance metric value

all_topics = {}
num_topics = lda_model.num_topics
num_terms = 58

for i in range(1,num_topics+1): ## Correct range
    topic = vis.topic_info[vis.topic_info.Category == 'Topic'+str(i)].copy()
    topic['relevance'] = topic['loglift']*(1-lambd)+topic['logprob']*lambd

    all_topics['Topic '+str(i)] = topic.sort_values(by='relevance', ascending=False).Term[:num_terms].values
    all_topics['Topic '+str(i)] = all_topics['Topic '+str(i)] +' '+ round(topic['relevance'][:num_terms], 3).astype(str).values
# pd.DataFrame(all_topics)
all_topics = pd.DataFrame(all_topics)
all_topics

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 51,Topic 52,Topic 53,Topic 54,Topic 55,Topic 56,Topic 57,Topic 58,Topic 59,Topic 60
0,politician 0.703,price 0.935,agreement 1.061,polit 1.07,letter 1.153,parti 1.476,rudd 1.326,film 1.252,scientist 1.232,morrison 1.432,...,reef 3.601,gore 3.623,court 3.439,flanneri 3.991,servic 3.501,fuel 3.894,tasmania 4.344,church 4.249,koala 3.868,robert 4.155
1,serious 0.685,permit 0.934,kyoto 1.007,ideolog 0.802,letters_editor 1.116,elect 1.282,wong 1.122,cricket 1.116,ipcc 1.228,zero 1.368,...,coral 3.429,flight 3.546,curti 3.405,newcastl 3.675,capital_bond 3.378,biofuel 3.712,tasmanian 4.177,wine 4.061,macdonald 3.792,peacock 3.882
2,take 0.596,garnaut 0.929,developing_countri 1.007,populist 1.08,monckton 1.115,voter 1.245,legisl 1.181,sport 1.067,scientif 1.207,taylor 1.342,...,great_barrier_reef 3.084,travel 3.569,ridd 3.34,hunter 3.589,cement_concret 3.372,ethanol 3.36,connor 3.815,pope 3.814,toowoomba 3.781,bullock 3.835
3,problem 0.596,abat 0.892,negoti 0.959,intellectu 0.65,yesterday_item 1.265,labor 1.139,amend 0.98,song 1.149,warm 1.204,fitzgibbon 1.239,...,coral_reef 2.949,airlin 3.715,litig 3.275,professor_flanneri 3.414,bondhold 3.269,biodiesel 3.35,launceston 3.632,christian 3.803,koala_habitat 3.72,gild 3.811
4,noth 0.575,trade 0.806,bali 0.955,orthodoxi 0.646,islam 1.099,liber 1.101,senat 0.956,mother 1.111,scienc 1.214,albanes 1.222,...,bleach 2.889,aviat 3.032,legal 3.223,cowan 3.419,apparatus 3.266,coleman 3.305,hobart 3.595,whale 3.802,carr 3.673,ghgsat 3.8
5,real 0.561,carbon 0.753,summit 0.936,iggulden 0.645,crime 1.058,seat 1.074,opposit 0.945,movi 0.948,computer_model 1.192,target 1.301,...,coral_bleach 2.874,qanta 3.014,lawyer 3.175,costa 3.383,transportation_transmission_storag 3.213,ship 3.24,gutwein 3.523,winemak 3.785,koala_popul 3.64,dhabi 3.787
6,must 0.539,compens 0.749,deleg 0.932,argument 0.641,crikey 1.026,candid 1.048,change_minister_penni 0.943,music 0.942,data 1.191,butler 1.106,...,unesco 2.856,airport 3.01,supreme_court 3.169,hamilton 3.93,services_rel 3.21,excis 3.202,barlow 3.459,grape 3.783,width_height_styl 3.64,submit_anonym 3.783
7,enough 0.536,cost 0.711,treati 0.919,debat 0.63,electronic_form_communicate_lett 1.024,vote 0.994,hunt 0.943,husband 0.948,plimer 1.178,bowen 1.105,...,turtl 2.853,aircraft 3.007,schwarzenegg 3.111,lake_macquari 3.359,baker 3.186,vail 3.195,mercuri 3.401,franci 3.787,mcgrath 3.632,rossit 3.803
8,simpli 0.525,trading_schem 0.696,kyoto_protocol 0.923,polaris 0.62,lord_monckton 1.022,elector 0.984,miln 1.225,love 0.935,university_east_anglia 1.303,energy_minister_angus 1.688,...,bleaching_ev 2.799,former_vic 2.943,woodley 3.094,rinehart 3.314,contact 3.183,biofuel_product 3.188,bowden 3.375,lander 3.741,rayner 3.628,judd 3.761
9,stigson 0.506,scheme 0.678,apec 0.896,lomborg 0.627,muslim 1.003,liberal_parti 0.958,garrett 1.029,comedi 0.947,peer_review 1.136,reach_zero 1.512,...,marine_park_author 2.71,president_gor 2.943,lawsuit 3.092,seymour 3.294,distribution_deliveri 3.132,stewart 3.182,uta 3.374,mcleod 3.72,kingaroy 3.614,matt_peacock 3.756


In [27]:
all_topics.to_csv("top-58-words-australia.csv", index=False)

In [28]:
from tqdm import tqdm
from functools import reduce
from collections import Counter

In [29]:
df_aus.preprocessed_sentence.iloc[0]

['dairy',
 'farmers',
 'prepared',
 'part',
 'market',
 'water',
 'based',
 'sound',
 'plans',
 'provide',
 'water',
 'irrigators',
 'urban',
 'users',
 'environment',
 'burgess',
 'said',
 'community',
 'deserved',
 'well',
 'informed',
 'environmental',
 'issues',
 'crucial',
 'lost',
 'sight',
 'fact',
 'cent',
 'dairy',
 'farmers',
 'members',
 'natural',
 'resource',
 'management',
 'groups',
 'usually',
 'said',
 'dairy',
 'australia',
 'biggest',
 'food',
 'industry',
 'three',
 'billion',
 'dollars',
 'worth',
 'milk',
 'produced',
 'farmgate',
 'turned',
 'products',
 'worth',
 'nine',
 'billion',
 'dollars',
 'factory',
 'door']

In [30]:
# Assume `new_doc` is a list of tokens representing a new document
new_topics = []
for i in tqdm(range(0,df_aus.shape[0])):

    word_count = dict(Counter(df_aus.preprocessed_list_with_bi_tri.iloc[i]))

    top_words = dict((k, v) for k, v in word_count.items() if v >= 3)

    # print(topic_words)
    new_doc_topics = []
    for j in  range(1, num_topics+1):
        topic_words = all_topics['Topic ' + str(j)]
        word_list = topic_words.apply(lambda x: x.split(' ')[0])

        # Find the intersection of words between the topic words and the word list
        intersection = set(list(top_words.keys())).intersection(word_list)
        # print(j,intersection)

        # Get a list of relevance scores for each intersected word
        relevance = [float(topic_words[topic_words.str.contains(i)].str.split().str[-1].values[0]) for i in intersection if len(i) > 0]
        # print(relevance)

        # Check if there are more than 1 relevance scores
        if len(relevance) > 0:
            result = reduce(lambda x, y: x*y, relevance)
        else:
            result = 0
        new_doc_topics.append((j, result))
    # print(new_doc_topics)
    new_doc_topics.sort(key=lambda x: x[1], reverse=True)
    # print(new_doc_topics)
    all_zeros = all(tup[1] == 0 for tup in new_doc_topics)

    if all_zeros:
        new_topics.append(1)
    else:
        new_topics.append(new_doc_topics[0][0])

100%|██████████| 184330/184330 [36:02<00:00, 85.23it/s]  


In [31]:
# Assign the predicted topics to the articles
df_aus['topics'] = new_topics

# Append the topic numbers with the word 'topic'
df_aus['topics'] = df_aus.topics.apply(lambda x: f'topic {str(x)}')

# Check the distribution of topics
print(df_aus.topics.value_counts())

# Check if there are any inappropriate column names
print(df_aus.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


topic 14    44125
topic 1     21330
topic 21    19901
topic 33     8114
topic 25     7016
topic 39     6862
topic 27     4804
topic 34     4203
topic 46     3478
topic 50     3339
topic 45     3210
topic 12     2900
topic 36     2882
topic 51     2630
topic 55     2520
topic 40     2301
topic 42     2179
topic 22     2112
topic 29     2095
topic 19     2088
topic 37     2057
topic 43     1751
topic 48     1723
topic 53     1674
topic 47     1594
topic 7      1582
topic 56     1534
topic 24     1456
topic 57     1448
topic 18     1440
topic 54     1434
topic 49     1407
topic 52     1402
topic 23     1388
topic 32     1225
topic 38     1042
topic 15     1009
topic 58      973
topic 10      800
topic 41      795
topic 35      779
topic 44      768
topic 31      709
topic 13      671
topic 6       671
topic 17      641
topic 26      566
topic 30      499
topic 59      491
topic 11      480
topic 16      353
topic 20      325
topic 8       286
topic 60      263
topic 9       252
topic 3   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [32]:
# Save the dataset to the local system
df_aus.to_csv('cc-australia-topic-prediction.csv', index=False)

# Read the files and run again

In [6]:
df_aus = pd.read_csv('cc-australia-topic-prediction.csv')

In [33]:
df_aus

Unnamed: 0,body,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics
1544,"""Dairy farmers are more than prepared to be pa...",AUSNZ AUSTR,1171497600000,West Australian Newspapers Limited,"[dairy, farmers, prepared, part, market, water...","[dairy_farm, prepar, part, market, water, base...",55,topic 1
1545,"From simple shots of the night sky, replete wi...",AUSNZ AUSTR,1228348800000,West Australian Newspapers Limited,"[simple, shots, night, replete, myriad, twinkl...","[simpl, shot, night, replet, myriad, twinkl, s...",307,topic 37
1546,They worked off a climate model that suggests ...,AUSNZ AUSTR,1288828800000,West Australian Newspapers Limited,"[worked, climate, model, suggests, region, rec...","[work, climat, model, suggest, region, receiv,...",124,topic 14
1866,It can be more difficult to tell the hard stor...,AUSNZ AUSTR,1474243200000,Fairfax Media Management Pty Limited,"[difficult, tell, hard, stories, mental, healt...","[difficult, tell, hard, stori, mental_health, ...",101,topic 14
1867,What are kids to think if our generation squib...,AUSNZ AUSTR,1489363200000,Fairfax Media Management Pty Limited,"[kids, think, generation, squibs, chance, prev...","[kid, think_generation_squib, chanc, prevent, ...",90,topic 14
...,...,...,...,...,...,...,...,...
2546923,Five programs are planned for Whittlesea.\n\nT...,AUSNZ AUSTR,1269388800000,Nationwide News Pty Ltd.,"[five, programs, planned, whittlesea, communit...","[five, program, plan, whittlesea, communiti, t...",26,topic 1
2547014,The report also identified a rise in the natio...,AUSNZ AUSTR,1123027200000,Nationwide News Pty Ltd.,"[report, also, identified, rise, national, ave...","[report, also, identifi, rise, nation, averag,...",96,topic 14
2547043,If only it was that simple. No one disputes th...,AUSNZ AUSTR,1617235200000,Yaffa Media Pty Ltd.,"[simple, disputes, measure, global, warming, p...","[simpl, disput, measur, global, warm, potenti,...",409,topic 44
2547044,"""No one has invested money into streamlining t...",AUSNZ AUSTR,1629072000000,Yaffa Media Pty Ltd.,"[invested, money, streamlining, clearance, pro...","[invest, money, streamlin, clearanc, process, ...",153,topic 21


In [34]:
# Convert the timestamps to a proper datetime format
df_aus['publication_datetime'] = pd.to_datetime(df_aus['publication_date'], unit='ms')

# Retain just the date and drop the time
df_aus['publication_date'] = df_aus.publication_datetime.dt.date
df_aus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,body,region_of_origin,publication_date,publisher_name,preprocessed_sentence,preprocessed_list_with_bi_tri,preprocessed_len,topics,publication_datetime
1544,"""Dairy farmers are more than prepared to be pa...",AUSNZ AUSTR,2007-02-15,West Australian Newspapers Limited,"[dairy, farmers, prepared, part, market, water...","[dairy_farm, prepar, part, market, water, base...",55,topic 1,2007-02-15
1545,"From simple shots of the night sky, replete wi...",AUSNZ AUSTR,2008-12-04,West Australian Newspapers Limited,"[simple, shots, night, replete, myriad, twinkl...","[simpl, shot, night, replet, myriad, twinkl, s...",307,topic 37,2008-12-04
1546,They worked off a climate model that suggests ...,AUSNZ AUSTR,2010-11-04,West Australian Newspapers Limited,"[worked, climate, model, suggests, region, rec...","[work, climat, model, suggest, region, receiv,...",124,topic 14,2010-11-04
1866,It can be more difficult to tell the hard stor...,AUSNZ AUSTR,2016-09-19,Fairfax Media Management Pty Limited,"[difficult, tell, hard, stories, mental, healt...","[difficult, tell, hard, stori, mental_health, ...",101,topic 14,2016-09-19
1867,What are kids to think if our generation squib...,AUSNZ AUSTR,2017-03-13,Fairfax Media Management Pty Limited,"[kids, think, generation, squibs, chance, prev...","[kid, think_generation_squib, chanc, prevent, ...",90,topic 14,2017-03-13
...,...,...,...,...,...,...,...,...,...
2546923,Five programs are planned for Whittlesea.\n\nT...,AUSNZ AUSTR,2010-03-24,Nationwide News Pty Ltd.,"[five, programs, planned, whittlesea, communit...","[five, program, plan, whittlesea, communiti, t...",26,topic 1,2010-03-24
2547014,The report also identified a rise in the natio...,AUSNZ AUSTR,2005-08-03,Nationwide News Pty Ltd.,"[report, also, identified, rise, national, ave...","[report, also, identifi, rise, nation, averag,...",96,topic 14,2005-08-03
2547043,If only it was that simple. No one disputes th...,AUSNZ AUSTR,2021-04-01,Yaffa Media Pty Ltd.,"[simple, disputes, measure, global, warming, p...","[simpl, disput, measur, global, warm, potenti,...",409,topic 44,2021-04-01
2547044,"""No one has invested money into streamlining t...",AUSNZ AUSTR,2021-08-16,Yaffa Media Pty Ltd.,"[invested, money, streamlining, clearance, pro...","[invest, money, streamlin, clearanc, process, ...",153,topic 21,2021-08-16


In [35]:
# Split the year and months
df_aus['publication_year'] = df_aus.publication_datetime.dt.year
df_aus['publication_month'] = df_aus.publication_datetime.dt.month

# prefix a 0 to the months
df_aus['publication_month'] = df_aus.publication_month.apply(lambda x: '0'+str(x) if len(str(x)) < 2 else str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
df_grouped_df = df_aus.groupby(['topics', 'publication_year', 'publication_month']).count()[['body']].reset_index()

In [37]:
df_grouped_df

Unnamed: 0,topics,publication_year,publication_month,body
0,topic 1,1989,07,2
1,topic 1,1990,02,1
2,topic 1,1990,09,1
3,topic 1,1990,11,2
4,topic 1,1991,07,1
...,...,...,...,...
12942,topic 9,2021,04,1
12943,topic 9,2021,05,2
12944,topic 9,2021,06,1
12945,topic 9,2021,07,1


In [38]:
df_grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12947 entries, 0 to 12946
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   topics             12947 non-null  object
 1   publication_year   12947 non-null  int64 
 2   publication_month  12947 non-null  object
 3   body               12947 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 404.7+ KB


In [40]:
topic_names_inc_exc = pd.read_excel('../../../countries/australia/for-students/Climate_change_topic_names.xlsx')

In [41]:
# Retain the appropriate topics only
imp_topics_df = topic_names_inc_exc[topic_names_inc_exc['Included in Line Chart'] == 1]

In [42]:
imp_topics_df

Unnamed: 0,Topic Number,Article Count,Topic Name (created by research team),Included in Line Chart
1,2,,Garnaut report,1
2,3,,International summits,1
3,4,,Debating denialism,1
6,7,,Climate Pollution Reduction Scheme (2008),1
8,9,,Climate Science,1
10,11,,Extreme weather events,1
14,15,,Coal fired power stations,1
15,16,,Gas mine development,1
16,17,,Climate forcasting,1
17,18,,Greenhouse gas emissions,1


In [43]:
imp_topics_df['Topic Number'] = imp_topics_df['Topic Number'].apply(lambda x: 'topic '+str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [44]:
imp_topics_df

Unnamed: 0,Topic Number,Article Count,Topic Name (created by research team),Included in Line Chart
1,topic 2,,Garnaut report,1
2,topic 3,,International summits,1
3,topic 4,,Debating denialism,1
6,topic 7,,Climate Pollution Reduction Scheme (2008),1
8,topic 9,,Climate Science,1
10,topic 11,,Extreme weather events,1
14,topic 15,,Coal fired power stations,1
15,topic 16,,Gas mine development,1
16,topic 17,,Climate forcasting,1
17,topic 18,,Greenhouse gas emissions,1


In [45]:
topics_renamed = dict(zip(imp_topics_df['Topic Number'].values, imp_topics_df['Topic Name (created by research team)'].values))

In [46]:
df_grouped_df['renamed_topics'] = df_grouped_df.topics.map(topics_renamed)

In [47]:
df_grouped_df = df_grouped_df.dropna()

In [48]:
# df_grouped_df = df_grouped_df[df_grouped_df.publication_year >= 2016]
df_grouped_df

Unnamed: 0,topics,publication_year,publication_month,body,renamed_topics
431,topic 11,2001,01,1,Extreme weather events
432,topic 11,2001,07,1,Extreme weather events
433,topic 11,2001,08,1,Extreme weather events
434,topic 11,2001,11,2,Extreme weather events
435,topic 11,2001,12,1,Extreme weather events
...,...,...,...,...,...
12942,topic 9,2021,04,1,Climate Science
12943,topic 9,2021,05,2,Climate Science
12944,topic 9,2021,06,1,Climate Science
12945,topic 9,2021,07,1,Climate Science


In [49]:
df_grouped_df.renamed_topics.unique()

array(['Extreme weather events', 'Coal fired power stations',
       'Gas mine development', 'Climate forcasting',
       'Greenhouse gas emissions',
       'Environmentally responsible investing', 'Garnaut report',
       'Bushfires', 'Renewable energy', 'International summits',
       'Biodiversity, habitats and extinction', 'Agriculture',
       'Climate activism and protests', 'Debating denialism',
       'Recycling and circular economy', 'Coal mining',
       'Glaciers and polar ice', 'Coral reefs',
       'Climate Pollution Reduction Scheme (2008)', 'Climate Science'],
      dtype=object)

In [50]:
df_grouped_df

Unnamed: 0,topics,publication_year,publication_month,body,renamed_topics
431,topic 11,2001,01,1,Extreme weather events
432,topic 11,2001,07,1,Extreme weather events
433,topic 11,2001,08,1,Extreme weather events
434,topic 11,2001,11,2,Extreme weather events
435,topic 11,2001,12,1,Extreme weather events
...,...,...,...,...,...
12942,topic 9,2021,04,1,Climate Science
12943,topic 9,2021,05,2,Climate Science
12944,topic 9,2021,06,1,Climate Science
12945,topic 9,2021,07,1,Climate Science


In [51]:
for i in df_grouped_df.renamed_topics.unique():
    print(f'if (name === "{i}")')
    sub_df = df_grouped_df[df_grouped_df.renamed_topics == i]
    # print(sub_df.publication_year.astype(str) + '-' + sub_df.publication_month)
    result = dict(zip(sub_df.publication_year.astype(str) + '-' + sub_df.publication_month, sub_df.body))
    result = [{'year': key, f'value{i}': value} for key, value in result.items()]

    print(f'data = {result}; \n')

if (name === "Extreme weather events")
data = [{'year': '2001-01', 'valueExtreme weather events': 1}, {'year': '2001-07', 'valueExtreme weather events': 1}, {'year': '2001-08', 'valueExtreme weather events': 1}, {'year': '2001-11', 'valueExtreme weather events': 2}, {'year': '2001-12', 'valueExtreme weather events': 1}, {'year': '2002-01', 'valueExtreme weather events': 1}, {'year': '2002-02', 'valueExtreme weather events': 3}, {'year': '2002-03', 'valueExtreme weather events': 1}, {'year': '2002-04', 'valueExtreme weather events': 1}, {'year': '2002-06', 'valueExtreme weather events': 2}, {'year': '2002-07', 'valueExtreme weather events': 1}, {'year': '2002-08', 'valueExtreme weather events': 1}, {'year': '2002-10', 'valueExtreme weather events': 2}, {'year': '2002-11', 'valueExtreme weather events': 1}, {'year': '2002-12', 'valueExtreme weather events': 1}, {'year': '2003-01', 'valueExtreme weather events': 1}, {'year': '2003-02', 'valueExtreme weather events': 1}, {'year': '2003-04'

In [90]:
x

[{'year': '1989-04', 'valuetopic 1': 1},
 {'year': '1989-05', 'valuetopic 1': 1},
 {'year': '1989-07', 'valuetopic 1': 1},
 {'year': '1990-02', 'valuetopic 1': 1},
 {'year': '1990-09', 'valuetopic 1': 2},
 {'year': '1990-11', 'valuetopic 1': 2},
 {'year': '1991-06', 'valuetopic 1': 1},
 {'year': '1991-07', 'valuetopic 1': 1},
 {'year': '1991-08', 'valuetopic 1': 1},
 {'year': '1991-09', 'valuetopic 1': 1},
 {'year': '1992-04', 'valuetopic 1': 1},
 {'year': '1992-07', 'valuetopic 1': 1},
 {'year': '1992-12', 'valuetopic 1': 1},
 {'year': '1993-11', 'valuetopic 1': 1},
 {'year': '1994-06', 'valuetopic 1': 1},
 {'year': '1994-08', 'valuetopic 1': 1},
 {'year': '1995-01', 'valuetopic 1': 2},
 {'year': '1995-11', 'valuetopic 1': 1},
 {'year': '1996-07', 'valuetopic 1': 1},
 {'year': '1996-12', 'valuetopic 1': 1},
 {'year': '1997-04', 'valuetopic 1': 1},
 {'year': '1997-06', 'valuetopic 1': 2},
 {'year': '1997-07', 'valuetopic 1': 1},
 {'year': '1997-09', 'valuetopic 1': 1},
 {'year': '1997-