In [1]:
# First, preprocess the data:
# group the debate titles by date (month, year, decade).
# TODO: decide whether to export the groups to txt files.

In [1]:
import os 
import re
import csv
import sys
import numpy as np
import pandas as pd 

from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary

from operator import itemgetter

#os.chdir('/users/sbuongiorno/democracy-lab/util/')

#from pyfunctions.str_functions import lemmatize_df_text
#from pyfunctions.parallelize_operation import parallelize_operation

# Make the project folder the working directory for the rest of the notebook.
os.chdir('/users/sbuongiorno/hansard-debate-titles/dynamic-topic-modeling/')

# NOTE(review): this silences every warning for the remainder of the session,
# including any pandas or gensim warnings raised by the functions defined below.
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the Hansard corpus from CSV (the modelling call below reads its 'year' column).
hansard = pd.read_csv(
    '/users/sbuongiorno/hansard_justnine_w_year.csv'
)

In [4]:
# The text column is called 'sentence' in the rest of the notebook.
hansard = hansard.rename({'text': 'sentence'}, axis='columns')

In [5]:
# Load the stop word table; its 'stop_word' column is turned into a list in the next cell.
stopwords = pd.read_csv(
    '/users/sbuongiorno/stopwords.csv'
)

In [6]:
# Plain Python list of stop words, passed to the modelling functions as `stopwords_list`.
list_of_stopwords = stopwords.loc[:, 'stop_word'].tolist()

In [7]:
# do I want to lemmatize words before topic modeling? 

# dtm_gensim_functions.py

#def removing_leading_whitespaces(text):
#     return re.sub(r"^\s+","", text)

# Placeholder lists didn't do anything, so I will need to add date separation after the fact (before plotting).

# TODO: consider replacing matches with a space instead of nothing, then collapsing the extra whitespace.

def clean_strings(data, group):
    """Strip punctuation and very short words from a text column.

    Each pattern is removed in turn (order matters, because some patterns are
    anchored to the start or the end of the string). Afterwards every word of
    one to three characters is removed together with the non-word characters
    directly in front of it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. The column is overwritten in place.
    group : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``data[group]`` cleaned.
    """
    # Raw strings: the backslashes are regex escapes, not (invalid) Python
    # string escapes that newer interpreters warn about.
    patterns = [r'\[', r'\]', r'\(', r'\)', r'—$', r'^—', r'"', r'^ ', r"^'",
                r'—', r'\.', r',', r'\?']

    for pattern in patterns:
        # regex=True must be explicit: pandas >= 2.0 treats `pat` as a literal
        # string by default and refuses a compiled pattern in that mode.
        data[group] = data[group].str.replace(pattern, '', regex=True)

    # Words of 1-3 characters, plus any non-word characters just before them.
    data[group] = data[group].str.replace(r'\W*\b\w{1,3}\b', '', regex=True)

    return data


def dtm_dates(data, date_col, group, start, end, intv):
    """Flag which time windows between ``start`` and ``end`` contain any rows.

    The windows are ``[start, start + intv)``, ``[start + intv, start + 2 * intv)``
    and so on, one for every window start strictly below ``end``. The window
    width follows ``intv`` (it used to be fixed at ten years regardless of
    ``intv``); for integer dates and ``intv=10`` the result is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``date_col`` and ``group``.
    date_col : str
        Name of the numeric date column (e.g. a year).
    group : str
        Name of the text column; only checked for existence here.
    start, end : number
        First window start and exclusive upper bound for window starts.
    intv : number
        Window width and step; must be positive.

    Returns
    -------
    list of int
        One entry per window: 1 if any row falls inside it, otherwise 0.

    Raises
    ------
    ValueError
        If ``intv`` is not positive (the loop would never terminate).
    KeyError
        If ``date_col`` or ``group`` is not a column of ``data``.
    """
    if intv <= 0:
        raise ValueError('intv must be positive, got {!r}'.format(intv))

    # Selecting both columns once validates that they exist; only the dates
    # are needed to decide whether a window is populated.
    dates = data[[date_col, group]][date_col]

    date_exists = []
    window_start = start
    while window_start < end:
        in_window = (dates >= window_start) & (dates < window_start + intv)
        date_exists.append(1 if in_window.any() else 0)
        window_start += intv

    return date_exists


def label_topics(data, date_col, group, start, end, intv, **kwargs):
    """Build a short label from the most frequent words in a text column.

    Rows are visited window by window (``[start, start + intv)``, then the next
    ``intv``-wide window, for every window start below ``end``). The text is
    lower-cased, cleaned with ``clean_strings``, stripped of stop words, and
    the remaining words are counted across all windows. The label is the most
    frequent words joined with ``'-'``.

    Note: the word counter used to be re-created inside the loop, so only the
    final window contributed to the label and every earlier window was
    processed for nothing. Counts now accumulate over the whole range.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``date_col`` and ``group``.
    date_col : str
        Name of the numeric date column (e.g. a year).
    group : str
        Name of the text column the label is built from.
    start, end : number
        First window start and exclusive upper bound for window starts.
    intv : number
        Window width and step; must be positive.
    **kwargs
        stopwords_list : iterable of str, optional
            Words to leave out of the count. Defaults to no stop words.
        n_words : int, optional
            Number of words in the label. Defaults to 3.

    Returns
    -------
    str
        The top words joined by ``'-'`` (ties keep first-seen order), or an
        empty string when no words were found.

    Raises
    ------
    ValueError
        If ``intv`` is not positive (the loop would never terminate).
    KeyError
        If ``date_col`` or ``group`` is not a column of ``data``.
    """
    if intv <= 0:
        raise ValueError('intv must be positive, got {!r}'.format(intv))

    stopwords_list = kwargs.get('stopwords_list')
    # A set gives constant-time lookups and makes the "no stop words" default safe.
    stopwords = set(stopwords_list) if stopwords_list is not None else set()
    n_words = kwargs.get('n_words', 3)

    frame = data[[date_col, group]]
    dates = frame[date_col]

    # Created once, before the loop, so that every window contributes.
    count = {}

    window_start = start
    while window_start < end:
        in_window = (dates >= window_start) & (dates < window_start + intv)
        # Copy so the cleaning below never writes into a view of the caller's frame.
        subset = frame.loc[in_window, [group]].copy()

        if not subset.empty:
            subset[group] = subset[group].astype(str).str.lower()
            subset = clean_strings(subset, group)

            for text in subset[group]:
                for word in text.split():
                    if word not in stopwords:
                        count[word] = count.get(word, 0) + 1

        window_start += intv

    # sorted() is stable, so equally frequent words keep their first-seen order.
    top_words = sorted(count.items(), key=itemgetter(1), reverse=True)[:n_words]

    return '-'.join(word for word, _ in top_words)
    
    
def dtm_model(data, date_col, topic_model, start, end, intv, num_topics, **kwargs):
    """Fit a gensim ``LdaSeqModel`` on a text column split into time windows.

    Rows are grouped into windows ``[start, start + intv)``, then the next
    ``intv``-wide window, for every window start below ``end``. Within each
    non-empty window the text is lower-cased, cleaned with ``clean_strings``,
    de-duplicated, stripped of stop words and tokenised on whitespace. Each
    non-empty window becomes one time slice; empty windows are skipped and do
    not produce a slice.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``date_col`` and ``topic_model``.
    date_col : str
        Name of the numeric date column (e.g. a year).
    topic_model : str
        Name of the text column to model.
    start, end : number
        First window start and exclusive upper bound for window starts.
    intv : number
        Window width and step; must be positive.
    num_topics : int
        Number of topics passed to ``LdaSeqModel``.
    **kwargs
        stopwords_list : iterable of str, optional
            Words removed before tokenising. Defaults to no stop words.
        random_state : optional
            Forwarded to ``LdaSeqModel`` only when supplied, to make runs
            repeatable.

    Returns
    -------
    gensim.models.ldaseqmodel.LdaSeqModel
        The fitted model.

    Raises
    ------
    ValueError
        If ``intv`` is not positive, or if no window contains any rows.
    KeyError
        If ``date_col`` or ``topic_model`` is not a column of ``data``.
    """
    if intv <= 0:
        raise ValueError('intv must be positive, got {!r}'.format(intv))

    stopwords_list = kwargs.get('stopwords_list')
    # A set gives constant-time lookups and makes the "no stop words" default safe.
    stopwords = set(stopwords_list) if stopwords_list is not None else set()

    frame = data[[date_col, topic_model]]
    dates = frame[date_col]

    corpus = []
    time_slice = []
    dict_subset = Dictionary()

    window_start = start
    while window_start < end:
        in_window = (dates >= window_start) & (dates < window_start + intv)
        # Keep only the text column, copied so the cleaning below never writes
        # into a view of the caller's frame.
        texts = frame.loc[in_window, [topic_model]].copy()

        if not texts.empty:
            texts[topic_model] = texts[topic_model].astype(str).str.lower()
            texts = clean_strings(texts, topic_model)

            # Duplicates are judged on the cleaned text, before stop word removal.
            texts = texts.drop_duplicates()

            docs = [[word for word in text.split() if word not in stopwords]
                    for text in texts[topic_model]]

            # One time slice per populated window: its number of documents.
            time_slice.append(len(docs))

            # Grow the shared vocabulary first so doc2bow knows every token.
            dict_subset.merge_with(Dictionary(docs))
            corpus.extend(dict_subset.doc2bow(doc) for doc in docs)

        window_start += intv

    if not corpus:
        raise ValueError('no rows of {!r} fall between {!r} and {!r}'.format(date_col, start, end))

    model_kwargs = {}
    if 'random_state' in kwargs:
        model_kwargs['random_state'] = kwargs['random_state']

    ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                     id2word=dict_subset,
                                     time_slice=time_slice,
                                     num_topics=num_topics,
                                     **model_kwargs)

    return ldaseq

In [8]:
# Fit a two-topic model on the sentences, using ten-year windows starting at 1800.
ldaseq = dtm_model(
    data=hansard,
    date_col='year',
    topic_model='sentence',
    start=1800,
    end=1920,
    intv=10,
    num_topics=2,
    stopwords_list=list_of_stopwords,
)

In [9]:
# Write the fitted model to disk (the commented-out load call further down reads this path).
ldaseq.save(
    '/users/sbuongiorno/hansard-debate-titles/dynamic-topic-modeling/ldaseq_test_model'
)

In [14]:
#!jupyter nbconvert --to script dtm_hansard.ipynb

[NbConvertApp] Converting notebook dtm_hansard.ipynb to script
[NbConvertApp] Writing 6754 bytes to dtm_hansard.py


In [11]:
# ldaseqmodel.LdaSeqModel.load('/users/sbuongiorno/hansard-debate-titles/dynamic-topic-modeling/ldaseq_test_model')

In [240]:
# Print all topics for a given time slice (here the first one, time=0).
ldaseq.print_topics(time=0) # print all topics for a given time 

[[('increase', 0.04430505540456826),
  ('affairs', 0.04430505414162301),
  ('lords', 0.04430505414162301),
  ('salaries', 0.04430505414162301),
  ('depraved', 0.014098093769162288),
  ('feared', 0.009989856640807704),
  ('efficient', 0.009989856640807704),
  ('leave', 0.009989856640807704),
  ('government', 0.009989856640807704),
  ('dark', 0.009989856640807704),
  ('services', 0.009989856640807704),
  ('acceptance', 0.009989856640807704),
  ('left', 0.009989856640807704),
  ('addressing', 0.009989856640807704),
  ('altogether', 0.009989856640807704),
  ('augmentation', 0.009989856640807704),
  ('captain', 0.009989856640807704),
  ('officials', 0.009989856640807704),
  ('year', 0.009989856640807704),
  ('high', 0.009989856640807704)],
 [('commission', 0.026534267047512914),
  ('worked', 0.01992334523991067),
  ('home', 0.019868562401481715),
  ('rule', 0.019868562401481715),
  ('sunday', 0.019833990512345846),
  ('time', 0.012838106275778355),
  ('county', 0.009464679881260029),
  ('pl

In [239]:
# Show how the first topic (topic=0) changes over time; each inner list is one time interval.
ldaseq.print_topic_times(topic=0) # change over time in first topic, where each list represents the time interval 

[[('increase', 0.04430505540456826),
  ('affairs', 0.04430505414162301),
  ('lords', 0.04430505414162301),
  ('salaries', 0.04430505414162301),
  ('depraved', 0.014098093769162288),
  ('feared', 0.009989856640807704),
  ('efficient', 0.009989856640807704),
  ('leave', 0.009989856640807704),
  ('government', 0.009989856640807704),
  ('dark', 0.009989856640807704),
  ('services', 0.009989856640807704),
  ('acceptance', 0.009989856640807704),
  ('left', 0.009989856640807704),
  ('addressing', 0.009989856640807704),
  ('altogether', 0.009989856640807704),
  ('augmentation', 0.009989856640807704),
  ('captain', 0.009989856640807704),
  ('officials', 0.009989856640807704),
  ('year', 0.009989856640807704),
  ('high', 0.009989856640807704)],
 [('increase', 0.04435764798051931),
  ('affairs', 0.04435764691520652),
  ('lords', 0.04435764691520652),
  ('salaries', 0.04435764691520652),
  ('depraved', 0.014098812074167306),
  ('feared', 0.00999187298040105),
  ('efficient', 0.00999187298040105),


In [8]:
#test = label_topics(data = hansard,
#                    date_col = 'year', # change to date_col
#                    group = 'debate',
#                    start = 1800,
#                    end = 1910,
#                    intv = 10,
#                    stopwords_list=list_of_stopwords)

In [9]:
#test

'indentured-coolies-indian'

In [None]:
#ldaseq = LdaSeqModel.load(temp_file)

In [178]:

#t = 0

#for i in test:
#    if i != 0:
#        print(ldaseq.print_topics(time=t))
    
#    t = t + 1
        

#t = 0

#for i in range(0, len(test)):
    #print(test[i])
    
#    if test[i] != 0:
#        print(ldaseq.print_topics(time=t))
    
#    t = t + 1

#    print(t)

In [179]:
# for the length of range -- if item is not 0, topic (then topic + 1)