In [1]:
# first preprocesses the data: 
# group the debate titles by date (month, year, decade)
# export to txt files? 

In [2]:
#So it sounds like my assignment should be using dynamic topic modeling to order debate 
#titles by theme and then figure out where the biggest rises and falls are

In [138]:
import numpy as np
import os 
import pandas as pd 
import re

from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim.utils import simple_preprocess

import warnings; warnings.simplefilter('ignore')

In [139]:
hansard = pd.read_csv('/users/sbuongiorno/hansard_justnine_w_year.csv')  

In [140]:
hansard = hansard.sample(800)

In [146]:
def removing_leading_whitespaces(text):
     return re.sub(r"^\s+","",text)

def make_dir(data, fname): 
    path = os.getcwd()
    current_folder = os.path.basename(path)
    target_folder = fname + '_subsets'
    
    os.makedirs(target_folder, exist_ok=True)
    return target_folder


def clean_strings(data, keep):
    patterns = ['\[', '\]', '\(', '\)', '—$', '^—', '\"', '^ ', '^\'']#, '—', '\.']

    for pattern in patterns:
        regex = re.compile(pattern)
        data[keep] = data[keep].str.replace(regex, '')
        
    #shortword = re.compile(r'\W*\b\w{1,3}\b')
    #data[keep] = data[keep].str.replace(shortword, '')
    
    return data

def preprocess_dtm(data, col_name, keep, drop, start, end, intv, fname):
    target_folder = make_dir(data, fname)
    
    start = start
    end = end
    
    while start <= end:
        start = start + intv
        
        data = data[[col_name, keep]]
        subset = data[(data[col_name] >= start - intv) & (data[col_name] <= start - 1)]

        if not subset.empty:
            subset[keep] = subset[keep].astype(str).str.upper()
            
            stopwords = '|'.join(['THE', 'AND', 'NAN'])
            subset[keep] = subset[keep].str.replace(stopwords, '')

            descr = str(subset[col_name].iloc[0])
            descr_2 = str(subset[col_name].iloc[-1])
            
            subset = clean_strings(subset, keep)
            subset = subset.drop(columns=[drop])
            subset = subset.drop_duplicates()
            #if unit = 'tokens':
            #subset = subset[keep].str.split()
            
            time_slice = int(subset.shape[0])
            
            subset = subset.values.tolist()
                        
            dict_subset = Dictionary(subset)
            
            corpus = [dict_subset.doc2bow(s) for s in subset]
            
            ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                             id2word=dict_subset, 
                                             time_slice=[time_slice],
                                             num_topics=2)
            
            #print(subset)
            #print(ldaseq.print_topics())
        
        if start > end:
            return ldaseq
            

In [147]:
ldaseq = preprocess_dtm(data=hansard,
                        col_name='year', 
                        keep='debate', 
                        drop='year', 
                        start=1800, 
                        end=1910, 
                        intv=10, 
                        fname='hansard_decade')

# add unit = phrase or tokens 

In [144]:
ldaseq.print_topic_times(topic=0) # change in first topic 

[[('THIRD RESOLUTION.', 0.16037432740034796),
  ('NAVY ESTIMATES, 1910–11.', 0.16037432740034796),
  ('CIVIL SERVICES  REVENUE DEPARTMENTS ESTIMATES, 1910–11. ',
   0.16037432740034796),
  ('DEBATE ON  ADDRESS.', 0.16037432740034796),
  ('FICE BILL COLLECTION OF INCOME TAX.', 0.10178688038315399),
  ('CONSOLIDATED FUND NO. 2 BILL.', 0.051343162003090824),
  ('CLAUSE 1. —INCREASE OF NUMBER OF DEVELOPMENT COMMISSIONERS. ',
   0.051343162003090824),
  ('TOBACCO IMPORT DUTY.', 0.051343162003090824),
  ('CLASS II.', 0.051343162003090824),
  ("SECRETARY FOR SCOTL'S OFFICE.", 0.051343162003090824)]]

In [132]:
ldaseq.print_topic_times(topic=1) # change in second topic 

[[('PARLIAMENTARY FRANCHISE WOMEN BILL', 0.5641679998279904),
  ('BUSINESS  HOUSE', 0.24525165509357863),
  ('TRAINING COLLEGES  SECONDARY SCHOOLS', 0.19058034507843097)]]

In [133]:
ldaseq.print_topics(time=0)

[[('BUSINESS  HOUSE', 0.5881063701472296),
  ('TRAINING COLLEGES  SECONDARY SCHOOLS', 0.24513861366837908),
  ('PARLIAMENTARY FRANCHISE WOMEN BILL', 0.16675501618439137)],
 [('PARLIAMENTARY FRANCHISE WOMEN BILL', 0.5641679998279904),
  ('BUSINESS  HOUSE', 0.24525165509357863),
  ('TRAINING COLLEGES  SECONDARY SCHOOLS', 0.19058034507843097)]]