In [None]:
# Attach Google Drive to the Colab runtime at /content/drive so the project
# folder used below is reachable; force_remount=True asks for a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
# Work from the project folder on the mounted drive: the relative 'datasets/...'
# paths used later in this notebook are resolved against this directory.
os.chdir('/content/drive/My Drive/Project-CS410')

In [None]:
import numpy as np
import pandas as pd
from numba_plsa.corpus import CorpusBuilder
from numba_plsa.plsa import PLSAModel
# Stop-word list read by get_stopwords() when build_corpus() runs (path is relative to the working directory).
stopword_file = 'datasets/stop.txt'
import scipy
from statsmodels.tsa.stattools import grangercausalitytests
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


# Topic Modeling

In [None]:
def get_stopwords(fname):
  """Read a stop-word list and return the stop words as a set of bytes.

  The file is read in binary mode. Each usable line contributes its first
  space-delimited token; anything after the first space (for example a
  trailing ``| note``) is ignored. Lines that begin with a space or ``|``
  are skipped, as are lines that yield an empty token (blank lines).

  fname: path of the stop-word file.
  returns: set of ``bytes`` tokens.
  """
  skip_prefixes = (b' ', b'|')
  stopwords = set()
  with open(fname, 'rb') as f:
    for line in f:
      # Indexing bytes (line[0]) yields an int on Python 3, which never equals
      # ' ' or '|'; slicing gives a length-1 bytes object that can be compared.
      if line[:1] in skip_prefixes:
        continue
      token = line.split(b' ', 1)[0].strip()
      if token:
        stopwords.add(token)
  return stopwords
def print_title(txt):
  """Print ``txt`` after a blank line, underlined with one '=' per character."""
  underline = '=' * len(txt)
  print ("\n{0}\n{1}".format(txt, underline))

def get_topic_coverage(time_entries, topic_doc_matrix, time_unit=1):
  """Sum the rows of ``topic_doc_matrix`` that share a time entry.

  time_entries: array with one time entry per row of ``topic_doc_matrix``
                (compared element-wise with ``==``).
  topic_doc_matrix: 2-D array; its columns become the columns of the result.
  time_unit: accepted for interface compatibility; not used.
  returns: DataFrame with one row per distinct time entry (in sorted order)
           holding the column-wise sums for that entry.
  """
  distinct_times = sorted(set(time_entries))
  n_columns = topic_doc_matrix.shape[1]
  summed = np.zeros((len(distinct_times), n_columns))
  for row, stamp in enumerate(distinct_times):
    row_mask = time_entries == stamp
    summed[row] = np.sum(topic_doc_matrix[row_mask, :], axis=0)
  return pd.DataFrame(data=summed)

def build_corpus(docs, silent=False):
  """Create a CorpusBuilder and add every document in ``docs`` to it.

  Stop words are loaded from the module-level ``stopword_file``. Each
  document is named after its position in ``docs`` ("0", "1", ...).

  docs: iterable of document texts.
  silent: pass False to print a "Building corpus" banner.
  returns: the populated CorpusBuilder.
  """
  stop_set = get_stopwords(stopword_file)
  if silent == False:
    print_title("Building corpus")
  builder = CorpusBuilder(stopwords=stop_set, min_len=3, max_len=8)
  for position, text in enumerate(docs):
    builder.add_document(name=str(position), text=text)
  return builder

def run_plsa(CB, term_topic_prior, mu=0, n_topics=10, n_iter=100, min_count=5, method='numba', silent=False):
  """Train a PLSAModel on the corpus' document-term data and return it.

  CB: corpus builder providing get_doc_term() and get_term().
  term_topic_prior, mu: forwarded unchanged to PLSAModel.train.
  n_topics, n_iter, min_count, method: forwarded unchanged to PLSAModel.train.
  silent: pass False to print banners and the top five terms of each topic.
  returns: the trained PLSAModel.
  """
  verbose = (silent == False)
  doc_term = CB.get_doc_term()
  if verbose:
    print_title("\nRunning pLSA")

  model = PLSAModel()
  model.train(
    doc_term,
    n_topics=n_topics,
    n_iter=n_iter,
    min_count=min_count,
    method=method,
    term_topic_prior=term_topic_prior,
    mu=mu,
  )

  if verbose:
    print_title("\nTop topic terms")
    top_topic_words = model.top_topic_terms(5, normalized=True)
    for topic_idx in range(n_topics):
      term_names = [CB.get_term(term_id) for term_id in top_topic_words[topic_idx]]
      print ("Topic {0}: {1}".format(topic_idx + 1, ', '.join(term_names)))
  return model

# Topic-level Causality Modeling

In [None]:
def smoothing(series, window=3):
    """Return the step-to-step change of the rolling mean of ``series``.

    The rolling mean over ``window`` points is differenced against its
    previous value; the leading NaNs this produces are back-filled with the
    first available value.
    """
    rolling_mean = series.rolling(window).mean()
    step_change = rolling_mean.diff()
    return step_change.bfill()

def granger_test(time_series, topic_coverage, lag=5, alpha=0.05):
  """Find topics whose coverage passes a Granger causality test against the time series.

  Both inputs are first passed through ``smoothing``. For every topic column a
  two-column frame ['time', 'topic'] is handed to statsmodels'
  ``grangercausalitytests`` with lags 1..``lag``; a topic is reported when the
  'ssr_ftest' p-value of some lag is below ``alpha``.

  time_series: pandas Series of the external time series.
  topic_coverage: DataFrame with one row per time step and one column per topic.
  lag: maximum lag to test.
  alpha: significance threshold for the p-value.
  returns: (causal_topics, causal_lags) - positional topic indices and, for
           each of them, the smallest lag that was significant.
  """
  smoothed_time_series = smoothing(time_series)
  # Smooth every topic column. The earlier ``df[:][i] = ...`` form was chained
  # assignment into a temporary slice, so depending on the pandas version the
  # smoothed values never reached the frame that is tested below.
  smoothed_topic_coverage = topic_coverage.apply(smoothing)

  causal_topics = []
  causal_lags = []
  # per topic granger test
  for i in range(topic_coverage.shape[1]):
    topic_series = smoothed_topic_coverage.iloc[:, i]
    # Values are paired by position (not by index label); zip stops at the shorter input.
    df_granger = pd.DataFrame(columns=['time', 'topic'], data=list(zip(smoothed_time_series, topic_series)))
    gc_res = grangercausalitytests(df_granger, lag, verbose=False)
    for j in range(1, lag + 1):
      p_value = gc_res[j][0]['ssr_ftest'][1]
      if p_value < alpha:
        # Only the first (smallest) significant lag is recorded for a topic.
        causal_topics.append(i)
        causal_lags.append(j)
        break
  return causal_topics, causal_lags

# Word-level Causality Modeling

In [None]:
# for each term (each column of word_freq), perform Pearson correlation and return r and significance value
def get_word_level_pearson(word_freq, time_series):
  """Pearson-correlate every term's frequency series with the external time series.

  word_freq: 2-D table (DataFrame or array) with one row per time step and
             one column per term.
  time_series: 1-D sequence with one value per row of ``word_freq``.
  returns: (pearson_matrix, pearson_sig_matrix) - two 1-D arrays with one
           entry per term holding the correlation coefficient and the
           p-value reported by ``scipy.stats.pearsonr``.
  """
  # Imported here because a plain ``import scipy`` does not guarantee that the
  # ``stats`` submodule has been loaded.
  from scipy import stats

  # Use the arguments rather than notebook globals, so the function works on
  # whatever table the caller passes in.
  freq = np.asarray(word_freq)
  target = np.asarray(time_series)
  n_terms = freq.shape[1]

  pearson_matrix = np.zeros(n_terms)
  pearson_sig_matrix = np.zeros(n_terms)
  for i in range(n_terms):
    r, sig = stats.pearsonr(freq[:, i], target)
    pearson_matrix[i] = r
    pearson_sig_matrix[i] = sig
  return pearson_matrix, pearson_sig_matrix

# Converts a list of significance values into the matching list of probabilities
# (formula at the bottom of page 3): each value's margin above the cutoff
# divided by the sum of all margins.
def get_prob_from_sig(sig, cutoff):
  margins = np.array(sig) - cutoff
  total_margin = margins.sum()
  return list(margins / total_margin)

def generate_topic_priors(plsa_model, num_top_terms, num_topics, silent=False,
                          word_r=None, word_sig=None, num_terms=None):
  ''' Build the term-topic prior from the significant top terms of every topic.

      input: plsa_model: plsa object from topic modelling (must provide top_topic_terms)
            num_top_terms: number of top terms being retrieved from plsa output
            num_topics: number of topics used in plsa
            silent: when true, suppress the diagnostic printing
            word_r, word_sig: per-term Pearson r and p-value arrays indexed by term index;
                              when omitted, the notebook globals word_level_pearson_matrix and
                              word_level_pearson_sig_matrix are used
            num_terms: number of columns of the prior; when omitted, doc_term_matrix.shape[1] is used
      output: a (num_topics x num_terms) matrix reflecting unified topic priors from all groups of
              significant terms (all zeros when no term is significant)
  '''
  # Fall back to the notebook-level globals so existing three/four-argument calls keep working.
  if word_r is None:
    word_r = word_level_pearson_matrix
  if word_sig is None:
    word_sig = word_level_pearson_sig_matrix
  if num_terms is None:
    num_terms = doc_term_matrix.shape[1]

  p_value_alpha = .05    # a term counts as significant when its p-value is below this
  sig_cutoff = .95       # baseline subtracted from (1 - p) when weighting terms
  min_group_ratio = .1   # a sign group must be at least 10% of the other group's size

  #get r and significance values for top terms in each topic
  top_topic_terms_indices = np.asarray(plsa_model.top_topic_terms(num_top_terms, normalized=True))
  top_topic_terms_r = np.asarray(word_r)[top_topic_terms_indices]
  top_topic_terms_sig = np.asarray(word_sig)[top_topic_terms_indices]

  np.set_printoptions(suppress=True)
  if not silent:
    print('Top topic terms:\n',top_topic_terms_indices)
    print('\nCorrelation r of top topic terms:\n',top_topic_terms_r)
    print('\nCorrelation significance value of top topic terms:\n', top_topic_terms_sig)

  # Accumulate all group priors directly into one matrix; starting from zeros
  # also guarantees a matrix is returned when nothing is significant.
  topic_prior = np.zeros((num_topics, num_terms))
  summary_rows = []  # collected as dicts and turned into a DataFrame once at the end

  def add_group_prior(topic, indices, sigs, other_group_size, tag, label):
    # Adds one sign group's probabilities to row `topic`, unless the group is
    # empty or smaller than 10% of the opposite-sign group.
    if len(indices) / max(other_group_size, 1) < min_group_ratio:
      return
    prob = get_prob_from_sig(sigs, sig_cutoff)
    if not silent:
      print(tag + ' indices', indices)
      print(tag + ' sig', sigs)
      print(tag + ' prob', prob)
    topic_prior[topic, indices] += prob
    if not silent:
      print('=> Added topic model for ' + label + ' terms')

  for i in range(num_topics): #loop through each topic
    if not silent:
      print('\nTopic '+str(i)+':')

    pos_indices, pos_sig = [], []
    neg_indices, neg_sig = [], []

    for j in range(num_top_terms): #loop through each term
      p_value = top_topic_terms_sig[i, j]
      if not p_value < p_value_alpha: # not significant (a NaN p-value also lands here)
        continue
      term_index = top_topic_terms_indices[i, j]
      r = top_topic_terms_r[i, j]
      #append term index and significance value to a list based on the correlation sign
      if r > 0:
        pos_indices.append(term_index)
        pos_sig.append(1 - p_value)
      else:
        neg_indices.append(term_index)
        neg_sig.append(1 - p_value)
      summary_rows.append({'topic': i, 'term_index': term_index, 'sig': 1 - p_value, 'r': r, 'r_sign': '+' if r > 0 else '-'})

    # Each group is compared against the size of the OTHER group.
    add_group_prior(i, pos_indices, pos_sig, len(neg_indices), 'pos', 'positive')
    add_group_prior(i, neg_indices, neg_sig, len(pos_indices), 'neg', 'negative')

  if not silent:
    significant_terms_df = pd.DataFrame(summary_rows, columns=['topic', 'term_index', 'sig', 'r', 'r_sign'])
    print('\nSignificant terms summary: \n', significant_terms_df)
  # Returned only after every topic has been processed.
  return topic_prior

# Prepare data

Get Text Data

In [None]:
# Tab-separated corpus file without a header row: column 0 holds each
# document's time entry, column 1 holds the document text.
data_path = 'datasets/corpus_may2000-oct2000_cleaned.txt'
text_data = pd.read_csv(data_path, sep='\t', header=None).values
time_entries = text_data[:, 0]
docs = text_data[:, 1]

Get Time Series Data

In [None]:
# Load the price tables for the Democratic and Republican contracts, indexed by their parsed 'Date' column.
time_series_matrix_Dem = pd.read_csv("datasets/May_Oct_Dem.csv", parse_dates=['Date'], index_col='Date')
time_series_matrix_Rep = pd.read_csv("datasets/May_Oct_Rep.csv", parse_dates=['Date'], index_col='Date')

# Democratic share of the combined average price; missing values are back-filled.
time_series_Dem = time_series_matrix_Dem['AvgPrice'].div(
  time_series_matrix_Rep['AvgPrice'] + time_series_matrix_Dem['AvgPrice']
).bfill()

# Republican share of the combined average price; missing values are back-filled.
time_series_Rep = time_series_matrix_Rep['AvgPrice'].div(
  time_series_matrix_Rep['AvgPrice'] + time_series_matrix_Dem['AvgPrice']
).bfill()

# Run ITMTF

In [46]:
# Iterative topic modelling with time-series feedback: each pass trains pLSA with
# the current prior, tests the topics for Granger causality, then rebuilds the
# prior from the significant top terms.
CB = build_corpus(docs, silent=True)
n_topics = 10
doc_term_matrix = CB.get_doc_term()
time_series_data = time_series_Dem

# Word-level statistics depend only on the corpus and the time series, not on
# the pLSA model, so they are computed once instead of on every iteration.
doc_term_date_sum_matrix=pd.DataFrame(doc_term_matrix, index=time_entries).groupby(level=0).sum()
word_level_pearson_matrix, word_level_pearson_sig_matrix=get_word_level_pearson(doc_term_date_sum_matrix, time_series_data)

num_top_terms=100
term_topic_prior = np.zeros((n_topics, doc_term_matrix.shape[1]))  # first pass runs with an all-zero prior
causal_topics = []
# NOTE(review): `iter` shadows the builtin of the same name; kept unchanged in
# case later cells read it - consider renaming once those are checked.
iter = 5
for it in range(iter):
  print("\n\n=> Iternation: ", it)
  mu_vec = np.zeros(n_topics)
  mu_vec[:] = 100
  plsa_model = run_plsa(CB, n_topics=n_topics, term_topic_prior=term_topic_prior, mu=mu_vec, n_iter=200, silent=False)
  topic_coverage = get_topic_coverage(time_entries, plsa_model.topic_doc)
  causal_topics, causal_lags = granger_test(time_series_data, topic_coverage)
  print("Causal Topics:, ", np.array(causal_topics)+1)  # +1 to match the 1-based topic numbers printed by run_plsa
  term_topic_prior = generate_topic_priors(plsa_model, num_top_terms, n_topics, silent=True)



=> Iternation:  0


Running pLSA

        Running numba pLSA algorithm
        Number of iterations: 200
        Number of documents: 3625 / 3625 before min_count (5)
        Number of terms: 6301 / 14916 before min_count (5)
        Number of topics: 10
        Sparsity factor: 0.01342
    

Ran 200 iterations in 10.524 seconds


Top topic terms
Topic 1: heston, giuliani, nra, hillary, lazio
Topic 2: wis, aflcio, teamster, ralph, nader
Topic 3: saving, elderly, trillion, surplus, medicare
Topic 4: conrad, donor, donation, gasoline, reno
Topic 5: vetting, edition, sarasota, mckinnon, coelho
Topic 6: dna, execute, inmate, penalty, death
Topic 7: nato, treaty, russia, troop, nuclear
Topic 8: danforth, scalia, plank, jew, naacp
Topic 9: hance, prince, castro, ranger, yale
Topic 10: oprah, karenna, schiff, kiss, vidal
Causal Topics:,  [ 2  4  5  7 10]


=> Iternation:  1


Running pLSA

        Running numba pLSA algorithm
        Number of iterations: 200
        Number of documents: 36