# LDA model
## Uncleaned data on one committee

Dependencies:
* numpy
* pandas
* matplotlib
* gensim
* pyLDAvis

In [None]:
# Notebook author metadata.
__author__ = 'Eliana Ruby'

In [1]:
import os
import re
from pprint import pprint

import numpy as np
import pandas as pd

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# Ignore every DeprecationWarning raised from here on so that such warnings
# do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


  """
  """


In [2]:
# imports of our code
from hebrew_stopwords import hebrew_stopwords
from Parser import tokenize

#### Get data

In [3]:
# Committee whose protocols we want to model.
committee_name = u'המדע והטכנולוגיה'

# The "meta" table we built earlier.
meta = pd.read_csv('meta.csv')

# Collect the session ids that belong to this committee.
# The committee name appears both with and without the 'ועדת' prefix, so both
# spellings are matched. (CommitteeID would have been the natural key, but it
# wasn't one-to-one.)
committee_aliases = [committee_name, u'ועדת {}'.format(committee_name)]
in_committee = meta['committee_name'].isin(committee_aliases)
session_ids = meta.loc[in_committee, 'ID']
len(session_ids)

518

In [4]:
# Read the "Data" table (protocol parts with their protocol IDs) in chunks and
# keep only the rows whose ID belongs to our committee.
# The matching rows of each chunk are collected in a list and concatenated once
# at the end; calling pd.concat inside the loop would re-copy every row gathered
# so far on each iteration.
matching_chunks = []
for chunk in pd.read_csv('data.csv', chunksize=10000):
    matching_chunks.append(chunk[chunk.ID.isin(session_ids)])

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all.
data = pd.concat(matching_chunks) if matching_chunks else pd.DataFrame()
data.tail()

Unnamed: 0.1,Unnamed: 0,header,body,ID
6744331,6744331,ג׳ורדנה קטלר,אנחנו נרשום את ההכנסות שלנו מפרסום במדינת ישרא...,435006.0
6744332,6744332,"היו""ר אורי מקלב","אוקיי. ג'ורדנה, אני חייב לסיים. קודם כל תודה ר...",435006.0
6744333,6744333,ג׳ורדנה קטלר,זה בגלל חברת הכנסת סויד.,435006.0
6744334,6744334,רויטל סויד (המחנה הציוני),נכון.,435006.0
6744335,6744335,"היו""ר אורי מקלב",אני חושב שגם חברת הכנסת סויד חושבת אותו דבר. א...,435006.0


#### Clean data

In [9]:
# Drop unusable rows: protocol parts whose body text or header is empty.
def _is_nonempty(text):
    """Return True when `text` has at least one character."""
    return len(text) > 0


# Missing values become empty strings so the length test below covers them too.
data = data.fillna('')

# First keep rows with a body, then (among those) rows with a header.
body_mask = data['body'].apply(_is_nonempty)
with_body = data[body_mask]
header_mask = with_body['header'].apply(_is_nonempty)
f_data = with_body[header_mask]
len(f_data)

120307

In [None]:
## group protocol parts into one protocol. The topic model can be run on whole protocols instead 
## of parts, but without preprocessing the results aren't great.
# g_data = f_data.groupby('ID')['body'].apply(lambda x: '\n'.join(x)).reset_index()

In [None]:
# MORE PREPROCESSING SHOULD GO HERE!
# If we'd managed to use yap to lemmatize and filter by part of speech on the whole dataset, it would go here.

In [10]:
# Tokenize every protocol part: each body text is passed to tokenize() (imported
# from Parser.py, wrapped in a one-element list) and the result is stored as that
# part's token list. Stopword filtering relies on hebrew_stopwords.py / Parser.py.
preprocessed_data = []
for part_text in f_data.body:
    preprocessed_data.append(tokenize([part_text]))

In [11]:
# Two clean-up steps, done in a single pass over the tokenized parts:
#   1. remove the '-' token (it is in the stopword list, but still shows up here);
#   2. discard parts left with 10 tokens or fewer - very short parts are noisy
#      and hurt the model.
kept_parts = []
for part_tokens in preprocessed_data:
    without_dash = [token for token in part_tokens if token != '-']
    if len(without_dash) > 10:
        kept_parts.append(without_dash)
preprocessed_data = kept_parts

#### Create model

In [12]:
# Inputs for the LDA model.
# The tokenized protocol parts are the documents.
texts = preprocessed_data

# Dictionary built from every token that occurs in the documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one doc2bow() result per document.
corpus = list(map(id2word.doc2bow, texts))

In [13]:
# Train the LDA model.
# This can take a while to run. gensim does not print progress by itself;
# enabling logging would show it, but that is not set up here.
lda_params = dict(
    corpus=corpus,            # bag-of-words documents produced by id2word.doc2bow
    id2word=id2word,          # dictionary used to build the corpus
    num_topics=30,            # number of topics to look for
    random_state=20,          # fixed seed
    passes=10,                # passes over the corpus (more helps, up to a point)
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [27]:
# Save the trained model to disk.
temp_file = "models/model6"

# Create the target directory first: saving fails when 'models/' does not exist
# yet (e.g. on a fresh checkout).
model_dir = os.path.dirname(temp_file)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

lda_model.save(temp_file)
temp_file

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


'models/model6'

In [21]:
# Keep training the existing model: 10 more passes over the same corpus.
# NOTE(review): this continues training on the same lda_model object, so every
# re-run of this cell adds further passes. In file order the save cell sits above
# this one, so a top-to-bottom run saves the model before these extra passes -
# confirm that is intended.
lda_model.update(corpus, passes=10)

In [23]:
# Pretty-print the topic descriptions reported by the model.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

# Apply the model to the whole corpus (kept for later inspection).
doc_lda = lda_model[corpus]

[(7,
  '0.026*"נשים" + 0.015*"בירושלים" + 0.014*"פרס" + 0.012*"במגזר" + '
  '0.009*"החרדי" + 0.009*"הערבי" + 0.009*"מוזיאון" + 0.008*"מורה" + '
  '0.008*"צעירים" + 0.008*"מחשוב"'),
 (24,
  '0.037*"אדם" + 0.024*"מאגר" + 0.022*"ניסויים" + 0.021*"המאגר" + '
  '0.018*"רפואיים" + 0.017*"בבני" + 0.015*"הלסינקי" + 0.014*"העליונה" + '
  '0.012*"הביומטרי" + 0.010*"הגנטי"'),
 (2,
  '0.033*"הישיבה" + 0.024*"בשעה" + 0.015*"ננעלה" + 0.014*"תודה" + 0.013*"רבה" '
  '+ 0.009*"המחירים" + 0.008*"מודה" + 0.007*"מיקי" + 0.006*"פיילוט" + '
  '0.006*"להצעת"'),
 (21,
  '0.019*"המשפטי" + 0.013*"היועץ" + 0.011*"גיל" + 0.011*"אינטל" + '
  '0.011*"לפגוע" + 0.010*"קופת" + 0.009*"במחשב" + 0.009*"16" + 0.008*"מתייחס" '
  '+ 0.007*"הפגיעה"'),
 (11,
  '0.026*"אביב" + 0.020*"המקומיות" + 0.019*"הרשויות" + 0.017*"רשויות" + '
  '0.017*"המקומי" + 0.015*"תל" + 0.013*"השלטון" + 0.008*"בתל" + 0.008*"תעודות" '
  '+ 0.008*"לימודי"'),
 (1,
  '0.013*"לבית" + 0.013*"חופש" + 0.012*"זמנים" + 0.011*"המורים" + '
  '0.010*"שדיברנו" + 

In [22]:
# Topic coherence ('c_v' measure): a score for how much the topics make sense.
# The target used here is a value above 0.6; this model currently reaches 0.47.
coherence_model_ldamallet = CoherenceModel(
    model=lda_model,
    texts=preprocessed_data,
    dictionary=id2word,
    coherence='c_v',
)
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
coherence_ldamallet

0.47272110136549406

#### Visualization

In [24]:
# Interactive topic visualization with pyLDAvis.
# Preparing it can take a while, but the result is worth it; it can also be
# saved as an HTML document.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
# Export the prepared visualization to an HTML file.
vis_html_path = 'models/this_one_works.html'

# Create the target directory first: writing fails when 'models/' does not exist.
vis_html_dir = os.path.dirname(vis_html_path)
if vis_html_dir and not os.path.isdir(vis_html_dir):
    os.makedirs(vis_html_dir)

pyLDAvis.save_html(vis, vis_html_path)