In [None]:
# Reload edited local modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import random
from pathlib import Path
from time import time
from datetime import datetime
from tqdm import tqdm, tnrange, tqdm_notebook
from pprint import pprint
from warnings import warn
from datetime import datetime
import itertools
from collections import Counter, OrderedDict
from bidi import algorithm as bidi
from hebrew_stopwords import hebrew_stopwords
import multiprocessing as mp
import json
from tagger import lemmatize

# Perform lemmatization over all the topics from data_committees_kns_committeesession_kns_committeesession.csv

In [None]:
# Use a 13pt default font for every figure drawn in this notebook.
mpl.rcParams['font.size'] = 13

### Read the csv containing CommitteeSessionID and topic

In [None]:
%%time
DATA = Path('../data')
df_topic = pd.read_csv(DATA/'data_committees_kns_committeesession_kns_committeesession.csv')

CPU times: user 984 ms, sys: 50 ms, total: 1.03 s
Wall time: 1.04 s


In [None]:
# Preview the first rows of the loaded session table.
df_topic.head()

Unnamed: 0,CommitteeSessionID,Number,KnessetNum,TypeID,TypeDesc,CommitteeID,Location,SessionUrl,BroadcastUrl,StartDate,...,download_filename,download_filesize,parts_crc32c,parts_filesize,parts_parsed_filename,text_crc32c,text_filesize,text_parsed_filename,topics,committee_name
0,64990,,15,161,פתוחה,25,"חדר הוועדה, באגף קדמה, קומה 1, חדר 1720",http://main.knesset.gov.il/Activity/committees...,,2002-06-12 09:00:00,...,,,,,,,,,"[""חוק הבחירות לכנסת (תיקון מס' 52), התשס\""ד-20...","החוקה, חוק ומשפט"
1,470756,,18,161,פתוחה,661,"חדר הוועדה, באגף הוועדות (קדמה), קומה 2, חדר 2740",http://main.knesset.gov.il/Activity/committees...,,2012-06-06 09:30:00,...,,,,,,,,,"[""דו\""ח מבקר המדינה על ההתמודדות עם המשט הטורק...",לענייני ביקורת המדינה
2,470814,,18,161,פתוחה,668,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3710",http://main.knesset.gov.il/Activity/committees...,http://main.knesset.gov.il/Activity/committees...,2012-06-06 09:30:00,...,,,,,,,,,"[""פניות ציבור בנוגע לתוכנית \""מנוחה בכבוד\"" של...",לפניות הציבור
3,471255,,18,161,פתוחה,660,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3730",http://main.knesset.gov.il/Activity/committees...,,2012-06-13 09:30:00,...,,,,,,,,,"[""תקנות אגרות בריאות (תיקון), התשע\""ב - 2012"",...","העבודה, הרווחה והבריאות"
4,471661,,18,161,פתוחה,660,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3730",http://main.knesset.gov.il/Activity/committees...,,2012-06-20 09:00:00,...,,,,,,,,,"[""מעמדם של עובדי המכון למחקר ביולוגי בישראל""]","העבודה, הרווחה והבריאות"


### list of topics 

In [None]:
def preprocess_topics(topic_item):
    """Flatten one raw `topics` cell into a single space-separated string.

    Parameters
    ----------
    topic_item : object
        A value taken from the `topics` column. Usable values are strings
        holding a JSON array of topic titles, e.g. '["title a", "title b"]'.
        Anything else (NaN, None, other text) is treated as "no topics".

    Returns
    -------
    str
        The titles joined by single spaces, or "" when the value is missing,
        does not look like a JSON array, or cannot be parsed as JSON.
    """
    text = str(topic_item)
    if not text.startswith("["):
        return ""
    try:
        titles = json.loads(text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. One malformed cell must
        # not abort the processing of the whole column.
        return ""
    # Skip JSON nulls and stringify any non-string entry so join() cannot fail.
    return " ".join(str(title) for title in titles if title is not None)
    


In [None]:
# One flattened topic string per row of df_topic, in row order.
topic_list = list(map(preprocess_topics, df_topic.topics))

In [None]:
# Preview the first five flattened topic strings.
topic_list[:5]

['חוק הבחירות לכנסת (תיקון מס\' 52), התשס"ד-2004',
 'דו"ח מבקר המדינה על ההתמודדות עם המשט הטורקי ',
 'פניות ציבור בנוגע לתוכנית "מנוחה בכבוד" של משרד הבריאות',
 'תקנות אגרות בריאות (תיקון), התשע"ב - 2012 תקנות הרוקחים (תנאי ייצור נאותים לתכשירים)(תיקון), התשע"ב - 2012 תקנות הרוקחים (תכשירים)(תיקון), התשע"ב - 2012',
 'מעמדם של עובדי המכון למחקר ביולוגי בישראל']

## Run lemmatization using YAP in tagger.py - multithreaded

In [None]:
# Number of worker threads started for the lemmatization run.
NUM_OF_THREADS = 40

In [None]:
import threading

# Index of the next entry of `topic_list` to hand out. It is shared by all
# workers and is only read or advanced while holding their common lock.
tidx = 0
# One slot per topic; a slot stays None until its lemmatization has finished.
results = [None]*len(topic_list)

class Runner(threading.Thread):
    """Worker thread that lemmatizes entries of `topic_list` one at a time.

    Each worker repeatedly claims the next unprocessed index from the shared
    counter `tidx`, calls `lemmatize` on that topic string and stores the
    outcome in `results` at the same index. All workers must be given the
    same `lock`, which protects both `tidx` and `results`.
    """
    def __init__(self, lock):
        self.lock = lock
        threading.Thread.__init__(self)

    def run(self):
        """Process topics until the shared counter reaches the end of the list."""
        global tidx

        while True:
            # The bounds check and the claim happen in ONE critical section.
            # Checking outside the lock (and with `<=`) let workers claim
            # index len(topic_list) or beyond and die with an IndexError.
            with self.lock:
                if tidx >= len(topic_list):
                    return
                run_on = tidx
                tidx += 1
                print("Increasing to " + str(tidx))

            # The slow call runs outside the lock so workers overlap.
            res = lemmatize(topic_list[run_on])
            with self.lock:
                results[run_on] = res
             

In [None]:
# A single lock shared by all workers guards the work counter and `results`.
lock = threading.Lock()
runners = [Runner(lock) for _ in range(NUM_OF_THREADS)]
for r in runners:
    r.start()
# Block until every worker has finished, so that the following cells (and a
# "Restart & Run All") never read or pickle a partially filled `results`.
for r in runners:
    r.join()

### wait for all workers to stop (check that results is full)

In [None]:
# Last input topic string, to compare against the last result below.
topic_list[-1]

In [None]:
# Last result slot; it is still None if that topic has not been lemmatized.
results[-1]

## Save these intermediate results

     results is built as a list of dicts (member per CommitteeSessionID)
     each dict contains a list of words per part of speech
     e.g 
     {'JJ': [],                 # Adjectives
         'CD': ['2019'],        # numbers 
         'NN': ['שינוי', 'תקציב'], # nouns 
         'NNT': ['שינה'],        # verb name 
         'NNP': [],             # entity name (e.g "מיכאל")
         'VB': []}              # verbs

In [None]:
# Pair every session id with the lemma dict computed for its topics.
# `results` follows the row order of df_topic, so the two columns line up.
CommitteeSessionID = df_topic['CommitteeSessionID'].values
merged_df = pd.DataFrame(
    dict(CommitteeSessionID=CommitteeSessionID, lemmas=results)
)
merged_df.head()

In [None]:
# Persist the session-id / lemmas table so the LDA part can start from disk.
with open('./topic_lemmas_df.pkl','wb') as pickle_file:
    pickle.dump(merged_df, pickle_file)

# Perform LDA on lemmatized topics
The purpose of this code is to create an LDA-Model-Object (using gensim library to do so). The LDA-Model is trained on the names of the sessions, as depicted in the column 'topics' in the file 'data_committees_kns_committeesession_kns_committeesession.csv'.

In [None]:
# imports
import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


import time

## Load data if starting from precomputed results 
The following code does the following:
1.   Uses pandas to read "topic_lemmas_df.pkl". This file holds a DataFrame with one row per CommitteeSessionID, whose 'lemmas' column contains a dictionary describing the session title in its lemmatized form (produced with YAP). The keys of the dictionary are parts of speech, and the values are the lemmatized words of the title that belong to each part of speech. For example, if the title is "בשנת 2002 דן קנה חלב", then the dictionary is:
{'CD': ['2002'],
 'JJ': [],
 'NN': ['שנה', 'חלב'],
 'NNP': ['דן'],
 'NNT': [],
 'VB': ['קנה']}
 The idea is basically to sort the different words according to their part of speech.
2.   After loading the pickle and replacing None values with empty dictionaries, the lemmatized title is defined as the words of all parts of speech except 'CD' (words that belong to 'CD' are numbers).




In [None]:
# Reload the session-id / lemmas table pickled by the lemmatization part.
# NOTE(review): unpickling runs arbitrary code; only load files you produced.
topic_lemmas_df = pd.read_pickle('topic_lemmas_df.pkl')

## create list of tokens from lemma dicts

In [None]:
def lemmatize_title_func(dic):
    """Concatenate a title's lemmas from every part of speech except 'CD'."""
    return dic['JJ'] + dic['NN'] + dic['NNP'] + dic['NNT'] + dic['VB']

# Sessions whose lemmatization is missing (None) get an all-empty lemma dict.
topic_lemas_list = [
    {'JJ':[], 'NN':[], 'NNP':[], 'NNT':[], 'VB':[]} if lemmas is None else lemmas
    for lemmas in topic_lemmas_df.lemmas.values
]
lemmatized_titles = list(map(lemmatize_title_func, topic_lemas_list))

In [None]:
# Preview the flattened lemma lists of the first five titles.
lemmatized_titles[:5]

[['בחירה', 'מס', 'כנסת', 'תשס', 'חוק', 'תיקון'],
 ['תורכי', 'מדינה', 'התמודדות', 'משט', 'ח', 'מבקר'],
 ['ציבור', 'תוכנית', 'מנוחה', 'כבוד', 'בריאות', 'פנייה', 'משרד'],
 ['בריא',
  'נאות',
  'אגרה',
  'תיקון',
  'רוקח',
  'ייצור',
  'תכשיר',
  'תיקון',
  'רוקח',
  'תכשיר',
  'תיקון',
  'ב',
  'ב',
  'ב',
  'תקנה',
  'תקנה',
  'תנאי',
  'תקנה'],
 ['ביולוגי', 'מעמד', 'מכון', 'מחקר', 'ישראל', 'עובד']]

## Tokenize and remove hebrew_stopwords

In [None]:
# Imports from github/ido90 source
from hebrew_stopwords import hebrew_stopwords
from Parser import tokenize
def is_stopword(word):
    """True when the word appears in the imported `hebrew_stopwords`."""
    return word in hebrew_stopwords

def is_jibbrish(word):
    """True for tokens shorter than three characters."""
    return len(word) < 3

def is_a_year(word):
    """True for the three tokens this notebook treats as year remnants."""
    return word in ("תשח", "תשע", "תשס")

# Every predicate here marks a word for removal.
tests = [is_stopword, is_jibbrish, is_a_year]

def filter_word(word):
    """Keep a word only when none of the predicates in `tests` flags it."""
    return not any([test(word) for test in tests])

tokenized_titles = [
    [word for word in title if filter_word(word)]
    for title in lemmatized_titles
]

## Create corpus & TfidfModel

    Create id2word - a Dictionary mapping word IDs to words
    Create corpus - for each title, a list of (word ID, number of occurrences) tuples

In [None]:
# Vocabulary built from the filtered titles (token <-> integer id).
titles = tokenized_titles
id2word = corpora.Dictionary(titles)

# Bag-of-words corpus: one list of (token id, count) pairs per title.
corpus = list(map(id2word.doc2bow, titles))

In [None]:
# Preview the bag-of-words encoding of the first two titles.
corpus[:2]

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]

## Calculate LDA
The following code receives 'num_topics', trains an LDA-Model such that the number of clusters (of titles) is num_topics. Then, topic coherence score is calculated to see how good of a guess num_topics is.

In [None]:
def calc_model_and_score(num_topics):
    """Train one LDA model on the notebook corpus and score its coherence.

    Reads the module-level `corpus`, `id2word` and `titles`.

    Parameters
    ----------
    num_topics : int
        Number of topics the LDA model is trained with.

    Returns
    -------
    tuple
        (trained LdaModel, its 'c_v' coherence value).
    """
    lda_params = dict(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=100,   # fixed seed passed to every model
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    model = gensim.models.ldamodel.LdaModel(**lda_params)

    scorer = CoherenceModel(model=model, texts=titles, dictionary=id2word, coherence='c_v')
    return model, scorer.get_coherence()

The following code trains several LDA models, to see which number of topics produces the highest topic-coherence value.

In [None]:
# Candidate topic counts; one LDA model is trained and scored for each.
LDA_NUM_OF_TOPICS_LIST = [20, 50, 80, 100, 140, 170, 200]

In [None]:
import threading

inps = LDA_NUM_OF_TOPICS_LIST
# One slot per candidate topic count; a slot stays None until its worker ends.
results = [None]*len(inps)

class Runner(threading.Thread):
    """Thread that trains and scores one LDA model.

    The (model, coherence) pair returned by `calc_model_and_score(num)` is
    stored in `results[i]` while holding `lock`.
    """
    def __init__(self, lock, i, num):
        super().__init__()
        self.lock, self.i, self.num = lock, i, num

    def run(self):
        outcome = calc_model_and_score(self.num)
        with self.lock:
            results[self.i] = outcome
             

## This section runs the LDA multithreaded over LDA_NUM_OF_TOPICS_LIST
    Check completion by printing results and verifying that it is full, with no None entries

In [None]:
# One worker per candidate topic count, all sharing a single lock.
lock = threading.Lock()
runners = [Runner(lock, i, inps[i]) for i in range(len(inps))]

for r in runners:
    r.start()

# Block until every model has been trained and scored, so that the following
# cells never print or pickle a `results` list that still contains None.
for r in runners:
    r.join()

In [None]:
# A None entry means the model for that topic count has not been stored.
print(results)

## Save intermediate results to pickle

In [None]:
# Key each (model, coherence) pair by its position in LDA_NUM_OF_TOPICS_LIST.
data = {i: results[i] for i in range(len(LDA_NUM_OF_TOPICS_LIST))}
# One row per candidate topic count.
lda_results_df = pd.DataFrame.from_dict(data, orient='index')
lda_results_df.to_pickle('results_LDA.pkl')

In [None]:
# Reload the pickled LDA results (entry point when starting from disk).
# NOTE(review): unpickling runs arbitrary code; only load files you produced.
with open('results_LDA.pkl', 'rb') as pickle_file:
    lda_results_df = pickle.load(pickle_file)

## Extract single model with certain topic numbers (this case 100)

In [None]:
# Row 3 matches the fourth entry of LDA_NUM_OF_TOPICS_LIST (100 topics);
# column 0 holds the trained model, column 1 its coherence score.
lda_model = lda_results_df.iloc[3,0]

### Example: extract the topics for a certain entry in the corpus (a certain title)

In [None]:
# Topics assigned to the first title, shown as (topic id, weight) pairs.
lda_model.get_document_topics(corpus[0])

[(52, 0.402), (72, 0.402)]

## Print the top 10 keywords of each listed topic


In [None]:
# Show each listed topic as a weighted combination of its top keywords.
pprint(lda_model.print_topics())
# Apply the model to the whole corpus.
# NOTE(review): doc_lda is not used by any later cell shown here.
doc_lda = lda_model[corpus]

[(55,
  '0.475*"כנס" + 0.149*"שיעור" + 0.079*"הבטחה" + 0.067*"ראשי" + 0.053*"רבנות" '
  '+ 0.030*"סכם" + 0.029*"מיזוג" + 0.018*"פנימיות" + 0.018*"אימוץ" + '
  '0.017*"אמנות"'),
 (42,
  '0.336*"הסדרה" + 0.109*"מילואים" + 0.106*"עיסוק" + 0.101*"חוק" + '
  '0.072*"פיקדון" + 0.059*"תימן" + 0.028*"על-ידי" + 0.025*"מתחם" + '
  '0.023*"מקצוע" + 0.022*"משקה"'),
 (75,
  '0.420*"עבודה" + 0.107*"מזון" + 0.105*"בריאות" + 0.090*"בטיחות" + '
  '0.072*"הפרטה" + 0.058*"כלי" + 0.047*"זיהום" + 0.026*"ניעה" + 0.025*"דחוף" + '
  '0.020*"[נוסח"'),
 (59,
  '0.217*"מיוחד" + 0.143*"דיווח" + 0.133*"בריאות" + 0.080*"ניהול" + '
  '0.069*"רפורמה" + 0.062*"ביקורת" + 0.052*"העסקה" + 0.042*"שדה" + '
  '0.042*"פרויקט" + 0.040*"תעופה"'),
 (52,
  '0.425*"חוק" + 0.305*"תיקון" + 0.241*"הצעה" + 0.011*"עונש" + 0.006*"צבאי" + '
  '0.004*"שיפוט" + 0.003*"הפצה" + 0.001*"מצוינות" + 0.001*"מרכיב" + '
  '0.001*"לוויין"'),
 (26,
  '0.260*"מים" + 0.193*"תאגיד" + 0.093*"ביוב" + 0.087*"חינוכי" + 0.066*"הצלה" '
  '+ 0.050*"יצר" + 0.0

## PCA visualization

In [None]:
# Confirm which model (by topic count) is being visualized.
print(lda_model.num_topics)
pyLDAvis.enable_notebook()
# Build the pyLDAvis view from the model, the corpus and the vocabulary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

## Export pickle of CommitteeSessionID to topics for use in Parliament member analysis

In [None]:
# Attach to every session the topics the selected model assigns to its
# bag-of-words. `corpus` follows the row order of topic_lemmas_df.
topic_lemmas_df['topics'] = list(map(lda_model.get_document_topics, corpus))
topic_lemmas_df.head()

In [None]:
# Persist the session table together with its lemmas and assigned topics.
topic_lemmas_df.to_pickle('id_to_lemma_LDA.pkl')