In [2]:
import pandas as pd

data = pd.read_excel('data/Nova_Simon_Hibbard2.xlsx', error_bad_lines=False);
data_text = data[['Summary']]
documents = data_text

In [3]:
len(documents)

16565

In [4]:
documents[:5]

Unnamed: 0,Summary
0,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2,ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3,ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4,ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...


In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fayikanova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [8]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [9]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [10]:
processed_docs = documents['Summary'].astype(str).map(preprocess)

In [11]:
processed_docs[:10]

0    [english, translat, messag, truth, syria, shei...
1    [english, translat, sheikh, fatih, jawlani, pe...
2    [english, translat, audio, meet, sheikh, fatih...
3    [english, translat, sheikh, nasir, wuhayshi, l...
4    [english, translat, aqap, respons, sheikh, bag...
5    [second, clip, seri, soldier, video, link, htt...
6    [english, transcript, murabit, http, hujlj, kg...
7    [english, translat, collect, word, lama, dawla...
8    [aslm, share, account, previous, suspend, khal...
9    [english, translat, aqap, statement, bless, ra...
Name: Summary, dtype: object

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 bzcscxzq
1 english
2 http
3 maqdisi
4 messag
5 muham
6 sheikh
7 syria
8 translat
9 truth
10 xfszsjvr


In [14]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(219, 1), (464, 1), (623, 1), (915, 1), (1066, 1), (1148, 1), (1420, 1)]

In [16]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 219 ("die") appears 1 time.
Word 464 ("bomb") appears 1 time.
Word 623 ("cluster") appears 1 time.
Word 915 ("wayf") appears 1 time.
Word 1066 ("explod") appears 1 time.
Word 1148 ("student") appears 1 time.
Word 1420 ("rer__") appears 1 time.


In [17]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)


In [18]:
corpus_tfidf = tfidf[bow_corpus]

In [19]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.34493054413034635),
 (1, 0.497045147662217),
 (2, 0.4168438606487528),
 (3, 0.3508802280734451),
 (4, 0.2820665771118998),
 (5, 0.1376582816633768),
 (6, 0.3521443313090525),
 (7, 0.33849123443705564)]


In [20]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)


In [21]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"isi" + 0.021*"kill" + 0.020*"armi" + 0.017*"amaqag" + 0.016*"syria" + 0.016*"break" + 0.015*"iraqi" + 0.015*"near" + 0.014*"islamicst" + 0.013*"ramadi"
Topic: 1 
Words: 0.029*"isi" + 0.019*"kill" + 0.014*"aleppo" + 0.012*"muslim" + 0.012*"rebel" + 0.012*"attack" + 0.010*"syrian" + 0.008*"syria" + 0.008*"break" + 0.008*"say"
Topic: 2 
Words: 0.024*"scotsmaninfidel" + 0.018*"spicylatt" + 0.017*"sassysassyr" + 0.016*"kafirkati" + 0.014*"isi" + 0.012*"syria" + 0.012*"islam" + 0.011*"fight" + 0.010*"kill" + 0.008*"assad"


In [22]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [23]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"like" + 0.008*"isi" + 0.007*"kill" + 0.007*"syria" + 0.007*"ramiallolah" + 0.005*"iraq" + 0.005*"today" + 0.005*"bomb" + 0.005*"assad" + 0.005*"akhi"
Topic: 1 Word: 0.011*"islam" + 0.010*"state" + 0.008*"want" + 0.008*"syria" + 0.007*"isi" + 0.007*"media" + 0.006*"say" + 0.006*"fight" + 0.006*"assad" + 0.006*"http"
Topic: 2 Word: 0.011*"kill" + 0.011*"iraqi" + 0.010*"armi" + 0.010*"isi" + 0.009*"allah" + 0.009*"islam" + 0.008*"soldier" + 0.007*"state" + 0.007*"attack" + 0.007*"destroy"
Topic: 3 Word: 0.017*"isi" + 0.012*"syria" + 0.008*"kill" + 0.007*"assad" + 0.007*"islam" + 0.007*"palmyra" + 0.007*"near" + 0.007*"iraq" + 0.006*"ramiallolah" + 0.006*"soldier"
Topic: 4 Word: 0.009*"kill" + 0.008*"assad" + 0.008*"isi" + 0.008*"islam" + 0.007*"muslim" + 0.007*"syria" + 0.007*"break" + 0.006*"state" + 0.006*"today" + 0.006*"aleppo"
Topic: 5 Word: 0.013*"muslim" + 0.010*"khair" + 0.007*"syria" + 0.007*"read" + 0.007*"isi" + 0.007*"fight" + 0.006*"jazakallah" + 0.006*"

In [24]:
processed_docs[4310]

['wayf', 'rer__', 'student', 'die', 'cluster', 'bomb', 'explod']

In [25]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))



Score: 0.8928377628326416	 
Topic: 0.022*"isi" + 0.021*"kill" + 0.020*"armi" + 0.017*"amaqag" + 0.016*"syria" + 0.016*"break" + 0.015*"iraqi" + 0.015*"near" + 0.014*"islamicst" + 0.013*"ramadi"

Score: 0.0551222562789917	 
Topic: 0.024*"scotsmaninfidel" + 0.018*"spicylatt" + 0.017*"sassysassyr" + 0.016*"kafirkati" + 0.014*"isi" + 0.012*"syria" + 0.012*"islam" + 0.011*"fight" + 0.010*"kill" + 0.008*"assad"

Score: 0.052039943635463715	 
Topic: 0.029*"isi" + 0.019*"kill" + 0.014*"aleppo" + 0.012*"muslim" + 0.012*"rebel" + 0.012*"attack" + 0.010*"syrian" + 0.008*"syria" + 0.008*"break" + 0.008*"say"


In [26]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6945732831954956	 
Topic: 0.012*"kill" + 0.012*"isi" + 0.011*"iraq" + 0.010*"armi" + 0.009*"support" + 0.007*"pour" + 0.007*"soldier" + 0.007*"report" + 0.006*"attack" + 0.006*"syria"

Score: 0.2053760141134262	 
Topic: 0.013*"follow" + 0.012*"islamicst" + 0.011*"break" + 0.009*"syria" + 0.008*"amaqag" + 0.008*"isi" + 0.008*"allah" + 0.007*"support" + 0.006*"caliphate_new" + 0.005*"http"

Score: 0.012508618645370007	 
Topic: 0.017*"isi" + 0.012*"syria" + 0.008*"kill" + 0.007*"assad" + 0.007*"islam" + 0.007*"palmyra" + 0.007*"near" + 0.007*"iraq" + 0.006*"ramiallolah" + 0.006*"soldier"

Score: 0.012508576735854149	 
Topic: 0.011*"syria" + 0.011*"isi" + 0.008*"kill" + 0.008*"news" + 0.006*"attack" + 0.006*"good" + 0.006*"armi" + 0.006*"rebel" + 0.005*"iraq" + 0.005*"know"

Score: 0.012507087551057339	 
Topic: 0.009*"like" + 0.008*"isi" + 0.007*"kill" + 0.007*"syria" + 0.007*"ramiallolah" + 0.005*"iraq" + 0.005*"today" + 0.005*"bomb" + 0.005*"assad" + 0.005*"akhi"

Score: 0.0125