In [5]:
#!pip install gensim
from gensim import corpora
documents = ["Our business may be impacted by disruptions including threats to physical security", 
              "information technology or cyber-attacks or failures",
              "damaging weather or other acts of nature and pandemics or other public health crises",
              "Any of these disruptions could affect our internal operations or our ability to deliver products and services to our customers",
              "Any significant production delays",
              "or any destruction", 
              "manipulation or improper use of our data", 
              "information systems or networks could impact our sales", 
              "increase our expenses and/or have an adverse effect on the reputation of Boeing and of our products and services"]
    

In [6]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 0] for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)

[['our',
  'business',
  'may',
  'be',
  'impacted',
  'by',
  'disruptions',
  'including',
  'threats',
  'physical',
  'security'],
 ['information', 'technology', 'or', 'cyber-attacks', 'or', 'failures'],
 ['damaging',
  'weather',
  'or',
  'other',
  'acts',
  'nature',
  'pandemics',
  'or',
  'other',
  'public',
  'health',
  'crises'],
 ['any',
  'these',
  'disruptions',
  'could',
  'affect',
  'our',
  'internal',
  'operations',
  'or',
  'our',
  'ability',
  'deliver',
  'products',
  'services',
  'our',
  'customers'],
 ['any', 'significant', 'production', 'delays'],
 ['or', 'any', 'destruction'],
 ['manipulation', 'or', 'improper', 'use', 'our', 'data'],
 ['information',
  'systems',
  'or',
  'networks',
  'could',
  'impact',
  'our',
  'sales'],
 ['increase',
  'our',
  'expenses',
  'and/or',
  'have',
  'an',
  'adverse',
  'effect',
  'on',
  'reputation',
  'boeing',
  'our',
  'products',
  'services']]


In [7]:
import os
import tempfile
TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

dictionary = corpora.Dictionary(texts)
dictionary.save(os.path.join(TEMP_FOLDER, '10Kdict.dict'))  # store the dictionary, for future reference
print(dictionary)

Folder "C:\Users\pragmi\AppData\Local\Temp" will be used to save temporary dictionary and corpus.
Dictionary(58 unique tokens: ['our', 'business', 'may', 'be', 'impacted']...)


In [8]:
print(dictionary.token2id)

{'our': 0, 'business': 1, 'may': 2, 'be': 3, 'impacted': 4, 'by': 5, 'disruptions': 6, 'including': 7, 'threats': 8, 'physical': 9, 'security': 10, 'information': 11, 'technology': 12, 'or': 13, 'cyber-attacks': 14, 'failures': 15, 'damaging': 16, 'weather': 17, 'other': 18, 'acts': 19, 'nature': 20, 'pandemics': 21, 'public': 22, 'health': 23, 'crises': 24, 'any': 25, 'these': 26, 'could': 27, 'affect': 28, 'internal': 29, 'operations': 30, 'ability': 31, 'deliver': 32, 'products': 33, 'services': 34, 'customers': 35, 'significant': 36, 'production': 37, 'delays': 38, 'destruction': 39, 'manipulation': 40, 'improper': 41, 'use': 42, 'data': 43, 'systems': 44, 'networks': 45, 'impact': 46, 'sales': 47, 'increase': 48, 'expenses': 49, 'and/or': 50, 'have': 51, 'an': 52, 'adverse': 53, 'effect': 54, 'on': 55, 'reputation': 56, 'boeing': 57}


In [65]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[]


In [9]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, '10Kdict.mm'), corpus)  # store to disk, for later use
for c in corpus:
    print(c)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]
[(11, 1), (12, 1), (13, 2), (14, 1), (15, 1)]
[(13, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
[(0, 3), (6, 1), (13, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)]
[(25, 1), (36, 1), (37, 1), (38, 1)]
[(13, 1), (25, 1), (39, 1)]
[(0, 1), (13, 1), (40, 1), (41, 1), (42, 1), (43, 1)]
[(0, 1), (11, 1), (13, 1), (27, 1), (44, 1), (45, 1), (46, 1), (47, 1)]
[(0, 2), (33, 1), (34, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]


In [10]:
from six import iteritems

# collect statistics about all tokens
#dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))

# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist 
            if stopword in dictionary.token2id]
#once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids) #+ once_ids)
print(dictionary)

Dictionary(58 unique tokens: ['our', 'business', 'may', 'be', 'impacted']...)


In [11]:
from gensim import corpora, models, similarities
if os.path.isfile(os.path.join(TEMP_FOLDER, '10Kdict.dict')):
    dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, '10Kdict.dict'))
    corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, '10Kdict.mm'))
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

Used files generated from first tutorial


In [12]:
print(dictionary[0])
print(dictionary[1])
print(dictionary[8])

our
business
threats


In [13]:
#Creating a transformation
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

In [14]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors
[(0, 0.70710678), (1, 0.70710678)]

[(0, 0.2584260910573898), (1, 0.9660310323487531)]


[(0, 0.70710678), (1, 0.70710678)]

In [15]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.08660992075313446), (1, 0.3237593805426314), (2, 0.3237593805426314), (3, 0.3237593805426314), (4, 0.3237593805426314), (5, 0.3237593805426314), (6, 0.22162466745152637), (7, 0.3237593805426314), (8, 0.3237593805426314), (9, 0.3237593805426314), (10, 0.3237593805426314)]
[(11, 0.3605413479900455), (12, 0.5266951771548322), (13, 0.1943875188252588), (14, 0.5266951771548322), (15, 0.5266951771548322)]
[(13, 0.10594182317454164), (16, 0.2870505661177798), (17, 0.2870505661177798), (18, 0.5741011322355596), (19, 0.2870505661177798), (20, 0.2870505661177798), (21, 0.2870505661177798), (22, 0.2870505661177798), (23, 0.2870505661177798), (24, 0.2870505661177798)]
[(0, 0.2563295230608981), (6, 0.2186390997412536), (13, 0.05894013592868689), (25, 0.15969896381256668), (26, 0.31939792762513336), (27, 0.2186390997412536), (28, 0.31939792762513336), (29, 0.31939792762513336), (30, 0.31939792762513336), (31, 0.31939792762513336), (32, 0.31939792762513336), (33, 0.2186390997412536), (34, 0.21

In [16]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=12) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [17]:
>>> lsi.print_topics(12)

[(0,
  '0.290*"any" + 0.287*"destruction" + 0.263*"our" + 0.228*"could" + 0.208*"information" + 0.188*"or" + 0.173*"services" + 0.173*"products" + 0.169*"networks" + 0.169*"systems"'),
 (1,
  '0.447*"destruction" + 0.386*"any" + 0.325*"significant" + 0.325*"production" + 0.325*"delays" + -0.246*"information" + -0.182*"cyber-attacks" + -0.182*"failures" + -0.182*"technology" + -0.177*"networks"'),
 (2,
  '-0.277*"information" + -0.274*"failures" + -0.274*"technology" + -0.274*"cyber-attacks" + -0.207*"destruction" + 0.169*"products" + 0.169*"services" + 0.160*"our" + -0.158*"or" + 0.147*"increase"'),
 (3,
  '0.488*"other" + 0.244*"acts" + 0.244*"public" + 0.244*"weather" + 0.244*"nature" + 0.244*"pandemics" + 0.244*"health" + 0.244*"crises" + 0.244*"damaging" + 0.227*"manipulation"'),
 (4,
  '-0.281*"be" + -0.281*"business" + -0.281*"by" + -0.281*"impacted" + -0.281*"may" + -0.281*"threats" + -0.281*"physical" + -0.281*"including" + -0.281*"security" + -0.189*"disruptions"'),
 (5,
  '-0

In [18]:
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)


[(0, 0.21978052539842333), (1, -0.070637698506108293), (2, 0.34772589965132267), (3, 0.01112144194259929), (4, -0.85702520204609556), (5, -0.16471982996154905), (6, -0.18858025462827568), (7, 0.047383128306834388), (8, -0.16197949202793632)]
[(0, 0.32512792981419669), (1, -0.38209122793903172), (2, -0.56386649153892265), (3, -0.11084852651448621), (4, -0.040746976365359427), (5, -0.00031507992569047296), (6, -0.58198175075497061), (7, -0.0168132532004115), (8, 0.2782435041877)]
[(0, 0.090675482756165521), (1, -0.029905847897680644), (2, -0.19281767877845457), (3, 0.85025047090377592), (4, -0.13689902436192539), (5, 0.44878168775463451), (6, 0.064992205987049509), (7, 0.079964509949774609), (8, -0.0037974650937407247)]
[(0, 0.64924603119477076), (1, -0.00064362883451782305), (2, 0.33845552617969421), (3, -0.045870790073969325), (4, 0.01598832281060434), (5, 0.10467439207796324), (6, 0.3053862458140279), (7, -0.13515300224532054), (8, 0.58230872925337518)]
[(0, 0.29656208201848316), (1, 

In [19]:
lsi.save(os.path.join(TEMP_FOLDER,'model.lsi')) # same for tfidf, lda, ...
#lsi = models.LsiModel.load(os.path.join(TEMP_FOLDER,'model.lsi'))

In [20]:
#similarity interface
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, '10Kdict.dict'))
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, '10Kdict.mm')) # comes from the first tutorial, "From strings to vectors"
print(corpus)


MmCorpus(9 documents, 58 features, 74 non-zero entries)


In [21]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=12)

In [22]:
doc = "cyber-attacks"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)
print(vec_bow)

[(0, 0.027062109946746283), (1, 0.086144816091390908), (2, 0.018456546533812696), (3, 0.052130539051521478), (4, 0.19761882344365286), (5, -0.18752045222910951), (6, -0.12667716922021502), (7, -0.2291945882141439), (8, 0.17638937194857221)]
[(14, 1)]


In [23]:
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

index.save(os.path.join(TEMP_FOLDER,'10Kdict.index'))
index = similarities.MatrixSimilarity.load(os.path.join(TEMP_FOLDER,'10Kdict.index'))

In [24]:
sims = index[vec_lsi] # perform a similarity query against the corpus
print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples
print(dictionary.token2id)


[(0, 0.0), (1, 0.82173038), (2, 1.1175871e-08), (3, 0.0), (4, 0.0), (5, -1.6763806e-08), (6, 0.0), (7, 3.7252903e-08), (8, -1.1175871e-08)]
{'our': 0, 'business': 1, 'may': 2, 'be': 3, 'impacted': 4, 'by': 5, 'disruptions': 6, 'including': 7, 'threats': 8, 'physical': 9, 'security': 10, 'information': 11, 'technology': 12, 'or': 13, 'cyber-attacks': 14, 'failures': 15, 'damaging': 16, 'weather': 17, 'other': 18, 'acts': 19, 'nature': 20, 'pandemics': 21, 'public': 22, 'health': 23, 'crises': 24, 'any': 25, 'these': 26, 'could': 27, 'affect': 28, 'internal': 29, 'operations': 30, 'ability': 31, 'deliver': 32, 'products': 33, 'services': 34, 'customers': 35, 'significant': 36, 'production': 37, 'delays': 38, 'destruction': 39, 'manipulation': 40, 'improper': 41, 'use': 42, 'data': 43, 'systems': 44, 'networks': 45, 'impact': 46, 'sales': 47, 'increase': 48, 'expenses': 49, 'and/or': 50, 'have': 51, 'an': 52, 'adverse': 53, 'effect': 54, 'on': 55, 'reputation': 56, 'boeing': 57}


In [25]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims) # print sorted (document number, similarity score) 2-tuples


[(1, 0.82173038), (7, 3.7252903e-08), (2, 1.1175871e-08), (0, 0.0), (3, 0.0), (4, 0.0), (6, 0.0), (8, -1.1175871e-08), (5, -1.6763806e-08)]


In [6]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)

ModuleNotFoundError: No module named 'pyLDAvis'

In [26]:
#LDA Model
from gensim.sklearn_api import LdaTransformer

In [27]:
from pprint import pprint  # pretty-printer
pprint(texts)

[['our',
  'business',
  'may',
  'be',
  'impacted',
  'by',
  'disruptions',
  'including',
  'threats',
  'physical',
  'security'],
 ['information', 'technology', 'or', 'cyber-attacks', 'or', 'failures'],
 ['damaging',
  'weather',
  'or',
  'other',
  'acts',
  'nature',
  'pandemics',
  'or',
  'other',
  'public',
  'health',
  'crises'],
 ['any',
  'these',
  'disruptions',
  'could',
  'affect',
  'our',
  'internal',
  'operations',
  'or',
  'our',
  'ability',
  'deliver',
  'products',
  'services',
  'our',
  'customers'],
 ['any', 'significant', 'production', 'delays'],
 ['or', 'any', 'destruction'],
 ['manipulation', 'or', 'improper', 'use', 'our', 'data'],
 ['information',
  'systems',
  'or',
  'networks',
  'could',
  'impact',
  'our',
  'sales'],
 ['increase',
  'our',
  'expenses',
  'and/or',
  'have',
  'an',
  'adverse',
  'effect',
  'on',
  'reputation',
  'boeing',
  'our',
  'products',
  'services']]


In [28]:
corpus = [dictionary.doc2bow(text) for text in texts]


In [29]:
model = LdaTransformer(num_topics=2, id2word=dictionary, iterations=20, random_state=1)
model.fit(corpus)
model.transform(corpus)


array([[ 0.9461804 ,  0.05381962],
       [ 0.41123676,  0.58876324],
       [ 0.04271816,  0.95728183],
       [ 0.96147501,  0.03852497],
       [ 0.86270857,  0.13729143],
       [ 0.23268875,  0.76731122],
       [ 0.12523969,  0.87476033],
       [ 0.93030161,  0.06969839],
       [ 0.96097314,  0.03902683]], dtype=float32)

In [40]:
#evaluate LSI model
#coloring words 
































sentense = "cyber-attacks"
#color_words(bad_lda,sentense)
#color_words(good_lda,sentense)
goodcm = 



































(model=goodLdaModel,texts=texts,dictionary=dictionary,coherence='c_v')
print(goodcm.get_coherence())
badcm = CoherenceModel(model=badLdaModel,texts=texts,dictionary=dictionary,coherence='c_v')
print(badcm.get_coherence())

NameError: name 'CoherenceModel' is not defined

In [30]:
#integration with sklearn
import numpy as np
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.datasets import fetch_20newsgroups
from gensim.sklearn_api.ldamodel import LdaTransformer

In [31]:
rand = np.random.mtrand.RandomState(1) # set seed for getting same result
cats = ['rec.sport.baseball', 'sci.crypt']
data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)

In [32]:
data_texts = [_.split() for _ in data.data]
id2word =  corpora.Dictionary(data_texts)
corpus = [id2word.doc2bow(i.split()) for i in data.data]

In [33]:
obj = LdaTransformer(id2word=id2word, num_topics=5, iterations=20)
lda = obj.fit(corpus)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn import linear_model

def print_features_pipe(clf, vocab, n=10):
    ''' Better printing for sorted list '''
    coef = clf.named_steps['classifier'].coef_[0]
    print(coef)
    print('Positive features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0])))
    print('Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0])))


In [35]:
id2word =  corpora.Dictionary([_.split() for _ in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]

In [36]:
model = LdaTransformer(num_topics=15, id2word=id2word, iterations=10, random_state=37)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model,), ('classifier', clf)])
pipe.fit(corpus, data.target)
print_features_pipe(pipe, id2word.values())

print(pipe.score(corpus, data.target))

[-0.13235784 -0.35209528  0.77119825  0.31151458 -0.28300901 -0.62038253
  0.03171907 -0.72783516  0.94230405 -0.30298003 -0.74538647 -0.24128422
  0.60770572  0.54598887  0.19823075]


TypeError: 'ValuesView' object does not support indexing

In [173]:
#summarize
import requests
from gensim.summarization import summarize
#text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text

f = open('C:\\DataScience\\SentimentAnalysis\\20160210_10-K_edgar_data_12927_0000012927-16-000099_1.txt', 'r')
text = f.read()
#print ('Input text:')
print (text)

<Header>
<FileStats>
    <FileName>20160210_10-K_edgar_data_12927_0000012927-16-000099_1.txt</FileName>
    <GrossFileSize>21300635</GrossFileSize>
    <NetFileSize>449344</NetFileSize>
    <ASCII_Embedded_Chars>1095269</ASCII_Embedded_Chars>
    <HTML_Chars>7163601</HTML_Chars>
    <XBRL_Chars>9206714</XBRL_Chars>
    <XML_Chars>3016161</XML_Chars>
    <N_Tables>140</N_Tables>
    <N_Exhibits>18</N_Exhibits>
</FileStats>
<SEC-Header>
0000012927-16-000099.hdr.sgml : 20160210
<ACCEPTANCE-DATETIME>20160210105734
ACCESSION NUMBER:		0000012927-16-000099
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		168
CONFORMED PERIOD OF REPORT:	20151231
FILED AS OF DATE:		20160210
DATE AS OF CHANGE:		20160210

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			BOEING CO
		CENTRAL INDEX KEY:			0000012927
		STANDARD INDUSTRIAL CLASSIFICATION:	AIRCRAFT [3721]
		IRS NUMBER:				910425694
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Ac

In [1]:
print ('Summary:')
print (summarize(text, split=True,word_count=100))

Summary:


NameError: name 'summarize' is not defined

In [2]:
from gensim.summarization import keywords
print ('Keywords:')
print (keywords(text))




Keywords:


NameError: name 'text' is not defined