In [1]:
import isaid_helpers

In [3]:
with isaid_helpers.graph_driver.session(database=isaid_helpers.graphdb) as session:
    results = session.run("""
    MATCH (w:CreativeWork)
    WHERE NOT w.description IS NULL
    RETURN w.name, w.description
    LIMIT 10
    """)
    docs = results.data()

In [8]:
text_corpus = [doc["w.description"] for doc in docs]

In [18]:
text_corpus[1]

'The Queen Charlotte Fault defines the Pacific–North America transform plate boundary in western Canada and southeastern Alaska for c. 900 km. The entire length of the fault is submerged along a continental margin dominated by Quaternary glacial processes, yet the geomorphology along the margin has never been systematically examined due to the absence of high-resolution seafloor mapping data. Hence the geological processes that influence the distribution, character and timing of mass transport events and their associated hazards remain poorly understood. Here we develop a classification of the first-order shape of the continental shelf, slope and rise to examine potential relationships between form and process dominance. We found that the margin can be split into six geomorphic groups that vary smoothly from north to south between two basic end-members. The northernmost group (west of Chichagof Island, Alaska) is characterized by concave-upwards slope profiles, gentle slope gradients (

In [9]:
stoplist = set('for a of the and to in'.split(' '))
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
display(processed_corpus)

[['hawaiian',
  'volcanoes',
  'are',
  'accessible',
  'well',
  'by',
  'remain',
  'thermal',
  'satellite',
  'has',
  'useful',
  'activity',
  'between',
  'here',
  'we',
  'thermal',
  'remote',
  'sensing',
  'at',
  'us',
  'geological',
  'survey',
  'hawaiian',
  'volcano',
  'observatory',
  'whereas',
  'have',
  'been',
  'required',
  'rapid',
  'satellite',
  'data,',
  'we',
  'data',
  'sources',
  'on',
  'automated',
  'at',
  'observatory',
  'these',
  'data',
  'provide',
  'basic',
  'display',
  'satellite',
  'data',
  'have',
  'been',
  'useful',
  'monitoring',
  'ongoing',
  'lava',
  'flow',
  'activity',
  'on',
  'east',
  'rift',
  'zone',
  'at',
  'over',
  'past',
  'recent',
  'lava',
  'was',
  'from',
  'over',
  'satellite',
  'data',
  'advance',
  'flow',
  'hazard',
  'ongoing',
  'thermal',
  'remote',
  'sensing',
  'at',
  'automated',
  'rate',
  'lava',
  'flow',
  'as',
  'has',
  'been',
  'these',
  'be',
  'useful',
  'monitoring',


In [10]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(337 unique tokens: ['accessible', 'activity', 'advance', 'are', 'as']...)


In [20]:
print(len(dictionary.token2id))
display(dictionary.token2id)

337


{'accessible': 0,
 'activity': 1,
 'advance': 2,
 'are': 3,
 'as': 4,
 'at': 5,
 'automated': 6,
 'basic': 7,
 'be': 8,
 'been': 9,
 'between': 10,
 'by': 11,
 'data': 12,
 'data,': 13,
 'display': 14,
 'east': 15,
 'flow': 16,
 'from': 17,
 'future': 18,
 'geological': 19,
 'has': 20,
 'have': 21,
 'hawaiian': 22,
 'hazard': 23,
 'here': 24,
 'lava': 25,
 'mauna': 26,
 'monitoring': 27,
 'observatory': 28,
 'on': 29,
 'ongoing': 30,
 'over': 31,
 'past': 32,
 'provide': 33,
 'rapid': 34,
 'rate': 35,
 'recent': 36,
 'remain': 37,
 'remote': 38,
 'required': 39,
 'rift': 40,
 'satellite': 41,
 'sensing': 42,
 'sources': 43,
 'survey': 44,
 'thermal': 45,
 'these': 46,
 'us': 47,
 'useful': 48,
 'volcano': 49,
 'volcanoes': 50,
 'was': 51,
 'we': 52,
 'well': 53,
 'whereas': 54,
 'zone': 55,
 '(west': 56,
 '10': 57,
 'accumulation': 58,
 'alaska': 59,
 'all': 60,
 'along': 61,
 'america': 62,
 'an': 63,
 'area': 64,
 'areas': 65,
 'associated': 66,
 'boundary': 67,
 'but': 68,
 'bypass'

In [15]:
new_doc = "Satellite data"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(12, 1), (41, 1)]


In [16]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
display(bow_corpus)

[[(0, 1),
  (1, 3),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 4),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 3),
  (10, 1),
  (11, 1),
  (12, 4),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 2),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 3),
  (26, 1),
  (27, 2),
  (28, 2),
  (29, 3),
  (30, 2),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 1),
  (41, 4),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 3),
  (46, 2),
  (47, 1),
  (48, 3),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1)],
 [(1, 1),
  (3, 5),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 4),
  (11, 7),
  (14, 1),
  (17, 2),
  (19, 1),
  (20, 1),
  (24, 1),
  (29, 1),
  (37, 1),
  (46, 1),
  (52, 3),
  (56, 2),
  (57, 2),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 9),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 3),
  (75, 2),
  

In [19]:
from gensim import models

tfidf = models.TfidfModel(bow_corpus)

words = "Quaternary sediments".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(124, 0.7071067811865476), (131, 0.7071067811865476)]


In [21]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary.token2id))

In [24]:
query_document = 'satellite data'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

0 0.46604693
2 0.07612167
4 0.056788202
5 0.018751383
1 0.0
3 0.0
6 0.0
7 0.0
8 0.0
9 0.0
