In [1]:
import re
import solr
import json
import spacy
from unidecode import unidecode

## Connect to Solr

In [2]:
# URL of the Solr core that holds the blog-post index.
solr_expertv_url = 'http://holbox.lti.cs.cmu.edu:8983/solr/expertv'
# Connection object shared by the indexing and query cells below.
solr_expertv = solr.SolrConnection(solr_expertv_url)

## Preprocess data

The following are extracted from each piece of text:
1. LOCATION named entities
1. DATETIME named entities
1. PERSON named entities
1. VB (verb) lemmas
1. Other named entities found

In [3]:
def extract_meta(nlp, text):
    """Extract verb lemmas and grouped named entities from ``text``.

    Parameters
    ----------
    nlp : callable
        A loaded spaCy pipeline. It is called as ``nlp(text)`` and must
        return a document that iterates over tokens exposing ``pos_`` and
        ``lemma_``, and that has an ``ents`` sequence whose items expose
        ``label_`` and ``text``.
    text : unicode
        The raw text to analyse.

    Returns
    -------
    dict
        Five comma-joined strings (empty when nothing was found):

        * ``'VERB'``   -- lemmas of all tokens tagged as verbs
        * ``'LOC'``    -- entities labelled GPE, LOC, ORG, FACILITY or FAC
        * ``'DT'``     -- entities labelled DATE or TIME
        * ``'PERSON'`` -- entities labelled PERSON
        * ``'NER'``    -- every other entity
    """
    # 'FAC' is the facility label used by newer spaCy models; older ones
    # emit 'FACILITY'. Accepting both keeps the grouping stable across versions.
    location_labels = (u'GPE', u'LOC', u'ORG', u'FACILITY', u'FAC')
    datetime_labels = (u'DATE', u'TIME')
    person_labels = (u'PERSON',)

    doc = nlp(text)

    meta = {}

    # ``pos_`` is the string form of the coarse POS tag; ``pos`` is an
    # integer id and would never compare equal to u'VERB'.
    verbs = [word.lemma_ for word in doc if word.pos_ == u'VERB']
    meta['VERB'] = ','.join(verbs)

    loc = []
    dt = []
    person = []
    other = []
    for ent in doc.ents:
        if ent.label_ in location_labels:
            loc.append(ent.text)
        elif ent.label_ in datetime_labels:
            dt.append(ent.text)
        elif ent.label_ in person_labels:
            person.append(ent.text)
        else:
            other.append(ent.text)

    meta['LOC'] = ','.join(loc)
    meta['DT'] = ','.join(dt)
    meta['PERSON'] = ','.join(person)
    meta['NER'] = ','.join(other)

    return meta

In [4]:
# Load the English spaCy pipeline once; it is passed to extract_meta
# for every text segment in the indexing cell below.
nlp = spacy.load('en')

In [5]:
# Load the scraped blog posts. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the whole
# kernel session.
with open('expertvagabond_blog_posts.json') as blog_file:
    blog_data = json.load(blog_file)

In [7]:
# Index the data: one Solr document per text segment of every blog entry.
for entry in blog_data:
    # textpos is the 1-based position of the segment within its entry.
    for entry_text_pos, entry_text in enumerate(entry['text'], 1):
        text_type, text = entry_text[0], entry_text[1]
        meta = extract_meta(nlp, text)
        try:
            solr_expertv.add(url=entry['entry_url'],
                             title=entry['title'],
                             date='',
                             texttype=text_type,
                             text=text,
                             textpos=entry_text_pos,
                             LOC=meta['LOC'],
                             DT=meta['DT'],
                             VB=meta['VERB'],
                             PERSON=meta['PERSON'],
                             NE=meta['NER']
                            )
        except Exception as e:
            # Best effort: report the failure and continue with the next
            # segment rather than aborting the whole indexing run.
            print(e)

# Commit once after all documents have been added.
solr_expertv.commit()

'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n<lst name="responseHeader"><int name="status">0</int><int name="QTime">680</int></lst>\n</response>\n'

## Query the corpus

Let's see how this works: :P

In [28]:
query = 'volcanoes I visited?'
# NOTE(review): under the standard Lucene query syntax only the first term
# ('volcanoes') is bound to the `text` field; the remaining terms go to the
# core's default field and the trailing '?' acts as a single-character
# wildcard -- confirm this is intended.
response = solr_expertv.query('text:%s' % (query))
for r in response:
    # Show the matching text followed by its relevance score and a separator.
    print(r['text'][0])
    print('%s %s' % (r['score'], '_____________________________________________________________'))

I am bringing my Canon 70D with 17-55 F2.8 IS lens and I hope I would have some gorgeous photos to share soon ! I am new to photography, so I am crossing my fingers on this. Haha !
2.5366573 _____________________________________________________________
What a great post ! This will help me greatly as I am travelling to Iceland to chase the Aurora Borealis this coming Feb. I am spending 4 nights there and got so excited and happy to see this post of yours. As I haven’t really got a clue of where to go and what to see, in order to capture some beautiful and amazing photos. Now after reading your post, I have decided on at least 5 places I should go and see.
2.5129242 _____________________________________________________________
Hey Matthew,
I accidently stumbled upon your blog when my colleague sent me a link to websites about cold areas of the world. I was mesmerized by the pictures and the beauty of Iceland and long to go there now. I was planning a vacation and now I know where I will