In [3]:
from gensim import corpora, models, similarities

In [2]:
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]


In [4]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint   # pretty-printer
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [5]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference
print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


In [6]:
print(dictionary.token2id)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


In [7]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored

[(1, 1), (2, 1)]


In [9]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (4, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]


In [10]:
#from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm') # comes from the first tutorial, "From strings to vectors"
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [11]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

In [12]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

[(0, 0.46182100453271602), (1, -0.070027665279000256)]


In [None]:
s1 = "This is a foo bar sentence ."
s2 = "This sentence is similar to a foo bar sentence ."
s3 = "What is this string ? Totally not related to the other two lines ."

cosine_sim(s1, s2) # Should give high cosine similarity
cosine_sim(s1, s3) # Shouldn't give high cosine similarity value
cosine_sim(s2, s3) # Shouldn't give high cosine similarity value

In [15]:

import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

def similarity(text1,text2):
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)

    cosine = get_cosine(vector1, vector2)

    return cosine

In [16]:
text1 = 'This is a foo bar sentence .'
text2 = 'This sentence is similar to a foo bar sentence .'
similarity(text1,text2)

0.8616404368553293

In [17]:
import isbntools
import isbntools.app as app
#using isbntools
def getIMeta(title,str(author)):
    
    print query
    isbn = app.isbn_from_words(query)    
    
        
    if isbn != None:
        meta=isbntools.app.meta(isbn)
        author=str(meta['Authors'][0])
        title=str(meta['Title'])
        fileName = author+" - "+title
        return [fileName,meta]
    else:
        return "Query Failed"

UnboundLocalError: local variable 's' referenced before assignment

In [110]:
from isbnlib import meta
import isbnlib
from isbnlib.config import add_apikey
from isbnlib.dev.helpers import fmtbib
from comFuncs import *
APIKEY="2E7FE5A4"
import os
import string
#SERVICE = 'isbndb'
#register your key
#add_apikey(SERVICE, APIKEY)


In [64]:
d="/media/some/sophie-hhd/transmitt/fic/#Calibre Fiction/"
dd="/media/some/sophie-hhd/transmitt/fic/Fictional Literature/"
files=! find "$d" "$dd" -type f -iname "*.epub"
len(files)

8837

In [170]:
def getISBNmeta(title,author):
    #isbn=isbnlib.isbn_from_words(title+" by " +author)
    try:
        return isbnlib.goom(title+" " +author)
    except:
        print "service unavailable"

In [206]:
def getFileIsbnMeta(f,smscr):
    fp,ff= os.path.split(f)
    fn, fext = os.path.splitext(ff)
    seri=''
    sm=0
    mtd=None
    auth=fn.split()[-2:]
    tit=fn.split()[:-2]
    if fn.count(' - ')==1:
        tit,auth=fn.split(' - ')
    elif fn.count(' - ')==2:
        seri,tit,auth=fn.split(' - ')
        
    author=''.join(l for l in ' '.join(auth.split()) if l in string.ascii_letters+" .")
    if tit.count('_ ')==1:
            tit=' '.join(tit.split('_ ')[0].split())
    title=''.join(l for l in ' '.join(tit.split()) if l in string.ascii_letters+string.digits+" .,&$£")
    #print title,'..... ....',author
    metad =getISBNmeta(title,str(author))
    if metad !=None and len(metad)>0: 
        for md in metad:
            #print md
            authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(author))
            titsim=similarity(md['Title'].encode('ascii','ignore'),str(title))
            #print titsim,authsim
            if titsim+authsim >sm:
                sm=titsim+authsim
                mtd=md
    #if no data found switch title and author
    metad =getISBNmeta(author,str(title))
    if metad !=None and len(metad)>0: 
        for md in metad:
            authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(title))
            titsim=similarity(md['Title'].encode('ascii','ignore'),str(author))
            #print titsim,authsim
            if titsim+authsim >sm:
                sm=titsim+authsim
                mtd=md
    # if no data found try seri, auth ,title  combinaton
    if seri !='':
        metad =getISBNmeta(title,str(seri))
        if metad !=None and len(metad)>0: 
            for md in metad:
                authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(seri))
                titsim=similarity(md['Title'].encode('ascii','ignore'),str(title))
                #print titsim,authsim
                if titsim+authsim >sm:
                    sm=titsim+authsim
                    mtd=md
        metad =getISBNmeta(seri,str(author))
        if metad !=None and len(metad)>0: 
            for md in metad:
                authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(author))
                titsim=similarity(md['Title'].encode('ascii','ignore'),str(seri))
                #print titsim,authsim
                if titsim+authsim >sm:
                    sm=titsim+authsim
                    mtd=md    
        metad =getISBNmeta(seri,str(title))
        if metad !=None and len(metad)>0: 
            for md in metad:
                authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(title))
                titsim=similarity(md['Title'].encode('ascii','ignore'),str(seri))
                #print titsim,authsim
                if titsim+authsim >sm:
                    sm=titsim+authsim
                    mtd=md
        metad =getISBNmeta(author,str(seri))
        if metad !=None and len(metad)>0: 
            for md in metad:
                authsim=similarity(md['Authors'][0].encode('ascii','ignore'),str(seri))
                titsim=similarity(md['Title'].encode('ascii','ignore'),str(author))
                #print titsim,authsim
                if titsim+authsim >sm:
                    sm=titsim+authsim
                    mtd=md               
    if sm >smscr and mtd !=None:
        return fn,fp,mtd,sm
    return "NO meta"       

In [207]:
f=files[-4053]
print f
getFileIsbnMeta(f,.2)

/media/some/sophie-hhd/transmitt/fic/Fictional Literature/Barnes and Noble Classics Series Collection/The Inferno - Dante Alighieri.epub


('The Inferno - Dante Alighieri',
 '/media/some/sophie-hhd/transmitt/fic/Fictional Literature/Barnes and Noble Classics Series Collection',
 {'Authors': [u'Dante Alighieri'],
  'ISBN-13': u'9781631061493',
  'Language': u'en',
  'Publisher': u'Race Point Pub',
  'Title': u'The Inferno',
  'Year': u'2015'},
 1.9999999999999996)