In [22]:
import word2vec as w2v
import gensim
import os
import csv
import nltk
from nltk.corpus import stopwords # must be downloaded beforehand with nltk.download()
import re

In [2]:
porter = nltk.PorterStemmer()

stripRegex = re.compile('[^a-zA-Z]') # strip non-alphanumerics
def clean_token(w):
    try:
        w = porter.stem(''.join([a.lower() for a in w if a.isalnum()]))
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # problem file (#1068): "In re Pittsburgh Corning Corp.-Wed Dec 20 16:00:00 PST 2006"
        # problem phrase: "http:// dictionary. oed. com."
        # problem token: "oed."
        # in cases where porter dies due to interpreting tokens like this as suffixes with missing starts, pass the token through after manual stripping
        w = stripRegex.sub('', w)
    return w
    
def clean_phrase(s):
    s = s.split()
    s = ' '.join([clean_token(w) for w in s])
    return s

clean_phrase("Testing how\nthis thing splits\twhitespace")

u'test how thi thing split whitespac'

In [3]:
# concat court cases to train word2vec
casepath = 'CourtCases'

In [73]:
#HEAVY
# Build clean file for Word2Vec training
fns = [fn for fn in os.listdir(casepath) if fn[0] != '.']
with open('clean_cases.txt', 'w') as catfile:
    for i, fn in enumerate(fns):
        with open(os.path.join(casepath, fn)) as infile:
            s = clean_phrase(infile.read())+'\n'
            catfile.write(s)
        if i % 1000 == 0:
            print "Read %d files" % i

Read 0 files
Read 1000 files
Read 2000 files
Read 3000 files
Read 4000 files
Read 5000 files
Read 6000 files
Read 7000 files
Read 8000 files
Read 9000 files
Read 10000 files
Read 11000 files
Read 12000 files
Read 13000 files
Read 14000 files
Read 15000 files
Read 16000 files
Read 17000 files
Read 18000 files
Read 19000 files
Read 20000 files
Read 21000 files
Read 22000 files
Read 23000 files
Read 24000 files
Read 25000 files
Read 26000 files
Read 27000 files
Read 28000 files
Read 29000 files
Read 30000 files
Read 31000 files
Read 32000 files
Read 33000 files
Read 34000 files
Read 35000 files
Read 36000 files
Read 37000 files
Read 38000 files
Read 39000 files
Read 40000 files
Read 41000 files
Read 42000 files
Read 43000 files
Read 44000 files
Read 45000 files
Read 46000 files
Read 47000 files
Read 48000 files
Read 49000 files
Read 50000 files
Read 51000 files
Read 52000 files
Read 53000 files
Read 54000 files
Read 55000 files
Read 56000 files
Read 57000 files
Read 58000 files
Read 59000

In [None]:
#HEAVY
model = w2v.word2vec('clean_cases.txt', 'cases_w2v.bin', size=150, window=15)

In [4]:
model = gensim.models.KeyedVectors.load_word2vec_format('cases_w2v.bin', binary=True)

In [5]:
model.most_similar(positive='law', negative=['justi'])

[(u'm', 0.7145407795906067),
 (u'jr', 0.7127110958099365),
 (u'h', 0.6810263395309448),
 (u'e', 0.6772070527076721),
 (u'jame', 0.6240376830101013),
 (u'j', 0.6222593188285828),
 (u'david', 0.6136133670806885),
 (u'robert', 0.6124848127365112),
 (u'john', 0.6003983616828918),
 (u'thoma', 0.5987942218780518)]

In [6]:
with open('inputData.csv', 'r') as infile:
    reader = csv.reader(infile)
    data = [row for row in reader]

with open('outputData.csv', 'r') as testfile:
    reader = csv.reader(testfile)
    test = [row for row in reader]

questions = [d[0] for d in data]
answers = [d[1:] for d in data]

validation = [t[1] for t in test]

print "input/data/train lines: %d" % len(data)
print "output/test lines: %d" % len(test)

input/data/train lines: 300
output/test lines: 300


In [33]:
# Similarity between sentences

pre_clean_enabled = True #causes about a 4% gain in accuracy
stopwords_enabled = False #causes about a 12% hit to accuracy
topn_enabled = True #causes about a 5% gain in accuracy

if stopwords_enabled:
    stop = set(stopwords.words('english'))
    stemmedstop = set([])
    for word in stop:
        stemmedstop.add(clean_token(word))

def similarity(q, a, n=200):
        score = 0.0
        no_count = 0
        
        if pre_clean_enabled:
            q = clean_phrase(q)
            a = clean_phrase(a)
        
        if stopwords_enabled:
            qlist = [i for i in q.split() if i not in stemmedstop]
            alist = [i for i in a.split() if i not in stemmedstop]
        else:
            qlist = q.split()
            alist = a.split()
        
        for wq in qlist:
            for wa in alist:
                try: # Shouldn't happen anymore
                    score += model.similarity(wq, wa)
                except: # Shouldn't happen anymore
                    no_count += 1
        score /= (len(q)*len(a) - no_count)
        return score

def similarity_new(q, a, n=0.4):
        scores = []
        no_count = 0
        
        if pre_clean_enabled:
            q = clean_phrase(q)
            a = clean_phrase(a)
        
        if stopwords_enabled:
            qlist = [i for i in q.split() if i not in stemmedstop]
            alist = [i for i in a.split() if i not in stemmedstop]
        else:
            qlist = q.split()
            alist = a.split()
        
        for wq in qlist:
            for wa in alist:
                try: # Shouldn't happen anymore
                    scores.append(model.similarity(wq, wa))
                except: # Shouldn't happen anymore
                    pass
        
        if topn_enabled:
            num = min(int(float(len(scores))*n),len(scores))
        else:
            num = len(scores)
        score = float(sum(scores[-num:])) / float(num)
        return score

def rank_answers(q, alist):
    # Find answer that maximizes inner product of embeddings
    # Credit: Miyyer
    simlist = []
    for a in alist:
        simlist.append(similarity_new(q,a))
    #return [a for (a, s) in sorted(zip(alist, simlist))]
    return alist[simlist.index(max(simlist))]

In [35]:
#moderately heavy
subset = False
if subset:
    smallquestions = questions[150:200]
    smallanswers = answers[150:200]
    smallvalidation = validation[150:200]
else:
    smallquestions = questions
    smallanswers = answers
    smallvalidation = validation


top_answer = [rank_answers(q,a) for q,a in zip(smallquestions,smallanswers)]
correct = [int(i == j) for (i,j) in zip(top_answer, smallvalidation)]
(sum(correct)*1.0)/len(smallvalidation)

0.43333333333333335

In [15]:
#HEAVY
top_answer = [rank_answers(q,a) for q,a in zip(questions,answers)]

In [16]:
correct = [int(i == j) for (i,j) in zip(top_answer, validation)]
(sum(correct)*1.0)/len(validation)

0.3566666666666667

In [48]:
rank_answers(questions[0],answers[0])

'Section 502(b)(6) caps a landlord\'s claim in bankruptcy for damages resulting from the termination of a real property lease.16 Under \xc2\xa7 502(b)(6), a landlord-creditor is entitled to rent reserved from the greater of (1) one lease year or (2) fifteen percent, not to exceed three years, of the remaining lease term. The cap operates from the earlier of the petition filing date or "the date on which [the] lessor repossessed or the lessee surrendered, the leased property." The landlord also retains a claim for any unpaid rent due under such lease prior to the earlier of those dates. This language reflects Congress\'s intent to limit lease termination claims to prevent landlords from receiving a windfall over other creditors. See H.R.Rep. No. 95-595, at 353 (1977), reprinted in 1978 U.S.C.C.A.N. 5963, 6309 ("[The cap] limits the damages allowable to a landlord of the debtor.... It is designed to compensate the landlord for his loss while not permitting a claim so large (based on a lo