In [102]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, Counter
import operator

In [103]:
def showFeatures(self, incident):
    """Return the keys of the first record in ``incident``.

    ``self`` is accepted but not used by the body. ``incident`` must be
    indexable and non-empty; whatever ``incident[0]`` raises (for example
    ``IndexError`` on an empty list) propagates to the caller.
    """
    first_record = incident[0]
    return first_record.keys()

#Get column of features
def getField(incident, field):
    """Return ``field`` from every record in ``incident``, or from ``incident`` itself.

    First tries to treat ``incident`` as an iterable of records and returns
    ``[doc[field] for doc in incident]``. If that lookup fails with an
    ordinary exception (for example because ``incident`` is a single record
    rather than a collection of records), falls back to ``incident[field]``.

    Raises whatever ``incident[field]`` raises when the fallback also fails.
    """
    try:
        return [doc[field] for doc in incident]
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are not subclasses of Exception and keep propagating.
        return incident[field]

#Lower-cases the text
def textToLower(text):
    """Return ``text`` lower-cased when it is exactly a ``str``; otherwise ``None``."""
    if type(text) is not str:
        # Anything that is not exactly a str is not converted.
        return None
    return text.lower()
        
#Splits text into tokens
def textToTokens(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    tokens = nltk.word_tokenize(text)
    return tokens

#Takes a sequence of words and returns their stems
def wordStems(tokens, stemmer = nltk.stem.PorterStemmer()):
    """Return a list with ``stemmer.stem(token)`` for each token, in order.

    The default ``stemmer`` is a single ``nltk.stem.PorterStemmer`` instance
    created once, when this function is defined.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Removes stopwords. TODO: Add words to stopwords.
def removeStopWords(tokens):
    """Return the tokens that are not stopwords in any of the supported languages.

    Stopwords are taken from the NLTK stopword corpus for Norwegian, Danish,
    Swedish and English. Matching is exact, so tokens are compared as given
    (no case folding or stemming happens here). The order of the remaining
    tokens is preserved.
    """
    corpus = nltk.corpus.stopwords
    languages = ('norwegian', 'danish', 'swedish', 'english')

    # Collect into a set so each membership test below is a hash lookup
    # instead of a scan over the combined word lists.
    stopwords = set()
    for language in languages:
        stopwords.update(corpus.words(language))

    return [token for token in tokens if token not in stopwords]

#Keeps only tokens made up entirely of letters and/or digits
def removeNonAlnum(tokens):
    """Return the tokens for which ``token.isalnum()`` is true, in order."""
    kept = []
    for token in tokens:
        if token.isalnum():
            kept.append(token)
    return kept

#Keeps only purely alphabetic tokens (drops anything containing digits or symbols)
def removeNonAlpha(tokens):
    """Return the tokens for which ``token.isalpha()`` is true, in order."""
    kept = []
    for token in tokens:
        if token.isalpha():
            kept.append(token)
    return kept

#Runs the text-processing functions in sequential order and outputs
#a list of stemmed tokens
def processText(text):
    """Turn raw ``text`` into a list of stemmed, filtered tokens.

    Steps, in order: lower-case, tokenize, drop stopwords, drop tokens that
    are not alphanumeric, drop tokens that are not purely alphabetic, stem.

    Stemming runs last on purpose: the stopword lists hold unstemmed words,
    so stemming first would turn stopwords such as "this" or "was" into
    "thi" / "wa", which no longer match and would slip through the filter.
    """
    functions = [textToLower, textToTokens, removeStopWords,
                 removeNonAlnum, removeNonAlpha, wordStems]
    processed = text
    for func in functions:
        processed = func(processed)
    return processed

#Returns cleaned tokens; same pipeline as processText but without stemming
def cleanText(text):
    """Lower-case and tokenize ``text``, then filter the tokens.

    The filters run in this order: stopword removal, non-alphanumeric
    removal, non-alphabetic removal. Returns the remaining tokens.
    """
    tokens = textToTokens(textToLower(text))
    for token_filter in (removeStopWords, removeNonAlnum, removeNonAlpha):
        tokens = token_filter(tokens)
    return tokens

In [113]:
# Read the whole contract into memory as one string. The encoding is given
# explicitly so the result does not depend on the platform's locale default.
# NOTE(review): UTF-8 is assumed for contract.txt — confirm against the file.
with open('../kaggle/contract.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [104]:

#text = '''Restaurant agrees that it shall require each of its delivery persons to hold a valid driver’s license and to carry automobile property damage and public liability insurance in amounts not less than those required under the laws of the state in which each Restaurant is located. Furthermore, Restaurant agrees that at all times during this Agreement, none of its delivery persons shall have (i) no more than two moving violations in a thirty-six (36) month period and only one at fault accident, and (ii) no major traffic citations or incidents. For the purpose of this Agreement, “major traffic citations or incidents” are as follows: driving under the influence, driving while impaired, driving in possession of alcohol or drugs, refusal to submit to a blood, urine or breath test, driving with a suspended or revoked license, a felony in which a vehicle is used (i.e. vehicular manslaughter, vehicular homicide, vehicular assault, hit and run, eluding a peace officer), reckless driving, careless driving, and driving over 100 miles per hour, in a speed contest and/or racing.
#'''

In [114]:
# Split the contract on every '.' character; each piece is treated as one
# "sentence" (this also splits inside abbreviations such as "i.e.").
text_sentences = text.split('.')

# Fit TF-IDF on the cleaned sentences: one matrix row per entry of
# text_sentences, in the same order.
tvText = TfidfVectorizer()
tfIdf = tvText.fit_transform(
    [' '.join(cleanText(sentence)) for sentence in text_sentences]
)

In [115]:
 
def sort_coo(coo_matrix):
    """Return ``(col, data)`` pairs of ``coo_matrix`` sorted in descending order.

    Pairs are ordered by the data value first and by the column index second,
    both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=operator.itemgetter(1, 0), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` items to ``{feature name: score}``.

    ``sorted_items`` is a sequence of ``(index, score)`` pairs; only its first
    ``topn`` entries are used. Each index is looked up in ``feature_names``
    and each score is rounded to 3 decimals. If the same feature name occurs
    more than once, the score of the later occurrence wins.
    """
    top_items = sorted_items[:topn]
    return {feature_names[col]: round(weight, 3) for col, weight in top_items}

In [116]:
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in favour
# of get_feature_names_out(); prefer the new method and fall back to the old
# one so the cell runs on both older and newer versions.
if hasattr(tvText, 'get_feature_names_out'):
    feature_names = tvText.get_feature_names_out()
else:
    feature_names = tvText.get_feature_names()

# COO view of the TF-IDF matrix, used for its row / col / data arrays.
tf_out = tfIdf.tocoo()

In [117]:
# Sum the TF-IDF weights of all stored matrix entries per row; each row index
# is the position of a sentence in text_sentences. Rows without any stored
# entry never appear in the dict.
sentence_score = {}
for r, weight in zip(tf_out.row, tf_out.data):
    sentence_score[r] = sentence_score.get(r, 0) + weight

# Print the sentences from highest to lowest total score.
sorted_x = sorted(sentence_score.items(), key=lambda item: item[1], reverse=True)
for k, v in sorted_x:
    print("Score:{0}".format(v))
    print(text_sentences[k])

Score:5.972044981271229
 Restaurant agrees to indemnify and hold harmless Company against any losses, claims, damages, liabilities or expenses (including the reasonable cost of investigating and defending any claims therefore and counsel fees incurred in connection therewith), joint or several, suffered or incurred by Company by reason of any injury to person or damage to property caused by the negligence or misconduct of the Restaurant arising or resulting from the preparation or delivery of the Food by the Restaurant
Score:5.65843062217277
 As this Agreement relates to the posting of a website that advertises business and lists the restaurant(s) as a food provider on such website the Company agrees to the following guidelines: (i) to distinguish the restaurant’s trademark within text, using all CAPITALS, italics, or color; (ii) to avoid distorting the trademark; (iii) to not alter the graphic design or the color; and (iv) to always use the proper registration symbol ®
Score:4.9633178