In [36]:
import sys
from collections import Counter
import re
from gurobipy import *
import gzip
import os
import time
import codecs
import math
import networkx as nx
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import wordnet_ic, genesis
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import argparse
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from itertools import cycle
from operator import itemgetter
import math
import json
from isStatute_isPrecedent import *
from mention_statute_sentence import get_statute_mention

In [37]:
# Module-level WordNet lemmatizer instance (a later cell rebinds the same name
# to a fresh WordNetLemmatizer).
lmtzr = WordNetLemmatizer()
# Pre-compiled regex matching a run of one or more word characters.
WORD = re.compile(r'\w+')
class keyvalue(argparse.Action):
    """argparse action that turns ``KEY=INT`` tokens into a ``dict`` of ints.

    The resulting dictionary is stored on the namespace under the option's
    ``dest``.  Example: ``--class_weights F=2 S=3`` -> ``{'F': 2, 'S': 3}``.
    """

    def __call__(self, parser, namespace,
                 values, option_string=None):
        """Parse ``values`` and store the resulting dict on ``namespace``.

        Args:
            parser: the ArgumentParser invoking this action (unused).
            namespace: object receiving the parsed dictionary.
            values: the command-line tokens for this option; a single string
                (options declared without ``nargs``) is treated as one token.
            option_string: the option flag that triggered the action (unused).

        Raises:
            argparse.ArgumentError: if a token has no ``=`` or its value is
                not an integer.  argparse reports this as a usage error
                instead of an unhandled ``ValueError`` traceback.
        """
        # Without nargs argparse passes a bare string; iterating it would
        # walk over single characters.
        if isinstance(values, str):
            values = [values]

        parsed = {}
        for token in values:
            # Split on the first '=' only, then validate explicitly.
            key, separator, raw_value = token.partition('=')
            if not separator:
                raise argparse.ArgumentError(
                    self, "expected KEY=INT, got {!r}".format(token))
            try:
                parsed[key] = int(raw_value)
            except ValueError:
                raise argparse.ArgumentError(
                    self, "value for {!r} must be an integer, got {!r}".format(key, raw_value))

        setattr(namespace, self.dest, parsed)

In [38]:
# Global weighting factors. Within this notebook only WT2 is read (it scales
# the content-word weights inside the later `optimize` definition); WT1 and
# WT3 have no visible use here.
WT1 = 1
WT2 = 1
WT3 = 1
# English stop-word list loaded once from NLTK.
cachedstopwords = stopwords.words("english")
# Auxiliary / modal verb forms (plus a few informal spellings and contractions).
AUX = ['be','can','cannot','could','am','has','had','is','are','may','might','dare','do','did','have','must','need','ought','shall','should','will','would','shud','cud','don\'t','didn\'t','shouldn\'t','couldn\'t','wouldn\'t']
# Negation cue words, listed both with and without apostrophes.
NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
              "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
              "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
              "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
              "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
              "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
              "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
              "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
# Part-of-speech tag inventory (looks like the Penn Treebank tag set -- confirm).
# NOTE: a later cell rebinds POS_TAGS to a noun-only set, which is what
# compute_summary sees when the notebook is run top to bottom.
POS_TAGS = ['CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','LS','MD','NN','NNS','NNP','NNPS','PDT','PRP','PRP$','RB','RBR','RBS','RP','TO','UH','VB','VBD','VBG','VBN','VBP','VBZ','WDT','WP','WP$','WRB']

In [39]:
def get_legal_word(sen,LEG):
    """Return the keys of ``LEG`` that occur as substrings of ``sen``.

    Matching is done against the lower-cased sentence, so the keys are
    expected to be lower case already.  Each key is reported at most once,
    in the iteration order of ``LEG``.
    """
    matched = []
    for term in LEG.keys():
        # Plain substring test, not a whole-word match.
        if term not in sen.lower():
            continue
        if term in matched:
            continue
        matched.append(term)
    return matched

In [54]:
import os
import sys
import time
import json
from gurobipy import Model, GRB
from nltk.stem import WordNetLemmatizer

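# Helpers used by the functions below; each must exist in the kernel before
# compute_summary() is called:
#   get_legal_word         - defined in an earlier cell of this notebook
#   should_select          - defined in a LATER cell; run that cell first
#   get_statute_mention    - imported from mention_statute_sentence
#   isStatute, isPrecedent - expected to come from `from isStatute_isPrecedent import *` (verify)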

# Noun tags only. This rebinds the longer POS_TAGS list defined in an earlier
# cell, so the `y[1] in POS_TAGS` test in compute_summary matches nouns only.
POS_TAGS = set(["NN", "NNS", "NNP", "NNPS"])
# Lemmatizer used by compute_summary on tokens tagged NNS / NNPS.
lmtzr = WordNetLemmatizer()

def optimize(tweet, con_word, ofname, summary_length, num_classes, sentence_limit):
    """Pick sentences with a Gurobi ILP and write them to ``ofname``.

    NOTE: a later cell of this notebook defines another function that is also
    named ``optimize``; once that cell has run, it replaces this one.

    Args:
        tweet: mapping ``i -> [sentence, length, content_words, score,
            class_index]`` for ``i`` in ``0 .. len(tweet) - 1``.
        con_word: mapping ``content word -> objective weight``.
        ofname: path of the UTF-8 text file to write (one sentence per line).
        summary_length: upper bound on the summed ``length`` of the selection.
        num_classes: number of class indices (``0 .. num_classes - 1``).
        sentence_limit: mapping ``class_index -> maximum`` number of selected
            sentences of that class.

    Model:
        * maximise  sum(weight * word_covered) + sum(score * sentence_selected)
        * a word counts as covered only if some selected sentence contains it
        * a selected sentence marks every one of its content words as covered
          (the original had this implication reversed, which forced *every*
          sentence containing a covered word into the summary)

    If the solver finds no solution, a message is printed and an empty file is
    written instead of failing on the missing ``X`` attribute.
    """
    model = Model("summary_optimizer")
    model.setParam('OutputFlag', 0)  # Suppress Gurobi output

    n = len(tweet)
    tweet_var = {
        i: model.addVar(vtype=GRB.BINARY, name=f"tweet_{i}") for i in range(n)
    }
    con_var = {
        word: model.addVar(vtype=GRB.BINARY, name=f"con_{word}") for word in con_word
    }
    model.update()

    # word -> indices of the sentences containing it (computed once, reused
    # by both coverage constraints below).
    containing = {
        word: [i for i in range(n) if word in tweet[i][2]] for word in con_word
    }

    selected_sentences = []
    if n > 0:
        model.addConstr(
            sum(tweet_var[i] * tweet[i][1] for i in range(n)) <= summary_length,
            name="LengthConstraint",
        )

        for c in range(num_classes):
            members = [i for i in range(n) if tweet[i][4] == c]
            # An empty sum would turn the comparison into a plain Python
            # bool, which addConstr cannot accept; nothing to constrain anyway.
            if not members:
                continue
            model.addConstr(
                sum(tweet_var[i] for i in members) <= sentence_limit[c],
                name=f"ClassSentenceLimit_{c}",
            )

        for word, members in containing.items():
            # A word can only be counted if a sentence containing it is chosen.
            model.addConstr(
                con_var[word] <= sum(tweet_var[i] for i in members),
                name=f"ContentTrigger_{word}",
            )
            # Choosing a sentence marks each of its content words as covered.
            for i in members:
                model.addConstr(
                    con_var[word] >= tweet_var[i],
                    name=f"ContentEnforcement_{word}_{i}",
                )

        model.setObjective(
            sum(con_word[word] * con_var[word] for word in con_word) +
            sum(tweet[i][3] * tweet_var[i] for i in range(n)),
            GRB.MAXIMIZE
        )

        model.optimize()

        if model.SolCount > 0:
            # Binary values come back as floats; compare against 0.5.
            selected_sentences = [
                (i, tweet[i][0]) for i in range(n) if tweet_var[i].X > 0.5
            ]
        else:
            print('No solution found for {} (Gurobi status {})'.format(ofname, model.Status))

    selected_sentences.sort(key=lambda x: x[0])

    with open(ofname, 'w', encoding='utf-8') as out_fp:
        for _, sentence in selected_sentences:
            out_fp.write(sentence + '\n')

def compute_summary(args):
    """Summarize every document of the prepared JSON file, one output file each.

    Attributes read from ``args``:
        prep_path: JSON file mapping ``doc id -> {class label -> records}``.
            Each record is indexed as ``x[0]`` (sentence text) and ``x[2]``
            (sequence of ``(token, POS tag)`` pairs).
        summary_path: directory that receives ``<doc id>.txt``; created if
            missing.
        length_file: text file with ``<doc id>[.txt]<TAB><length>`` lines.
            Documents without an entry use a length budget of 2000.
        class_weights: dict ``class label -> score multiplier`` (default 1).
        class_sents: dict ``class label -> per-class sentence count`` or None.
        default_sents: per-class sentence count for labels not in class_sents.
        content_weight, legal_weight, statute_weight: objective weights for
            plain noun content words, legal-dictionary terms and statute
            mentions.  When a word qualifies for several, the last assignment
            wins (content, then legal, then statute).

    Side effects: prints progress, reads ``dict_words.txt`` from the current
    working directory, and calls ``optimize`` once per document, which writes
    the summary file.
    """
    ifname = args.prep_path
    SUMMARY_PATH = args.summary_path
    CLASS_WEIGHT = args.class_weights
    nos = args.class_sents

    with open(ifname, 'r') as fp:
        dic = json.load(fp)

    for k, v in CLASS_WEIGHT.items():
        print('Weight for class {} is {}'.format(k, v))

    print('Total number of documents:{}'.format(len(dic.keys())))

    SUMMARY_LENGTH = {}
    with open(args.length_file, 'r') as fp:
        for l in fp:
            wl = l.strip().split('\t')
            if len(wl) != 2:
                print(f"Skipping malformed line in {args.length_file}: {l!r}")
                continue
            docid = wl[0].strip().replace(".txt", "")
            SUMMARY_LENGTH[docid] = int(wl[1])

    LEGALDICT = {}
    with open('dict_words.txt', 'r') as fp:
        for l in fp:
            term = l.strip(' \t\n\r').lower()
            # A blank line would add '' as a term, and '' is a substring of
            # every sentence, so it would be matched as a legal word everywhere.
            if term:
                LEGALDICT[term] = 1

    # optimize() opens the output file directly; make sure its folder exists.
    if SUMMARY_PATH:
        os.makedirs(SUMMARY_PATH, exist_ok=True)

    for k, v in dic.items():
        print('Document ID {}'.format(k))
        t0 = time.time()
        T = {}    # sentence index -> [sentence, content words, length, score, class index]
        TW = {}   # sentence index -> every word considered for the redundancy check
        index = 0
        content_count = {}   # content word -> objective weight
        CLASS_INDEX = 0
        NOS = {}  # class index -> per-class sentence count
        MAP = {}  # class index -> class label
        Ts = SUMMARY_LENGTH.get(k, 2000)
        print('Summary Length: {}'.format(Ts))

        for ck, cv in v.items():
            CL_WEIGHT = CLASS_WEIGHT.get(ck, 1)
            MAP[CLASS_INDEX] = ck
            if nos is not None and ck in nos:
                NOS[CLASS_INDEX] = nos[ck]
            else:
                NOS[CLASS_INDEX] = args.default_sents

            # 1-based rank among the sentences of this class longer than four
            # whitespace-separated words.
            position = 1
            for x in cv:
                if len(x[0].split()) > 4:
                    content = set()
                    All = set()
                    sentence = x[0]
                    tokens = x[2]
                    noun_count = 0   # passed to optimize() as the sentence length
                    SEN_TEMP = ''
                    for y in tokens:
                        if y[1] in POS_TAGS:
                            noun_count += 1
                            All.add(y[0].lower())
                            SEN_TEMP += y[0].lower() + ' '
                        if y[1] in ['NN', 'NNP']:
                            content.add(y[0].lower())
                        elif y[1] in ['NNS', 'NNPS']:
                            # Best effort: fall back to the surface form if
                            # lemmatization fails.
                            try:
                                word = lmtzr.lemmatize(y[0].lower())
                            except Exception:
                                word = y[0].lower()
                            content.add(word)

                    LEGAL_WORD = get_legal_word(SEN_TEMP.strip(), LEGALDICT)
                    STATUTE_WORD = get_statute_mention(sentence.strip())

                    for y in content:
                        content_count[y] = args.content_weight
                    for y in LEGAL_WORD:
                        All.add(y.lower())
                        content.add(y.lower())
                        content_count[y.lower()] = args.legal_weight
                    for y in STATUTE_WORD:
                        All.add(y.lower())
                        content.add(y.lower())
                        content_count[y.lower()] = args.statute_weight

                    # Skip sentences whose word set is already fully covered
                    # by a previously kept sentence.
                    if should_select(TW, All):
                        if ck == 'F':
                            score = CL_WEIGHT * (1 / position)
                        elif ck == 'S':
                            score = CL_WEIGHT * isStatute(sentence, 'current-acts.txt')
                        elif ck == 'P':
                            score = CL_WEIGHT * isPrecedent(sentence)
                        elif ck == 'R':
                            score = CL_WEIGHT * position * (isPrecedent(sentence) or isStatute(sentence, 'current-acts.txt'))
                        else:
                            score = CL_WEIGHT

                        T[index] = [sentence, content, noun_count, score, CLASS_INDEX]
                        TW[index] = All
                        index += 1

                    position += 1

            CLASS_INDEX += 1

        num_sentences = len(T.keys())
        print('Number of tweets: {}'.format(num_sentences))

        # Reorder each entry into the layout optimize() indexes:
        # [sentence, length, content words, score, class index].
        tweet_cur_window = {
            i: [T[i][0].strip(), int(T[i][2]), T[i][1], float(T[i][3]), int(T[i][4])]
            for i in range(num_sentences)
        }

        print('Number of classes: ', CLASS_INDEX)
        print('Sentence Limit: {}'.format(NOS))
        print('Class Mapping: {}'.format(MAP))

        ofname = os.path.join(SUMMARY_PATH, k + '.txt')
        optimize(tweet_cur_window, content_count, ofname, Ts, CLASS_INDEX, NOS)

        t1 = time.time()
        print('Summarization done: ', ofname, ' ', t1 - t0)

    print('Done with documents')

In [55]:
def should_select(T,new):
    """Return 1 if ``new`` adds something over the entries of ``T``, else 0.

    ``T`` is indexed with ``0 .. len(T) - 1``; each entry is a collection of
    words.  The result is 0 when ``new`` is empty or when some entry of ``T``
    shares ``len(new)`` distinct words with ``new``; otherwise it is 1.
    """
    if len(new) == 0:
        return 0
    fully_covered = any(
        len(set(T[position]) & set(new)) == len(new)
        for position in range(len(T))
    )
    return 0 if fully_covered else 1

In [56]:
def set_weight(P,L,U):
    """Linearly rescale the values of ``P`` so that they span ``[L, U]``.

    Args:
        P: mapping ``key -> numeric value``.
        L: value the smallest entry of ``P`` is mapped to.
        U: value the largest entry of ``P`` is mapped to.

    Returns:
        A new dict with the same keys and the rescaled values rounded to four
        decimals.  An empty ``P`` yields ``{}``.  If all values of ``P`` are
        equal there is no range to stretch, and every key maps to ``L``.
    """
    if not P:
        return {}

    min_p = min(P.values())
    max_p = max(P.values())

    target_span = float(U - L)
    source_span = float(max_p - min_p)
    # Guard against division by zero when every value is identical.
    factor = round(target_span / source_span, 4) if source_span else 0.0

    # dict.items() replaces the Python-2-only iteritems().
    return {k: round(L + factor * (v - min_p), 4) for k, v in P.items()}

In [None]:
from gurobipy import Model, GRB, LinExpr, GurobiError
import codecs
import sys

def optimize(tweet, con_weight, ofname, L, CLASS_INDEX, NOS):
    """Select summary sentences with a Gurobi ILP and write them to ``ofname``.

    This definition rebinds the name ``optimize`` and therefore replaces the
    earlier function of the same name in this notebook.

    Args:
        tweet: mapping whose values are
            ``[sentence, length, content_words, score, class_index]``.
        con_weight: mapping ``content word -> weight``; words missing from it
            get weight 0.0.  Weights are rounded to four decimals and
            multiplied by the module-level ``WT2``.
        ofname: output path; written as UTF-8, one selected sentence per line
            in input order.
        L: upper bound on the summed ``length`` of the selected sentences.
        CLASS_INDEX: number of classes (class indices ``0 .. CLASS_INDEX-1``).
        NOS: mapping ``class index -> minimum`` number of sentences that must
            be selected from that class.

    Constraints:
        * a selected sentence forces all of its content words to be counted;
        * a content word may only be counted if at least one sentence
          containing it is selected;
        * at least ``NOS[c]`` sentences are selected from every class ``c``;
        * the total length of the selection is at most ``L``.

    If the model has no solution (for instance a class has fewer candidate
    sentences than ``NOS`` demands), a message is printed and the output file
    is left empty.  On a ``GurobiError`` the error is printed and the process
    exits with status 1 (the output file is closed first).
    """
    con_word = {}
    tweet_word = {}
    tweet_index = 1
    for k, v in tweet.items():
        set_of_words = v[2]
        for x in set_of_words:
            if x not in con_word:
                p1 = round(con_weight.get(x, 0.0), 4) * WT2
                con_word[x] = p1

        # Internal layout: [length, content words, sentence, score, class index]
        tweet_word[tweet_index] = [v[1], set_of_words, v[0], v[3], v[4]]
        tweet_index += 1

    sen = list(tweet_word.keys())
    sen.sort()
    entities = list(con_word.keys())
    print('Length: ', len(sen), len(entities))

    m = Model("sol1")

    sen_var = [m.addVar(vtype=GRB.BINARY, name=f"x{i+1}") for i in range(len(sen))]

    con_var = [m.addVar(vtype=GRB.BINARY, name=f"y{i+1}") for i in range(len(entities))]

    m.update()

    P = LinExpr()    # objective
    C1 = LinExpr()   # total length of the selected sentences
    counter = -1     # running index used to name the constraints

    for i in range(len(sen)):
        P += tweet_word[i+1][3] * sen_var[i]
        C1 += tweet_word[i+1][0] * sen_var[i]
        v = tweet_word[i+1][1]
        C = LinExpr()
        flag = 0
        for j in range(len(entities)):
            if entities[j] in v:
                flag += 1
                C += con_var[j]
        if flag > 0:
            counter += 1
            # Selecting sentence i requires all `flag` of its words to be on.
            m.addConstr(C >= flag * sen_var[i], f"c{counter}")

    for i in range(len(entities)):
        P += con_word[entities[i]] * con_var[i]
        C = LinExpr()
        flag = 0
        for j in range(len(sen)):
            v = tweet_word[j+1][1]
            if entities[i] in v:
                flag = 1
                C += sen_var[j]
        if flag == 1:
            counter += 1
            # A word can only be on if a sentence containing it is selected.
            m.addConstr(C >= con_var[i], f"c{counter}")

    for CC in range(CLASS_INDEX):
        C = LinExpr()
        for i in range(len(sen)):
            if tweet_word[i+1][4] == CC:
                C += sen_var[i]
        counter += 1
        m.addConstr(C >= NOS[CC], f"c{counter}")

    counter += 1
    m.addConstr(C1 <= L, f"c{counter}")

    m.setObjective(P, GRB.MAXIMIZE)

    # The context manager closes the file on every exit path, including the
    # SystemExit raised below.
    with codecs.open(ofname, 'w', 'utf-8') as fo:
        try:
            m.optimize()
            if m.SolCount == 0:
                print('No solution found for {} (Gurobi status {})'.format(ofname, m.Status))
                return
            # Read the sentence variables directly instead of parsing variable
            # names; solver values are floats, so compare against 0.5.
            for i, var in enumerate(sen_var):
                if var.X > 0.5:
                    fo.write(tweet_word[i+1][2])
                    fo.write('\n')
        except GurobiError as e:
            print(e)
            sys.exit(1)


In [62]:
def compute_tfidf_NEW(word,tweet_count,PLACE):
    """Score each ``<word>_<tag>`` key of ``word`` with a tf-idf style weight.

    Args:
        word: mapping ``"<word>_<tag>" -> count``.  The count is used both as
            the term frequency and as the document frequency.
        tweet_count: total number of items ``N`` used in the idf term.
        PLACE: unused; kept for interface compatibility.

    Returns:
        dict with the same keys as ``word``.  A key scores
        ``round((1 + log2(tf)) * log10(round(N / tf, 4)), 4)`` when ``tf >= 5``
        and either its tag is ``'P'`` or ``'S'`` or its word part is longer
        than two characters; every other key scores 0.

    Keys without an underscore are treated as having an empty tag, and counts
    below the threshold (including zero or negative counts) score 0 without
    evaluating a logarithm.
    """
    score = {}
    THR = 5
    N = float(tweet_count)
    # dict.items() replaces the Python-2-only iteritems().
    for k, v in word.items():
        D = k.split('_')
        D_w = D[0].strip(' \t\n\r')
        D_t = D[1].strip(' \t\n\r') if len(D) > 1 else ''
        tf = v

        eligible = tf >= THR and (D_t in ('P', 'S') or len(D_w) > 2)
        if not eligible:
            score[k] = 0
            continue

        w = 1 + math.log(tf, 2)
        try:
            idf = math.log10(round(N / float(tf), 4))
        except (ValueError, ZeroDivisionError):
            # Ratio rounded to zero or negative: no usable idf.
            idf = 0
        score[k] = round(w * idf, 4)
    return score


In [63]:
_NUM_ONES = ["", "one", "two", "three", "four", "five", "six", "seven", "eight",
             "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen"]
_NUM_TENS = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
             "eighty", "ninety"]


def _num_below_thousand(n):
    """Return the English words for ``0 <= n < 1000`` as a list (empty for 0)."""
    parts = []
    hundreds, rest = divmod(n, 100)
    if hundreds:
        parts.extend([_NUM_ONES[hundreds], "hundred"])
    if rest >= 20:
        tens_digit, unit = divmod(rest, 10)
        parts.append(_NUM_TENS[tens_digit])
        if unit:
            parts.append(_NUM_ONES[unit])
    elif rest:
        # 1..19 have their own words, also inside larger numbers (e.g. 115).
        parts.append(_NUM_ONES[rest])
    return parts


def numToWord(number):
    """Spell out an integer between 0 and 999999 in English words.

    Args:
        number: integer to convert.

    Returns:
        The space-separated words, e.g. ``"one thousand two hundred thirty
        four"``; ``"zero"`` for 0.  A value outside ``0 .. 999999`` is
        returned unchanged.

    The previous implementation removed empty entries from a list while
    iterating over it and special-cased the teens only for numbers below 20,
    so inputs such as 100, 115 or 1000 came out garbled.
    """
    if number < 0 or number > 999999:
        return number
    if number == 0:
        return "zero"

    thousands, remainder = divmod(number, 1000)
    parts = []
    if thousands:
        parts.extend(_num_below_thousand(thousands))
        parts.append("thousand")
    parts.extend(_num_below_thousand(remainder))
    return ' '.join(parts)

In [65]:
from argparse import Namespace

def main():
    """Run ``compute_summary`` with a fixed, hard-coded configuration."""
    # NOTE(review): the class labels configured here are "L1"/"L2"; the output
    # captured below this cell lists the labels F, R, S, A, RLC, P and RPC, so
    # these per-class weights and counts may never match -- confirm.
    settings = {
        "prep_path": "prepared_data.json",
        "summary_path": "summaries",
        "length_file": "length_file.txt",
        "class_weights": {"L1": 2, "L2": 3},
        "content_weight": 1,
        "legal_weight": 3,
        "statute_weight": 5,
        "class_sents": {"L1": 2, "L2": 1},
        "default_sents": 1,
    }
    args = Namespace(**settings)

    compute_summary(args)
    print('Done')

# Run the pipeline only when this code executes as the top-level module
# (`__name__` is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()


Weight for class L1 is 2
Weight for class L2 is 3
Total number of documents:3
Document ID 1253
Summary Length: 170
Number of tweets: 92
Number of classes:  7
Sentence Limit: {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}
Class Mapping: {0: 'F', 1: 'R', 2: 'S', 3: 'A', 4: 'RLC', 5: 'P', 6: 'RPC'}
Length:  92 303
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (mac64[arm] - Darwin 24.3.0 24D81)

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 403 rows, 395 columns and 2661 nonzeros
Model fingerprint: 0x0d627896
Variable types: 0 continuous, 395 integer (395 binary)
Coefficient statistics:
  Matrix range     [1e+00, 3e+01]
  Objective range  [6e-02, 4e+01]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+02]
Found heuristic solution: objective 72.0666667
Presolve removed 211 rows and 202 columns
Presolve time: 0.00s
Presolved: 192 rows, 193 columns, 1551 nonzeros
Variable types: 0 continuous, 193 integer (1