In [218]:
import functools
import inspect
import warnings

string_types = (type(b''), type(u''))


def deprecated(reason):
    """
    This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used.
    """

    if isinstance(reason, string_types):

        # The @deprecated is used with a 'reason'.
        #
        # .. code-block:: python
        #
        #    @deprecated("please, use another function")
        #    def old_function(x, y):
        #      pass

        def decorator(func1):

            if inspect.isclass(func1):
                fmt1 = "Call to deprecated class {name} ({reason})."
            else:
                fmt1 = "Call to deprecated function {name} ({reason})."

            @functools.wraps(func1)
            def new_func1(*args, **kwargs):
                warnings.simplefilter('always', DeprecationWarning)
                warnings.warn(
                    fmt1.format(name=func1.__name__, reason=reason),
                    category=DeprecationWarning,
                    stacklevel=2
                )
                warnings.simplefilter('default', DeprecationWarning)
                return func1(*args, **kwargs)

            return new_func1

        return decorator

    elif inspect.isclass(reason) or inspect.isfunction(reason):

        # The @deprecated is used without any 'reason'.
        #
        # .. code-block:: python
        #
        #    @deprecated
        #    def old_function(x, y):
        #      pass

        func2 = reason

        if inspect.isclass(func2):
            fmt2 = "Call to deprecated class {name}."
        else:
            fmt2 = "Call to deprecated function {name}."

        @functools.wraps(func2)
        def new_func2(*args, **kwargs):
            warnings.simplefilter('always', DeprecationWarning)
            warnings.warn(
                fmt2.format(name=func2.__name__),
                category=DeprecationWarning,
                stacklevel=2
            )
            warnings.simplefilter('default', DeprecationWarning)
            return func2(*args, **kwargs)

        return new_func2


In [346]:
df.head()


NameError: name 'df' is not defined

**Self Extract using Python**

*Applicant*

In [219]:
def applicant_v1(con):
    applicant=""
    app_regex=r"Applic"
    case=r"Case .*"
    match_app = re.search(app_regex, con)
    case_match=re.search(case,con)
    if match_app:
        if case_match:
            applicant=con[case_match.end():match_app.start()]
            
    return applicant.strip('\n')

In [220]:
@deprecated("found other good approach")
def applicant_v2(content):
    app_regex=r"Applicant"
    search=r"[a-zA-Z]+. [a-zA-Z]+. $"
    match_app = re.search(app_regex, content)
    if match_app is None:
        applicant=''
    else:
        match=re.search(search,content[:match_app.end()])
        if match is not None:
            applicant=match.group(0)
        else:
            applicant=content[:match_app.start()].split("\n")[-1]
    return applicant

In [268]:
def applicant_endgame(filename):
    fullname=[]
    name=eda_file(filename)
    try:  
        names=name.split(" and ")
    #print(names)
        for each in names:
            first=each.split(",")[-1]
            last=each.split(",")[0]
            fullname.append(first + " "+ last)
    except:
        first=name.split(",")[-1]
        last=name.split(",")[0]
        fullname.append(first + " "+ last)
        return fullname
        
    return fullname

In [270]:
 applicant_endgame("2020_01_03_Arriaga, Eliza.pdf")

[' Eliza Arriaga']

In [221]:
def date_of_case(filename):
    name=eda_file(filename)
    #print(name)
    name=filename.replace(name,"").split("/")
    #print(name)
    date=name[-1].strip(".pdf").strip("_")
    date=date.split("_")
    day=date[-1]
    month=date[-2]
    year=date[0]
    return day + "/" + month + "/" +year
    

In [222]:
date_of_case("14-22/2014_02_03_Villalobos, Adela.pdf")

'03/02/2014'

In [223]:
print(case_decision(op_ocr("2007_07_16_Ramirez, Mario Salas.pdf")))

['For foregoing IS ORDERED Petition Reconsideration Compromise Release hereby ']


In [12]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
#from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 
nltk.download('omw-1.4')

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('/\n',"") 
    sentence=sentence.replace("[0-9]+","")
    
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)  
    #filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in tokens]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(tokens)

 

[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [158]:
preprocess(case_decision(tp))

'n nfor the foregoing reasons n nmanzano manuel singh sarojini 8 n n n n n x0c n n20 n n21 n n22 n n23 n n24 n n25 n n26 n n27 n n n n n nit is ordered that as the decision after reconsideration of the workers ncompensation appeals board in wcab case no n nit is further ordered that as the decision after reconsideration of the workers ncompensation appeals board in wcab case no'

In [15]:
case_decision(tp)

NameError: name 'case_decision' is not defined

In [224]:
import re
def clean_decision(list_dec):
    new_dec=[]
    pattern = '\n[0-9][0-9]|\n|\x0c'
    for dec in list_dec:
        dec=re.sub(pattern,'',dec)
        new_dec.append(dec)
    return new_dec

In [225]:
def case_decision(con):
    final_dec=[]
    decision_list=re.findall(r"([^.]*?ORDERED[^.]*\.)",con)
    for each_dec in decision_list:
        saf_dec=cleanning_dec(each_dec)
        final_dec.append(saf_dec)
    return final_dec
        
        
    

In [None]:
import os
import openai

openai.api_key = "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"

def decison_cleaning_openai(prom):
    response = openai.Completion.create(
  model="text-davinci-002",prompt=prom+"\n\nTl;dr",temperature=0.7,max_tokens=60,top_p=1.0,frequency_penalty=0.0,presence_penalty=0.0)

In [184]:
case_decision(op_ocr("2007_08_08_Olfatpour, Behzad.pdf"))

['For foregoing IS ORDERED petition removal ',
 'IT IS FURTHER ORDERED decision removal Board Order May provide case shall set mandatory settlement conference prior trial ',
 'IT IS FURTHER ORDERED case RETURNED trial level may consistent opinion ']

In [171]:
clean_decision(case_decision(op_ocr("input_2002_7_10_Manzano, Manuel and Singh, sarojini.pdf")))

['For the foregoing reasons,MANZANO, Manuel/SINGH, Sarojini 8   )  IT IS ORDERED that as the Decision After Reconsideration of the Workers’Compensation Appeals Board in WCAB Case No.',
 "IT IS FURTHER ORDERED that as the Decision After Reconsideration of the Workers'Compensation Appeals Board in WCAB Case No."]

In [165]:
text1='''
 For the foregoing reasons, IT IS ORDERED that defendant’s Petition for Removal is GRANTED.
 MORALES, Marcos 4 Oo co SY DB Uw F&F Ww NH NR PR Ww NNR De Be Be BP eB NYRR RR BSB FP FSF Ge AAA Bee S IT IS FURTHER ORDERED, as the Decision After Removal of the Workers’ Compensation Appeals Board, that the Findings and Orders dated May 13, 2013, are RESCINDED and that this matter is RETURNED to the trial level so that the WCJ may develop the record through the appointment of a “tegular physician” in the medical specialty of his choosing and for such further proceedings and decisions by the WCJ as may be required, consistent with this opinion.

'''

In [20]:
texzt=" the foregoing reasons, IT IS ORDERED that defendant’s Petition for Reconsideration of the December 20, 2013 Order Vacating Lien Dismissal is DISMISSED.IT IS ORDERED that defendant’s Petition for Removal of the December 20, 2013 Order Vacating Lien Dismissal is DENIED."

In [181]:
import re
import nltk.corpus

from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop = stopwords.words('english')
from nltk.corpus import words
def cleanning_dec(text1):
    ou=''
    texzt = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text1)
    texzzt = " ".join([word for word in texzt.split() if word not in (stop)])
    tokens = nltk.word_tokenize(texzzt)
    pos = nltk.pos_tag(tokens)
    for rach in tokens:
        if rach.lower() in words.words():
            ou+=rach+ " "
        #print(rach)
    return(ou)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [226]:
cleanning_dec(text1)

'For foregoing IT IS ORDERED Petition Removal De Be Be Ge Bee S IT IS FURTHER ORDERED Decision After Removal Compensation Board May matter RETURNED trial level may develop record appointment tegular physician medical specialty choosing may consistent opinion '

In [227]:
#from nltk.corpus import words
nltk.download('words')

"would" in words.words()
    

[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [162]:
from nltk.corpus import words
"would" in words.words()

True

In [23]:
def applicant_extract(content):
    if len(applicant_v1(content))!=0:
        return applicant_v1(content)
        #return "yes"
    elif len(applicant_v2(content))!=0:
        return applicant_v2(content)
        #return "yes"
    else:
        return ""

Codes

In [233]:
def code_v1_new(content):
    code_reg=r"sec. [0-9a-zA-Z/(/)]+|section [0-9]+|section code [0-9a-zA-Z/(/)]+|sec [0-9a-zA-Z/(/)]+|Lab. Code, § [0-9]+"
    code = re.findall(code_reg, content)
    return code

In [231]:
code_v1_new(pp)

NameError: name 'pp' is not defined

In [230]:
def more_codes(content):
    possible_codes=[]
    a,b=extract_location(content)
    for i in b:
        if 'Code' in i:
            possible_codes.append(i)
    return possible_codes
    

In [229]:
possible_codes

NameError: name 'possible_codes' is not defined

In [232]:
pp=op_ocr("2020_07_31_De La Cruz, Michele Guerrero.pdf")

In [29]:
h=['(Lab. Code, § 5909.)','(WCJ)']
for i in h:
    if 'Code' in i:
        print(i)
        

(Lab. Code, § 5909.)


Cases

In [234]:
def cases_v1(content):
    con=extract_sucks(content)
    new,district=extract_location(con)
    cases,new_d=case_nos(new)
    return cases
    

In [31]:
@deprecated("all cases don't start with ADJ")
def cases_v2(content):
    pattern=r"ADJ[0-9]+"
    cases=re.findall(pattern,content)
    case_set=set(cases)
    case=list(case_set)
    return case
    
    

In [32]:
@deprecated("some case numbers have spaces")
def cases_v3(content):
    pattern=r"[A-Z][A-Z][A-Z][0-9]+"
    cases=re.findall(pattern,content)
    case_set=set(cases)
    case=list(case_set)
    return case
    

In [235]:
def cases_v4(content):
    pattern=r"[A-Z][A-Z][A-Z] [0-9]{3,7}|[A-Z][A-Z][A-Z][0-9]+"
    cases=re.findall(pattern,content)
    case_set=set(cases)
    case=list(case_set)
    return case
    

In [236]:
case_extract(pp)

['ADJ10094004', 'ADJ10152114', 'ADJ10094112']

In [237]:
def case_extract(content):
    return cases_v4(content)
   

In [238]:
oredre="""
ORDERS

‘IT IS ORDERED that defendant’s appeal of the December 18, 2006
Determination of the Rehabilitation Unit is GRANTED.

“IT IS FURTHER ORDERED that defendant withhold 20% of
applicant’s vocational rehabilitation maintenance allowance for potential
attorney’s fees. The reasonable value of the services of applicant’s attorney will
be determined subsequently upon petition therefor, with jurisdiction reserved at
the trial level if the parties are unable to adjust the matter.”

IT IS FURTHER ORDERED that the December 18, 2006 Determination and Order of
the Rehabilitation Unit is AFFIRMED, EXCEPT that the final paragraph is AMENDED as

follows:

HOLDREN, Catherine

|ORDER|
"""

In [37]:

import os
import openai

openai.api_key = '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'

response = openai.Completion.create(
  model="text-davinci-002",
  prompt=ret[4500:],
  temperature=0,
  max_tokens=100,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)


NameError: name 'ret' is not defined

In [38]:
texzt=" the foregoing reasons, IT IS ORDERED that defendant’s Petition for Reconsideration of the December 20, 2013 Order Vacating Lien Dismissal is DISMISSED.IT IS ORDERED that defendant’s Petition for Removal of the December 20, 2013 Order Vacating Lien Dismissal is DENIED."

In [39]:
import os
import openai

openai.api_key = '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'

response_mod = openai.Completion.create(
  model="text-davinci-002",
    prompt="Convert my short hand into a first-hand account of the meeting:\n\n"+texzt,
  temperature=0,
  max_tokens=64,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=0.0
)

In [239]:
for i in response_mod.choices:
    print(i.text)



The court ordered that the defendant's petition for reconsideration of the December 20, 2013 order vacating lien dismissal is dismissed. The court also ordered that the defendant's petition for removal of the December 20, 2013 order vacating lien dismissal is denied.


In [41]:
ret=' \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\nWORKERS\' COMPENSATION APPEALS BOARD\n\nSTATE OF CALIFORNIA\n\n \n\nMANUEL MANZANO,\nApplicant,\nvs.\n\nFLAVURENCE CORPORATION;\nFREMONT COMPENSATION INSURANCE,\n\nDefendants.\n\n \n\nSAROJINI SINGH,\nApplicant,\nvs.\nAMERICAN SHOWER DOOR;\n\nREPUBLIC INDEMNITY COMPANY,\nDefendants.\n\n \n\n \n\n \n\n \n\nThese two cases are combined into a single Decision After Reconsideration because each\ncase raises the same issue of law, namely, whether the workers\' compensation administrative law\njudge (WC)J) correctly dismissed the California Insurance Guarantee Association (CIGA) as a party\ndefendant before a determination of underlying issues related to CIGA\'s liability. We will rescind\n\nthe orders dismissing CIGA in both cases and return the matters to the trial level for further\n\nproceedings and decision.\n\nWe hold that generally CIGA should not be dismissed from a case until a determination is\n\nmade on the issue of the date of injury, or period of injurious exposure, or other underlying issue\n\nCase No. LAO 778749\n\nOPINION AND DECISION\nAFTER\nRECONSIDERATION\n\nCase No. AHM 075204\n\nOPINION AND DECISION\nAFTER\nRECONSIDERATION\n\nwhich if adversely decided against CIGA would result in its liability.\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nOn February 19, 2002 and on February 22, 2002, we granted reconsideration in AHM\n075204 and LAO 778749, respectively, in order to allow sufficient opportunity to further study the\nfactual and legal issues in each case. Having completed our deliberations, we issue our decision\nafter reconsideration in both cases jointly.\n\nIn AHM 075204 (SINGH), defendant Republic Indemnity Company (Republic) challenges\nthe WCJ\'s Order Dismissing Party Defendant served December 3, 2001, in which the WCJ ordered\nCIGA/Kemper Insurance Company dismissed as a party defendant. Republic contends that the\ndismissal was in error because the WCJ failed to receive its objection to the Notice of Intention to\ndismiss CIGA and further that before any party may be dismissed, the date of injury must first be\ndetermined. Republic asserts that CIGA is a necessary party in this matter until there has been a\ndetermination as to the actual date of injury.\n\nIn LAO 778749 (MANZANO), defendant Fremont Compensation Insurance Company\n(Fremont) challenges the correctness of the December 3, 2001 Order of Dismissal of CIGA,\nwherein the WCJ ordered CIGA for Superior National Insurance Company (Superior), in\nliquidation, dismissed. Fremont contends that after it was joined as party defendant, it was denied\ndue process by dismissal of CIGA because it did not have an opportunity to investigate potential\nliability within ninety days as provided by Labor Code section 5402(b). Fremont further argues that\nCIGA had not proven special employment under the provisions of Insurance Code section 11663\nand that the doctrine of laches applied because CIGA waited over one year to file a petition for\njoinder of Fremont after CIGA assumed the claims of Superior, in liquidation.\n\nI. BACKGROUND\n1. WCAB Case No. AHM 075204 (SINGH).\n\nBy initial Application for Adjudication dated July 1, 1999, identified as WCAB Case No.\nAHM 075204, applicant Sarojini Singh alleged cumulative injury during the period from June\n1998 through February 7, 1999 bilaterally to his hands, arms, shoulders, wrists, bilaterally to his\nlower extremities and back, while employed by American Shower Door, whose insurance carrier\n\nwas unknown. An amended claim was filed by applicant on December 26, 2000, identifying an\n\nMANZANO, Manuel/SINGH, Sarojini 2\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nearlier commencement date for cumulative injury from February 7, 1998 to February 7, 1999, with\nthe same body parts shown as injured while employed by American Shower Door, and named\nCalifornia Compensation Insurance Company as the insurance carrier.\n\nAccording to the Workers\' Compensation Insurance Rating Bureau print-out, Republic was\non the risk from March 1, 1997 through March 1, 1998, with California Compensation Insurance\nCompany on the risk from March 1, 1998 through March 1, 1999. California Compensation\nInsurance Company became insolvent in September 2000, with CIGA assuming administration of\nclaims, adjusted by Kemper Employers Insurance Company, for the insolvent carrier.\n\nOn October 17, 2001, the WCJ issued an Order joining Republic as party defendant in\nresponse to CIGA\'s petition for joinder filed on September 27, 2001.\n\nPrior to the framing of issues or any trial in this matter, CIGA filed on October 22, 2001 a\npetition to be dismissed on the basis that there was "other insurance" within the meaning of the\nInsurance Code section 1063.1(c)(9)! during the period of the amended claim from February 7,\n1998 through February 7, 1999. The WCJ issued a Notice of Intention (NIT) to dismiss CIGA. An\norder dismissing CIGA issued on December 3, 2001. However, Republic had filed on November\n29, 2001 an objection to the NIT which was not brought to the WCJ\'s attention prior to the\nissuance of the dismissal order. In the objection, Republic referred to the initial Application for\n\nAdjudication for a period that was outside of its coverage and raised a statute of limitation defense\n\n1 Insurance Code section 1063.1 provides, in pertinent part,\n"As used in this article:\n\n"... (c) (1) \'Covered Claims\' means the obligation of an insolvent insurer, including the\nobligation for unearned premiums, (i) imposed by law and within the coverage of an insurance\npolicy of the insolvent insurer; (ii) which were unpaid by the insolvent insurer; (iii) which are\npresented as a claim to the liquidator in this state or to the association on or before the last date\nfixed for the filing of claims in the domiciliary liquidating proceedings; (iv) which were incurred\nprior to the date of coverage under the policy terminated and prior to, on, or within 30 days after\nthe date the liquidator was appointed; (v) for which the assets of the insolvent insurer are\ninsufficient to discharge in full; (vi) in the case of a policy of workers\' compensation insurance, to\nprovide workers\' compensation amendments thereto, not inconsistent with the provisions of this\narticle necessary to assure the fair, reasonable, and equitable manner of administering the\nassociation, and to provide for such other matters as are necessary or advisable to implement the\nprovisions of this article... .\n\n"(9) \'covered claims\' does not include (i) any claim to the extent it is covered by any other\ninsurance of a class covered by this article available to the claimant or insured."\n\nMANZANO, Manuel/SINGH, Sarojini 3\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nto applicant\'s amended Application for Adjudication filed on December 26, 2000. Republic also\nindicated that it had not received all medical records.\n\nThe dismissal Order triggered Republic\'s instant petition to vacate the order, or in the\nalternative petition for reconsideration of the WCJ\'s Order dismissing CIGA.\n\nIn her Report and Recommendation on Republic\'s petition pursuant to WCAB Rule 10860,\nthe WCJ recommended that Republic\'s petition be denied since dismissal was justified based upon\nInsurance Code section 1063.1(c)(9). The WCJ acknowledges that Republic\'s timely objection to\nher NIT did not come to her attention before the Order issued.\n\n2. WCAB Case No. LAO 778749 (MANZANO).\n\nBy Application for Adjudication filed on April 24, 2000 in LAO 778749, applicant Manuel\nManzano alleged that he sustained on July 21, 1998, industrial injury to his neck, left shoulder and\nhead (hearing), while employed by Parker Personnel, Inc., insured by Superior National Insurance\nCompany.\n\nIn September 2000, Superior went into liquidation and CIGA assumed administration of\nSuperior\'s claims. On November 9, 2001, CIGA filed a petition to join Flavurence Corporation as\nthe employer at the time of injury and its insurance carrier Fremont. CIGA also requested to be\ndismissed as a party defendant based on Insurance Code section 1063.1(c)(9). We note in this\npetition for joinder that CIGA indicated that applicant alleged two specific injuries occurring on\nJune 21, 1998 and September 11, 1998. But, only a single case number is listed (LAO 778749). In\nhis Report and Recommendation on Fremont\'s Petition for Reconsideration, the WCJ without\nexplanation also refers to these two specific injury dates.\n\nIt appears that at the time of the alleged injury dates in 1998, applicant was working as a\npacker. However, there is a potential general and special employment issue existing under\nInsurance Code section 116632 as well as the issue of Insurance Code section 1063.1(c)(9).\n2 Insurance Code section 11663 provides, in pertinent part: "As between insurers of general and\nspecial employers, one which insures the liability of the general employer is liable for the entire\ncost of compensation payable on account of injury occurring in the course of and arising out of\n\ngeneral and special employments unless the special employer had the employee on his or her\npayroll at the time of injury, in which case the insurer of the special employer is solely liable."\n\nMANZANO, Manuel/SINGH, Sarojini 4\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nFremont alleges that at the time of the injury, Parker Personnel, Inc., as a temporary employment\nagency, had placed applicant for a temporary assignment at Flavurence Corporation. It appears that\nSuperior, now in liquidation, was the workers\' compensation insurance carrier for Parker Personnel\nInc. and that Fremont was the carrier for Flavurence Corporation. Fremont alleges a general and\nspecial employment issue exists under Insurance Code section 11663, whereas CIGA asserts that at\nthe time of the injury that applicant was jointly employed by Parker Personnel, Inc. and Flavurence\nCorporation, which would relieve it of any liability under Insurance Code section 1063.1(c)(9).\n\nAt the Mandatory Settlement Conference held on December 3, 2001 pursuant to Labor\nCode section 5502(d), the WCJ joined Fremont as a party defendant and, over Fremont\'s objection,\ndismissed as party defendant CIGA for Superior National Insurance Company in liquidation, and\nits administrator Integrated Claims Administrators.\n\nThis led to the instant petition for reconsideration filed by Fremont. Fremont, on behalf of\nthe alleged special employer, alleges that Superior, and subsequently CIGA, had accepted the\ninjury (presumably injury of July 21, 1998) and was paying benefits. It is not clear if benefits\ncontinue to be "administered" in light of the aforementioned dismissal.\n\nIn his January 3, 2002 Report and Recommendation on the petition, the WCJ recommends\nthat Fremont\'s petition be denied based upon his interpretation and application of Insurance Code\nsection 11663 where Insurance Code section 1063.1(c)(9) applies. The WCJ interprets and applies\nInsurance Code section 11663 as only applying between "insurers" and not between employers.\nCIGA is not an insurer. The WCJ also concludes that a year in delaying the joinder of Fremont is\ninsufficient to support laches as asserted by Fremont.\n\nIl. DISCUSSION\n\nCIGA asserts in both of the above entitled cases that it should be dismissed since there is\n"other insurance" and thus it has no liability pursuant to Insurance Code section 1063.1(c)(9).\nCIGA cites Industrial Indemnity Co. v. Workers\' Compensation Appeals Board, California\nInsurance Guarantee Association (Garcia) (1997) 60 Cal.App.4th 548 [62 Cal.Comp.Cases 1661]\n\nin support of its position.\n\nMANZANO, Manuel/SINGH, Sarojini 5\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nIt is true that under certain circumstances dismissal of CIGA is an appropriate disposition.\nInsurance Code section 1063.1(c)(9)(i) provides that \'[c]overed claims\' [do] not include any claim\nto the extent it is covered by any other insurance of a class covered by this article [14.2] available\nto the claimant or insured [.]" This section was applied in Garcia, supra, where the employee filed\na worker\'s compensation claim for cumulative injury from November 6, 1990 through November\n6, 1991, during which time the employer had three successive workers\' compensation catriers.\nBefore trial, one carrier became insolvent and CIGA sought dismissal from the employee Garcia\'s\ncase, asserting that the claim was not a "covered claim" under Insurance Code section 1063.1(c)(9)\nbecause "other" workers\' compensation insurance was available through the other two jointly and\nseverally liable carriers. The WCJ denied CIGA\'s request for dismissal and issued awards\nproportionate to the time of coverage of the three carriers. The Board reversed and substituted a\njoint and several award against the remaining two carriers, concluding that the employee\'s claim\nwas covered by other insurance because all carriers during the period of exposure were jointly and\nseverally liable for benefits and hence the employee\'s claim was not a "covered claim" under\nsection 1063.1(c)(9). The Court of Appeal affirmed, holding, in pertinent part:\n\n"Garcia had the substantive right to collect his entire benefit award from\n\nIndustrial and SCIF since each was jointly and severally liable. Since\n\nGarcia\'s benefits claim was fully protected by solvent insurers Industrial\n\nand SCIF, both Garcia and his employer had ‘other insurance\' available\n\nwithin the meaning of Insurance Code section 1063.1, subdivision (c).\n\nHence, the award favoring Garcia against Industrial and SCIF did not\n\nconstitute a statutorily-defined ‘covered claim.\' (id. At subd. (c)(9).)\n\nAccordingly, the Board properly determined CIGA had no statutory\n\nliability for any portion of Garcia\'s award [Footnote omitted.]" (62\n\nCal.Comp.Cases 1661 at 1669)\n\nCIGA\'s reliance upon Garcia, supra, is not helpful. Unlike the instant cases, in Garcia\nthere was no dispute regarding the date of injury versus the date of last injurious exposure, or other\nthreshold issue which if determined adversely against CIGA would result in liability of CIGA. In\nGarcia there were three carriers in the period of liability under Labor Code section 5500.5 with one\n\nbecoming insolvent after a joint and several award; therefore CIGA was properly dismissed as a\n\nparty defendant since it was undisputed that there were two other carriers jointly liable during the\n\nMANZANO, Manuel/SINGH, Sarojini 6\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nperiod also covered by the insolvent carrier. It was clearly not a "covered claim" under Insurance\nCode section 1063.1(c)(9) as far as CIGA\'s liability.\n\nIn both of the cases before us, unlike the situation in Garcia, supra, there are underlying\nissues which if decided adversely against CIGA would establish its liability.\n\nWCAB Case No. AHM 075204 (SINGH) involves alleged cumulative trauma injury claim\nunder Labor Code section 5500.5. There has been no determination of the date of injury or period\nof injurious exposure based on submission of evidence; therefore, the date of injury under section\n5500.5 has yet to be determined. It is well settled that for the purposes of liability under Labor\nsection 5500.5, the relevant date is either the date of injury as defined by Labor Code section 5412,\nor the year immediately preceding the date of last injurious exposure, whichever occurs first. (See,\nWestern Growers Insurance Co. v. Workers\' Comp. Appeals Bd. (Austin) (1993) 16 Cal.App.4th\n227 [58 Cal.Comp.Cases 323, 331]; Aetna Cas. & Surety Co. v. Workmen\'s Comp. Appeals Bd.\n(Coltharp) (1973) 35 Cal. App.3d 329 [38 Cal. Comp. Cases 720].)\n\nApplicant SINGH initially alleged cumulative trauma injury during a period that the now\ninsolvent carrier was the only carrier on the risk, but later alleged injury during a period that both\nRepublic and the insolvent carrier were on the risk. Against the latter claim, Republic has raised a\nstatute of limitation defense that has not been determined. It has not been determined on evidence\nwhether the date of injury is outside of Republic\'s coverage and only within the insolvent carrier\'s\ncoverage or whether the statute of limitation defense will prevail. These are threshold issues which\nif decided adversely to CIGA, would establish CIGA\'s liability. Accordingly, it is premature to\ndismiss CIGA under these circumstances.\n\nWe recognize that in a situation where the parties stipulate to the elements of date of injury\nfor purposes of Labor Code section 5412, thereby identifying the potential entities that may be\nliable in the event that injury is found, then CIGA may be dismissed. However, this would likely be\nthe exception, rather than the rule, an issue not presently before us. Therefore, the WCJ\'s order\ndismissing CIGA in WCAB Case No. AHM 075204 (SINGH) will be rescinded and the matter\n\nreturned to the trial level.\n\nMANZANO, Manuel/SINGH, Sarojini 7\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nLikewise, in WCAB Case No. LAO 778749 (MANZANO) there are issues which if\ndecided adversely to CIGA, would result in CIGA\'s liability, unlike the situation in Garcia, supra.\nIt is disputed whether a general/special employment relationship exists under Insurance Code\nsection 11663, whether section 11663 is applicable to CIGA, or whether at the time of the alleged\ninjury applicant was jointly employed by Parker Personnel, Inc. and Flavurence Corporation for\npurposes of the application of Insurance Code section 1063.1(c)(9). These underlying disputes need\nto be determined on evidence before entertaining a motion to dismiss CIGA as a party defendant.\nIn addition, and as noted above, CIGA was previously paying benefits on this accepted injury and\nshould continue to pay, subject to a determination of party liability, and CIGA may seek\nreimbursement depending on the ultimate disposition in this case.\n\nIn passing we note that with respect to CIGA\'s potential liability pertaining to the issue of\ngeneral/special employment and Insurance Code section 1063.1, parties may wish to delay\nlitigation pending a disposition by WCJ Robert T. Hjelle of the Van Nuys District Office relating\nto Remedy Temp, Inc. and CIGA. We are informed that the narrow issue presented is CIGA\'s\nliability where a general/special employment relationship exists. We note that some two hundred\ncases have been consolidated and that it appears that two have been selected as representative cases\nfor trial and decision. These matters are presently in the discovery stage with an MSC set for\nAugust 9, 2002, 10:00 a.m., and a trial date of September 13, 2002, 10:00 a.m.\n\nFinally, we observe that generally there should be a hearing and determination of the\ninjured workers\' entitlement to benefits and liability for same, before allowing potential defendants\nto be dismissed. This approach will avoid the necessity for joining and/or rejoining parties\ndepending on the facts that develop in the record.\n\nFor reasons given above, we conclude that the orders dismissing CIGA in WCAB Case No.\nAHM 075204 (SINGH) and in WCAB Case No. LAO 778749 (MANZANO) should be rescinded\nand the matters returned to the trial level for further proceedings and decision consistent with the\nopinions express herein.\n\nFor the foregoing reasons,\n\nMANZANO, Manuel/SINGH, Sarojini 8\n\n \n\n \n\x0c \n\n20\n\n21\n\n22\n\n23\n\n24\n\n25)\n\n26\n\n27\n\n \n\n \n\nIT IS ORDERED that as the Decision After Reconsideration of the Workers’\nCompensation Appeals Board in WCAB Case No. AHM 075204 (SINGH, Sarojini), the Order\nDismissing Party Defendant CIGA of December 3, 2001 be, and hereby is, RESCINDED, and this\nmatter is RETURNED to the trial level for further proceedings and decision consistent with the\nopinions expressed herein.\n\nIT IS FURTHER ORDERED that as the Decision After Reconsideration of the Workers\'\nCompensation Appeals Board in WCAB Case No. LAO 778749 (MANZANO, Manuel), the Order\nof Dismissal of CIGA served December 3, 2001 be, and hereby is, RESCINDED, and this matter\nis RETURNED to the trial level for further proceedings and decision consistent with the opinions\nexpressed herein.\n\nWORKERS\' COMPENSATION APPEALS BOARD\n\n \n\nI CONCUR,\n\n \n\n \n\nDATED AND FILED IN SAN FRANCISCO, CALIFORNIA\n\n7/10/02\nSERVICE BY MAIL ON SAID DATE TO ALL PARTIES LISTED ON THE OFFICIAL\nADDRESS RECORD, EXCEPT LIEN CLAIMANTS.\n\ned\nMANZANO, Manuel/SINGH, Sarojini 9\n\n \n\n \n\x0c  |References|'

In [42]:
for i in response_tldr:
    print(i)

NameError: name 'response_tldr' is not defined

In [None]:
for i in response_tldr.choices:
     print(i.text)



In [None]:
!pip install openai

District Office

In [240]:
def district_office_v1(content):
    final=[]
    con=extract_sucks(content)
    new,district=extract_location(con)
    if len(district)>0:
        for each in district:
            if "District Office" in each:
                final.append(each)
    return final
    

In [241]:
def district_office_v2(content):
    str1=" "
    office=r"District Office"
    match_off = re.search(office, content)
    if match_off is None:
        office_res='NA'
    else:
        #print("hai")
        office=content[match_off.start()-50:match_off.start()].split(" ")
        new_list = [x for x in office if x != '']
        office_res=str1.join(new_list[-2:])
    return office_res
    
    
    
    

In [157]:
tp=op_ocr("input_2002_7_10_Manzano, Manuel and Singh, sarojini.pdf")

In [242]:
def extract_district_office(content):
    
    if len(district_office_v1(content))!=0:
        return district_office_v1(content)
    elif len(district_office_v2(content))!=0:
        return district_office_v2(content)
    else:
        return "NA"
    

In [243]:
extract_district_office(tp)

'Van Nuys'

Defendant

In [244]:
def defendant_v1(content):
    defendant=""
    con=extract_sucks(content)
    #print(con)
    new,district=extract_location(con)
    #print(new)
    cases,new_d=case_nos(con)
    appl=applicant(new_d)
    #print(cases)
    #print(district)
    #print(new_d)
#print(applicant(new_d))
    new_fucking_point=clear_fucking_mess(new_d)
#print(new_fucking_point)
    defendant=defan(new_fucking_point).strip("\n")
    return defendant

In [49]:
dip=op_ocr('2020_07_31_De La Cruz, Michele Guerrero.pdf')

NameError: name 'op_ocr' is not defined

In [50]:
dip

NameError: name 'dip' is not defined

In [245]:
def defendant_v2(content):
    defendant=''
    regex=r"VS.|vs."
    regex2=r"Defen"
    match = re.search(regex, content)
    match2=re.search(regex2,content)
    if match is not None:
        if match2 is not None:
            text=content[match.end():match2.start()]
            point=mess_case(text)
            defandant=point
            
    else:
        defendant=''
    return defendant
        
    

In [52]:
def extract_defendant(content):
    if len(defendant_v5(content))!=0:
        #print("j")
        return defendant_v5(content)
    elif len(defendant_v2(content))!=0:
        #print("k")
        return defendant_v2(content)
    else:
        return ""
    

In [246]:
mess_case("kinxi")

'kinxi'

In [54]:
extract_defendant(tp)

NameError: name 'tp' is not defined

In [55]:
tp[0:500]

NameError: name 'tp' is not defined

In [56]:
t[0:700]

NameError: name 't' is not defined

In [57]:
t=op_ocr("2007_09_17_Rush, Janis.pdf")


NameError: name 'op_ocr' is not defined

In [58]:
extract_defendant(t)

NameError: name 't' is not defined

Convert to Object

In [59]:
def pranjal(content,filename):
    defandant=extract_defendant(content)
    appl=applicant_extract(content)
    code=code_v1(content)
    cases=case_extract(content)
    district=extract_district_office(content)
    class Output:
        def __init__(self, defandant, appl, code,cases,district,filename):
            self.defandant = defandant
            self.appl = appl
            self.code = code
            self.cases=cases
            self.district=district
            self.filename=filename
            
    out=Output(defandant, appl, code,cases,district,filename)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data

In [60]:
# Import Libraries
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from prettytable import PrettyTable

import re
import os
import pandas as pd


In [61]:
# Set your variables
project_id = %system gcloud config get-value core/project
project_id = project_id[0]
location = 'us'           # Replace with 'eu' if processor does not use 'us' location
gcs_input_bucket  = "doc20-22"   # Bucket name only, no gs:// prefix
gcs_input_prefix  = "20-22/"                     # Input bucket folder e.g. input/
gcs_output_bucket = "doc20-22"   # Bucket name only, no gs:// prefix
gcs_output_prefix = "output/"                    # Input bucket folder e.g. output/
timeout = 300

In [62]:
from google.cloud import storage


def download_blob_into_memory(bucket_name, blob_name):
    """Downloads a blob into memory."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The ID of your GCS object
    #blob_name = "storage-object-name"

    storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(blob_name)
    contents = blob.download_as_string()

    print(
        "Downloaded storage object {} from bucket {} as the following string: {}.".format(
            blob_name, bucket_name, contents
        )
    )


In [63]:
import json
new_error=[]
f=open("pranjal-528-files.json","r")
content=f.read()
c=json.loads(content)
print(len(c))
print(type(c))
for each in c:
    #if len(each.get('defandant'))==0:
        #new_error.append(each['filename'])
    if len(each.get('appl'))==0:
        new_error.append(each['filename'])
s=set(new_error)
faltu_list=list(s)
print(len(faltu_list))

528
<class 'list'>
76


In [64]:
import json
faltu1=[]
f=open("pranjal_kar.json","r")
content=f.read()
c=json.loads(content)
print(len(c))
print(type(c))
for each in c:
    #print(type(each.get('defendant')))
    if len(each.get('defendant'))==0:
        faltu1.append(each['filename'])
    if len(each.get('applicant'))==0:
        faltu1.append(each['filename'])
    if len(each.get('code'))==0:
        faltu1.append(each['filename'])
s=set(faltu1)
faltu_list=list(s)
print(len(faltu_list))
#print(faltu_list)

print(len(faltu1))


FileNotFoundError: [Errno 2] No such file or directory: 'pranjal_kar.json'

In [65]:
def improve_list(name):
    import re
    start='_[a-zA-Z].*'
    match=re.search(start,name)
    end=".json"
    match2=re.search(end,name)
    if match is not None:
        if match2 is not None:
            return name[match.start()+1:match2.start()-2]
    #return name[match.start()+1:match2.start()-2]
    
    

In [66]:
paisa_chahiye=[]
for faltu_file in faltu_list:
    paisa_chahiye.append(improve_list(faltu_file))


In [67]:
len(paisa_chahiye)

76

In [68]:
project_id = %system gcloud config get-value core/project
project_id = project_id[0]
location = 'us'           # Replace with 'eu' if processor does not use 'us' location
gcs_new_input_bucket  = "doc20-22"   # Bucket name only, no gs:// prefix
gcs_new_input_prefix  = "20-22"

In [69]:
storage_client = storage.Client()

In [285]:
new_json=[]
blobs = storage_client.list_blobs(gcs_new_input_bucket, prefix=gcs_new_input_prefix)
input_configs = []
count=0
print("Input Files:")
count=1
for blob in blobs:
    
    if ".pdf" in blob.name:
        print(eda_file(blob.name))
        with open('ocr1.txt', 'wb') as f: 
            f.write(blob.download_as_string())
            convert_to_pdf()
        fresh_content=op_ocr('extract.pdf')
        with open('{c}.txt'.format(c=eda_file(blob.name)),'w+') as f:
            f.write(fresh_content)
        count+=1
        

        #fresh_content=op_ocr('extract.pdf')
        #with open('3.txt','w') as f:
            #f.write(fresh_content)


        
            
        #new_json.append(pranjal(fresh_content,blob.name))
        

Input Files:
Arriaga, Eliza
Avola, Maria
Dudley, Glen
Emerson, Gary
Gonzalez, Richard
Hernandez, Antonio
Jimenez, Berta (Ramirez)
Ledesma, Richard
Lee, Tracy
Munso, Nathan
Noriega, Ernesto
Ortega, Ricardo


KeyboardInterrupt: 

In [248]:
ls -lr | grep ".txt"

-rw-r--r--  1 jupyter jupyter        0 Sep 11 07:52 untitled.txt
-rw-r--r--  1 jupyter jupyter  1497051 Aug 24 17:58 text.txt
-rw-r--r--  1 jupyter jupyter        0 Aug 31 13:27 test_blob.txt
-rw-r--r--  1 jupyter jupyter   281324 Aug 24 17:54 test.txt
-rw-r--r--  1 jupyter jupyter   281324 Aug 26 20:20 tes.v1.txt
-rw-r--r--  1 jupyter jupyter   152093 Sep 14 20:09 ocr_debug.txt
-rw-r--r--  1 jupyter jupyter   178525 Oct 12 20:26 ocr1.txt
-rw-r--r--  1 jupyter jupyter  1497051 Sep  9 14:04 ocr.txt
-rw-r--r--  1 jupyter jupyter   969219 Oct 12 15:13 kya-hoga.txt
-rw-r--r--  1 jupyter jupyter   968453 Oct 12 15:20 kya-hoga-9.txt
-rw-r--r--  1 jupyter jupyter        0 Oct 12 15:13 kya-hoga-2.txt
-rw-r--r--  1 jupyter jupyter   968494 Oct 12 15:25 kya-hoga-11.txt
-rw-r--r--  1 jupyter jupyter   968448 Oct 12 15:23 kya-hoga-10.txt
-rw-r--r--  1 jupyter jupyter     1089 Oct  9 15:42 heyhey.txt
-rw-r--r--  1 jupyter jupyter  4736696 Oct 12 15:08 gandu1.txt
-rw-r--r--  1 jupyter jupyter   9467

In [249]:
def debug_jsonify(l):
    import json
    with open('pranjal-new-dip2.json', 'a', encoding='utf-8') as f:
        json.dump(l, f, ensure_ascii=False, indent=4)
    

In [73]:
len(new_json)

0

In [250]:
debug_jsonify(new_json)

In [349]:
#Batch Function-Final
new_json=[]
blobs = storage_client.list_blobs(gcs_new_input_bucket, prefix=gcs_new_input_prefix)
input_configs = []
count=0
print("Input Files:")
for blob in blobs:
    if ".pdf" in blob.name:
        print(eda_file(blob.name))
        count+=1
        print(count)
        with open('ocr1.txt', 'wb') as f: 
            f.write(blob.download_as_string())
            convert_to_pdf()
        fresh_content=op_ocr('extract.pdf')
            #bol=applicant_checker(fresh_content)
            #print(bol)
        new_json.append(pranjal_updated(fresh_content,blob.name,gcs_input_bucket))
            
            
            #new_json.append(sabkaammaihikarukya(content,blob.name))
            #print(extract(content,blob.name))
            #new_json.append(extract(content,blob.name))
            
             
            


            #op_ocr(link)

Input Files:
Arriaga, Eliza
1
Avola, Maria
2
Dudley, Glen
3


KeyboardInterrupt: 

In [354]:
jsonify(new_json)

In [291]:
print("hello")

hello


In [77]:
tag

NameError: name 'tag' is not defined

In [251]:
con=extract_sucks(new_format_test)
new,district=extract_location(con)
#print(new)
cases,new_d=case_nos(new)
#print(cases)
#print(district)
#print(new_d)
#print(applicant(new_d))
new_fucking_point=clear_fucking_mess(new_d)
#print(new_fucking_point)
defandant=defan(new_fucking_point).strip("\n")
print(defandant)

NameError: name 'new_format_test' is not defined

In [252]:
fresh_content=op_ocr('extract.pdf')
extract(fresh_content,'jdkd/jkdj')

{'defendant': ' FOR RECONSIDERATION\nAND DECISION AFTER\nSOUTHERN CALIFORNIA EDISON; RECONSIDERATION\n\nPermissibly Self-Insured,\n\n  \n\n',
 'applicant': ' \n\n—\n\noO FN DH BR WD bw\n\n \n\n \n\n \n\nWORKERS’ COMPENS',
 'code': ['section 4661', 'section 4903'],
 'filename': 'input'}

In [80]:
sabkaammaihikarukya(content,blob.name)

NameError: name 'sabkaammaihikarukya' is not defined

In [81]:

def convert_to_pdf():
    file = open('extract.pdf', 'wb')
    for line in open('ocr1.txt', 'rb').readlines():
        file.write(line)
    file.close()
    '''
    
    with open('ocr.txt', 'rb') as f:
        b=f.read().decode('base64')
    text_file = open('result.pdf', "w")
    text_file.write(b)
    text_file.close()
    '''

In [82]:
new_json

[]

In [83]:
def eda_file(f):
    import re
    start='_[a-zA-Z].*'
    match=re.search(start,f)
    end=".pdf"
    match2=re.search(end,f)
    if match is not None:
        if match2 is not None:
            return f[match.start()+1:match2.start()]
    

In [84]:
if 'Kroepil Joa' in paisa_chahiye:
    print('yes')

In [85]:
!pip install opencv-python
!pip install pdf2image
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2build2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-utils is already the newest version (0.86.1-0ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [86]:
from PIL import Image
import pytesseract

from pdf2image import convert_from_path
def op_ocr(li):
    s=''
    images = convert_from_path(li)
    for i in images:
        text = pytesseract.image_to_string(i, lang="eng")
        s=s+text
    return(s)

        

'''

# Store Pdf with convert_from_path function
images = convert_from_path('form.pdf')
print(type(images))
'''

"\n\n# Store Pdf with convert_from_path function\nimages = convert_from_path('form.pdf')\nprint(type(images))\n"

In [87]:
#getting 1 image
from PIL import Image
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
def debug_test(li):
    s=''
    images = convert_from_path(li)
    print(images)
    count=0
    for i in images:
        count+=1
        if count!=len(images)-5:
            pass
        i.save("sample.png", "")

    


In [88]:
debug_test('2006_5_17_Hoffman, John H..pdf')

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDEA08E3F10>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE9A004390>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE9A004BD0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FC7F10>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE9A16DED0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD6250>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD6290>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD62D0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD65D0>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD6210>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD6310>, <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x7FDE99FD6350>, <PI

In [120]:
def extract(content,filename):
    l=[]
    import re
    defendant=''
    regex=r"VS.|vs."
    regex2=r"Defendants"
    match = re.search(regex, content)
    match2=re.search(regex2,content)
    app_regex=r"A|applicant"
    pdf_extract=r"/[0-9].*"
    matches = re.search(pdf_extract, filename)
    #pdf=matches.group(0)
    identifier='input'
    
    #print(match.group(0))
    
    match_app = re.search(app_regex, content)
    if match_app is None:
        applicant=''
    else:
        applicant=content[0:match_app.start()].strip('\n')
    if match is not None:
        if match2 is not None:
            defendant=content[match.end():match2.start()]
    else:
        defendant=''
        
        
         
    
    
    code_reg=r"section [0-9]+"
    code = re.findall(code_reg, content)
    
    class Output:
        def __init__(self, defendant, applicant, code,filename):
            self.defendant = defendant
            self.applicant = applicant
            self.code = code
            self.filename=filename
    out=Output(defendant,applicant,code,identifier)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data
    #print(data)
    
    

In [353]:
def jsonify(l):
    import json
    with open('pranjalbatch.json', 'a', encoding='utf-8') as f:
        json.dump(l, f, ensure_ascii=False, indent=4)
    

Testing @ 02-09-2022

Testing

In [91]:
kya_format_test=op_ocr('2019_11_14_Colamonico, Ashley.pdf')

In [92]:
extract(format_test,'file/output/78455/kMdkdjk_dkdd.pdf')

NameError: name 'format_test' is not defined

In [254]:
new_format_test

NameError: name 'new_format_test' is not defined

In [None]:
rd_format_test=op_ocr('2019_9_11_Pau, Puni.pdf')

In [None]:
new_extract(rd_format_test,'file/output/78455/Significant_test!.pdf')

In [253]:
new_format_test=op_ocr('2019_09_4_Olague, Michael.pdf')

KeyboardInterrupt: 

In [None]:
new_extract(new_format_test,'file/output/78455/Regular1.pdf')

In [None]:
new_format_test

In [255]:
def new_extract(content,filename):
    l=[]
    #print(content)
    import re
    defendant=''
    regex=r"VS.|vs."
    regex2=r"Defendan"
    match = re.search(regex, content)
    match2=re.search(regex2,content)
    app_regex=r"Applicant"
    case=r"Case .*"
    pdf_extract=r"/[0-9].*"
    matches = re.search(pdf_extract, filename)
    #pdf=matches.group(0)
    identifier='input'
    
    
    match_app = re.search(app_regex, content)
    #print(match.group(0))
    new=r"[a-zA-Z]+. [a-zA-Z]+."
    text=content[0:match_app.start()]
    #print(text)
    print(content[:match2.start()])
    
    text_r,district_office=extract_location(text)
    # print(text_r)
    # print(district_office)
    case_match=re.search(case,text_r)
    efficient_app=re.search(new,text_r)
    
    
    if case_match is None:
        case=''
    else:
        case=text_r[case_match.start():case_match.end()]
   
    if match_app is None:
        applicant=''
    else:
        #applicant=content[0:match_app.start()]
        print(text_r[efficient_app.start():efficient_app.end()])
        applicant=text_r[case_match.end():].strip('\n')
    if match is not None:
        if match2 is not None:
            defendant=content[match.end():match2.start()]
    else:
        defendant=''
        
        
         
    
    
    code_reg=r"section [0-9]+"
    code = re.findall(code_reg, content)
    code_set=set(code)
    code=list(code_set)
    
    class Output:
        def __init__(self, defendant, applicant, code,case,district_office,filename):
            self.defendant = defendant
            self.applicant = applicant
            self.code = code
            self.case=case
            self.district_office=district_office
            self.filename=filename
            
    out=Output(defendant,applicant,code,case,district_office,identifier)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data
    #print(data)
    
    

In [256]:
!pip install nltk
import nltk
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return "".join(y)



In [95]:
def extract_location(con):
    pattern=r"\(.*\)"
    district=re.findall(pattern,con)
    new_con=re.sub(pattern, '', con)
    return new_con,district

In [96]:
extract_location('caseno.adj12080707(vannuydistrictoffice)michaelolague,applicant,')

('caseno.adj12080707michaelolague,applicant,', ['(vannuydistrictoffice)'])

In [97]:
case=r"Case No. .*"
test="—\noO\n\n11\n\n \n\nWORKERS’ COMPENSATION APPEALS BOARD\nSTATE OF CALIFORNIA\n\n \n     \n   \n  \n  \n   \n\nCase No. ADJ12080707\n\n\nMICHAEL OLAGUE,"
matches = re.search(case,test)
print(test[matches.start():matches.end()])


Case No. ADJ12080707


In [98]:
def extract_sucks(content):
    import re
    defendant=''
    #regex=r"VS.|vs."
    till=r"Defendan"
    #match = re.search(regex, content)
    match2=re.search(till,content)
    if match2 is not None:
        return content[:match2.start()]
    return defendant
    

In [99]:
extract_sucks(format_test)

NameError: name 'format_test' is not defined

In [None]:
extract_sucks(new_format_test)

In [257]:
con=extract_sucks(new_format_test)
new,district=extract_location(con)
#print(new)
cases,new_d=case_nos(new)
#print(cases)
#print(district)
#print(new_d)
#print(applicant(new_d))
new_fucking_point=clear_fucking_mess(new_d)
#print(new_fucking_point)
defandant=defan(new_fucking_point).strip("\n")
print(defandant)

NameError: name 'new_format_test' is not defined

In [258]:
def case_nos(con):
    pattern=r"ADJ[0-9]+"
    cases=re.findall(pattern,con)
    new_con=re.sub(pattern, '', con)
    #new_con=re.sub(pattern, '', con)
    return cases,con

In [259]:
def applicant(con):
    applicant=""
    app_regex=r"A|applic"
    case=r"Case .*"
    match_app = re.search(app_regex, con)
    case_match=re.search(case,con)
    if match_app:
        if case_match:
            applicant=con[case_match.end():match_app.start()]
            
    return applicant.strip('\n')

In [102]:
def defan(con):
    regex=r"VS.|vs."
    defandant=""
    #regex2=r"Defendan"
    match = re.search(regex, con)
    #match2=re.search(regex2,con)
    if match is not None:
        defandant=con[match.end():]
    return defandant
    

In [103]:
def sabkaammaihikarukya(content,filename):
    con=extract_sucks(content)
    #print(con)
    new,district=extract_location(con)
    #print(new)
    cases,new_d=case_nos(con)
    appl=applicant(new_d)
    #print(cases)
    #print(district)
    #print(new_d)
#print(applicant(new_d))
    new_fucking_point=clear_fucking_mess(new_d)
#print(new_fucking_point)
    defandant=defan(new_fucking_point).strip("\n")
    #print(defandant)
    #print(new_fucking_point)
    code_reg=r"section [0-9]+"
    code = re.findall(code_reg, content)
    code_set=set(code)
    code=list(code_set)
    
    class Output:
        def __init__(self, defandant, appl, code,cases,district,filename):
            self.defandant = defandant
            self.appl = appl
            self.code = code
            self.cases=cases
            self.district=district
            self.filename=filename
            
    out=Output(defandant, appl, code,cases,district,filename)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data

In [104]:
sabkaammaihikarukya(content,'filename')

NameError: name 'clear_fucking_mess' is not defined

In [None]:
sabkaammaihikarukya(new_format_test,'2019_11_14_Colamonico, Ashley.pdf')

In [None]:
sabkaammaihikarukya(new_format_test,'2019_09_4_Olague, Michael.pdf')

In [None]:
sabkaammaihikarukya(rd_format_test,'2019_9_11_Pau, Puni.pdf')

In [None]:
co=op_ocr("input_2002_7_10_Manzano, Manuel and Singh, sarojini.pdf")

In [None]:
co[0:500]

In [None]:
extract_defendant(co)

In [None]:
xx=op_ocr("2007_09_17_Rush, Janis.pdf")

In [None]:
xx[0:500]

In [None]:
nono=op_ocr("2007_09_17_Cordell, Dean.pdf")

In [105]:
nono[0:1000]
import re
r='YS.'
match=re.findall(r,nono[0:1000])
match

NameError: name 'nono' is not defined

In [None]:
extract_defendant(nono)

In [None]:
defendant_v5(pp)

In [None]:
defendant_v5(nono)

In [106]:
def defendant_v5(content):
    d=''
    regex=r"VS.|vs.|Vs.|ys.|Ys.|YS. "
    regex2=r"Defen"
    match = re.search(regex, content)
    match2=re.search(regex2,content)
    #print(match2)
    #print(match)
    if match is not None:
        if match2 is not None:
            text=content[match.end():match2.start()].replace("\n"," ")
            #print(text)
            point=mess_case(text)
            #print(point)
            d=point
            
    else:
        d=''
    return d
        
    

In [339]:
def reference(content):
    refat=r"\(.*?\)"
    rege=r"v\."
    rege2=r"Cal."
    match=re.search(rege,content)
    dekho=re.search("see also",content)
    if dekho is not None:
        
    print(match.group())
    mota=re.findall(refat,content)
    match2=re.search(rege2,content)
    text=content[match.end():match2.start()]
    print(text)
    return (mota)

In [318]:
tutu='''
A WC is required to “make and file findings upon all facts involved in the controversy and an
award, order, or decision stating the determination as to the rights of the parties. Together with the
findings, decision, order or award there shall be served upon all the parties to the proceedings a summary
of the evidence received and relied upon and the reasons or grounds upon which the determination was
made.” (Lab. Code, § 5313; see also Blackledge v. Bank of America, ACE American Insurance Company
(Blackledge) (2010) 75 Cal.Comp.Cases 613, 621-22.) Appeals Board Rule 10566(c)! mandates that the
Minutes of Hearing and Summary of Evidence include: “All interlocutory orders, admissions and
stipulations, the issues and matters in controversy, a descriptive listing of all exhibits received for
identification or in evidence ...” (Cal. Code Regs. tit. 8, § 10566(c).)

Here, it appears the WCJ allowed the parties too much leeway in framing issues to be submitted
for decision and recorded the parties’ arguments as issues. The third issue is “as stated by the applicant”'''

In [340]:
reference(error)

v.
 Eastside Reservoir
Project etc. (1997) 62 


['(EMCOR)',
 '(Reed)',
 '(elbow)',
 '(b)',
 '(Specialty)',
 '(b)',
 '(1997)',
 '(panel decision)',
 '(Becerra)',
 '(b)',
 '(commencing with Section 3200)',
 '(Emphasis added.)',
 '(Johnson-Peltier)',
 '(1998)',
 '(Costa)',
 '(/d.)',
 '(Becerra, supra.)',
 '(b)',
 '(b)',
 '(Russell)',
 '(2001)',
 '(writ den.)',
 '(Henry)',
 '(2001)',
 '(writ den.)',
 '(1996)',
 '(writ den.)',
 '(1998)',
 '(writ den.)',
 '(Reynoso)',
 '(1997)',
 '(writ den.)',
 '(Coker)',
 '(writ den.)',
 '(Mesecher v. County of San Diego (1992)',
 '(1999)',
 '(1999 supp.)',
 '(Civ. Code, § 3515.)',
 '(Civ. Code, § 3516.)',
 '(Civ. Code, § 3527.)',
 '(b)']

In [107]:
words=['OPINION','AND','DECISION','AFTER','RECONSIDERATION','PETITION','RECONSIDERATION','REMOVAL','DECISION AFTER REMOVAL','ORDER','GRANTING', 'FOR']
def clear_fucking_mess(con):
    for i in words:
        con = con.replace(i,'')
    return con

In [108]:
words=['OPINION',' AND ','DECISION',' AFTER ','RECONSIDERATION','PETITION','RECONSIDERATION','REMOVAL',' DECISION AFTER REMOVAL ',' ORDER ','GRANTING', 'FOR ']
def clear_fucking_mess(con):
    for i in words:
        con = con.replace(i,'')
    return con
    

In [109]:
f=open("pranjal-528-files.json","r")
debug=f.read()
print(type(debug))

<class 'str'>


In [110]:
import json
new_error=[]
f=open("pranjal-528-files.json","r")
content=f.read()
c=json.loads(content)
print(len(c))
print(type(c))
for each in c:
    #if len(each.get('defandant'))==0:
        #new_error.append(each['filename'])
    if len(each.get('defandant'))==0:
        new_error.append(each['filename'])
s=set(new_error)
faltu_list=list(s)
print(len(faltu_list))

528
<class 'list'>
42


In [111]:
for wach in faltu_list:
    

SyntaxError: unexpected EOF while parsing (2005341762.py, line 2)

In [None]:

debug_faltu=[]
for each in faltu_list:
    debug_faltu.append(eda_file(each))

In [121]:
#Debug
new_json=[]
blobs = storage_client.list_blobs(gcs_input_bucket, prefix=gcs_input_prefix)
input_configs = []
count=0
print("Input Files:")
for blob in blobs:
    if ".pdf" in blob.name:
        count+=1
        

        if eda_file(blob.name) in debug_faltu:
            print(eda_file(blob.name))
            #source="gs://{bucket}/{name}".format(bucket=gcs_input_bucket,name=blob.name)
            #source2="https://storage.cloud.google.com/{bucket}/{file}".format(bucket=gcs_input_bucket,file=blob.name)
            #print(blob)
            with open('ocr_debug.txt', 'wb') as f: 
                f.write(blob.download_as_string())
                convert_to_pdf_debug()
            
            fresh_content=op_ocr('extract_debug.pdf')
            #bol=applicant_checker(fresh_content)
            #print(bol)
            if(len(defendant_v5(fresh_content))==0):
                print("No")
            else:
                print("yes")
            #print(defendant_v5(fresh_content))
            
            #new_json.append(pranjal(fresh_content,blob.name))
            
            
            #new_json.append(sabkaammaihikarukya(content,blob.name))
            #print(extract(content,blob.name))
            #new_json.append(extract(content,blob.name))
            
          



Input Files:


NameError: name 'debug_faltu' is not defined

In [None]:
count

In [260]:

def convert_to_pdf_debug():
    file = open('extract_debug.pdf', 'wb')
    for line in open('ocr_debug.txt', 'rb').readlines():
        file.write(line)
    file.close()

In [None]:
nono=op_ocr("2007_09_21_Mestas, Michael J..pdf")

In [None]:
m=op_ocr("input_2002_7_10_Manzano, Manuel and Singh, sarojini.pdf")

In [214]:
#words=['OPINION','AND','DECISION','AFTER','RECONSIDERATION','PETITION','RECONSIDERATION','REMOVAL','DECISION AFTER REMOVAL','ORDER','GRANTING', 'FOR']
def mess_case(con):
    for i in consider_words:
        con = con.replace(i+" ","")
    return con
    

In [None]:
import re

In [None]:
nono[0:1000]

In [None]:
defendant_v6(nono)

In [None]:
c=op_ocr("2007_08_08_Olfatpour, Behzad.pdf")
defendant_v5(c)

In [None]:
c[0:1000]

In [None]:
faltu_list

In [261]:
def defendant_v6(content):
    d=''
    regex=r"VS. |vs. |Vs. |ys. |Ys. |YS. "
    regex2=r"Defendant\(s\)."
    match = re.findall(regex, content)
    match2=re.findall(regex2,content)
    print(match2)
    print(match)
    '''
    if match is not None:
        if match2 is not None:
            text=content[match.end():match2.start()]
            #print(text)
            point=clear_fucking_mess(text)
            #print(point)
            d=point
            
    else:
        d=''
    return d
    '''    
    

In [None]:
matha=r"Defendant\(s\)."
print(re.findall(matha,c))

In [262]:
s='''
OPINION AND ORDER
DENYING PETITION FOR
RECONSIDERATION AND
ORDER OF REMOVAL ON
BOARD'S MOTION WITH
NOTICE DISMISSING OF INTENTION TO
IMPOSE SANCTIONS OPINION AND ORDER
DISMISSING DENYING PETITION
FOR RECONSIDERATION OPINION AND ORDER
GRANTING RECONSIDERATION
AND DECISION AFTER
RECONSIDERATION OPINION AND ORDERS DISMISSING
PETITION FOR
RECONSIDERATION, AND
GRANTING REMOVAL ON MOTION
OF THE APPEALS BOARD AND
DECISION AFTER REMOVAL OPINION AND ORnER
DENYING DISQUALIFICATION OPINION AND DECISION
AFTER REMITTITUR ORDER DENYING
RECONSIDERATION ORDER VACATING ORDER 
GRANTING RECONSIDERATION, 
OPINION AND ORDER GRANTING 
REMOVAL AND DECISION 
AFTER REMOVALOPINION AND ORDER 
DISMISSING PETITION
FOR REMOVAL OPINION AND ORDERS DISMISSING
PETITION FOR REMOVAL;
GRANTING RECONSIDERATION AND 
DECISION AFTER
RECONSIDERATION OPINION AND ORDER GRANTING
PETITION FOR RECONSIDERATION
AND DECISION AFTER
RECONSIDERATION ORDER DISMISSING
PETITION FOR
RECONSIDERATION
AND DENYING REMOVAL'''

In [212]:
si=s.strip("\n").split(" ")

In [213]:
consider_words=si

In [115]:
import json
takram=[]
new_error=[]
f=open("pranjal-528-files.json","r")
content=f.read()
c=json.loads(content)
print(len(c))
print(type(c))
for each in c:
    if len(each.get('appl'))!=0:
        if len(each.get('defandant'))!=0:
            if len(each.get('code'))!=0:
                takram.append(eda_file(each['filename']))
                
print(len(takram))

528
<class 'list'>
316


In [202]:
takram

['Lucena, Isidoro A.',
 'Hall, Les',
 'Lee, Alfred R.',
 'Motallebi, Shanin',
 'Yee Sanchez Donna and Piatt, Natalie',
 'Vega, Noe',
 'Grom, Kenneth',
 'Lett, John',
 'Messinese, Thomas',
 'Diggle, Wilma',
 'Reyes, Jose',
 'Erickson, Caryl',
 'Ward, Kathy',
 'Robbins, Catherine',
 None,
 'Rathke, Scott',
 'Brent, Daniel',
 'Carrillo, Leonardo',
 'Hayden, Julie',
 'James, Ronald',
 'Lopez, Moises',
 'Pace, Latonia',
 'Pini, Ronald',
 'Ruiz, Antonio',
 'Tabarez, Tina',
 'Bell, Paulette',
 'Cariaso, Felino',
 'Celis, Pedro',
 'Ellison, Michael',
 'Hein, Arthur P.',
 'Holzmeister, Karen',
 'Kos, Deborah',
 'Montes, Dulce',
 'Rodriguez, Ricardo Ramirez',
 'Soyinthisane, Amphavanh',
 'Stokes, Kimberly',
 'Clarke, Larry',
 'Marshall, Terry',
 'Mendez, Araceli',
 'Padgett, Gerald',
 'Sanjoro, Jesus',
 'Stroth, Janet',
 'Thompson, Patricia L.',
 'Devore, Gregory',
 'Gerald, Katherine',
 'Herrera, Luis',
 'Caballero, Felipe',
 'Medina, Danny',
 'Meyer, Wilfried',
 'Ortiz, Patricia A.',
 'Rojas, 

In [347]:
def pranjal_updated(content,filename,gcs_input_bucket):
    try:
        defandant=extract_defendant(content)
        appl=applicant_endgame(filename)
        code=code_v1_new(content)
        cases=case_extract(content)
        district=extract_district_office(content)
        date=date_of_case(filename)
        decision=case_decision(content)
        raw=content
        case_name=gcs_input_bucket
    except:
        print("Error")
        return
    class Output:
        def __init__(self, defandant, appl, code,cases,district,date,decision,raw,filename,case_name):
            self.defandant = defandant
            self.appl = appl
            self.code = code
            self.cases=cases
            self.district=district
            self.date=date
            self.decision=decision
            self.raw=raw
            self.filename=filename
            self.case_name=case_name
            
    out=Output(defandant, appl, code,cases,district,date,decision,raw,filename,case_name)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data

In [263]:
pranjal_updated(error,"2007_09_21_Gallegos, Armando")

Error


In [206]:
error=op_ocr("2007_09_21_Gallegos, Armando.pdf")

In [274]:
applicant_endgame("2007_09_21_Gallegos, Armando.pdf")

[' Armando Gallegos']

In [275]:
code_v1_new(error)

['section 3201',
 'section 5275',
 'section 5275',
 'section 3201',
 'section 3201',
 'section 3201',
 'section 5275',
 'section 5275']

In [276]:
case_extract(error)

['SDO 0341777', 'SEP21']

In [277]:
extract_district_office(error)

'NA'

In [278]:
date_of_case("2007_09_21_Gallegos, Armando.pdf")

'21/09/2007'

In [279]:
case_decision(error)

['For foregoing IS ORDERED decision reconsideration Board Award ']

In [284]:
pranjal_updated(error,"2007_09_21_Gallegos, Armando.pdf")

{'defandant': '  A.O, REED & COMPANY; ST. PAUL TRAVELERS INSURANCE; UNIVERSAL MECHANICAL (EMCOR), AMERICAN CASUALTY COMPANY READING, PA., adjusted by SPECIALTY RISK SERVICES,  ',
 'appl': [' Armando Gallegos'],
 'code': ['section 3201',
  'section 5275',
  'section 5275',
  'section 3201',
  'section 3201',
  'section 3201',
  'section 5275',
  'section 5275'],
 'cases': ['SDO 0341777', 'SEP21'],
 'district': 'NA',
 'date': '21/09/2007',
 'decision': ['For foregoing IS ORDERED decision reconsideration Board Award '],
 'raw': "1 |\n\nn Ww\n\nco\n\n10\n\n14q\n\n13\n14\n15\n16\n17\n18\n19\n20)\n\nv\n\n24\n2g\n23\n2al\n25)\n24!\n27\n\nWORKERS’ COMPENSATION APPEALS BOARD\n\nSTATE OF CALIFORNIA\n\n \n\n \n\n \n\nCase No. SDO 0341777\n\nARMANDO GALLEGOS,\nApplicant, OPINON AND DECISION\nAFTER RECONSIDERATION\nys.\n\nA.O, REED & COMPANY; ST. PAUL\nTRAVELERS INSURANCE; UNIVERSAL\nMECHANICAL (EMCOR), AMERICAN\nCASUALTY COMPANY OF READING, PA.,\nadjusted by SPECIALTY RISK SERVICES,\n\nDefendants.

In [118]:
import json
import re

f=open("pranjal-new-dip2.json","r")
content=f.read()
c=json.loads(content)
order = r'[0-9]'

filtered= re.sub(order,'',c)
print(filtered)
    



TypeError: expected string or bytes-like object

Cleaning Order Decisison

In [122]:
!pip install pytextrank



In [123]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [124]:
import spacy
import pytextrank

In [153]:
import pathlib
text = pathlib.Path("heyhey.txt").read_text()
text1='''
0:" For the foregoing reasons, IT IS ORDERED, as the Decision After Reconsideration of the Workers’ Compensation Appeals Board, that the January 25, 2013 Orders Dismissing Liens for Failure to Pay Activation Fee Prior to Lien Conference are RESCINDED, and jurisdiction of this matter is RETURNED to the trial level for further proceedings and decision consistent with this opinion."
1:" ffl ffl Hf MT /if ‘td fll ALTAMIRANO, Marco 2 Co OD DD NW BR WwW PB = Be YP BY BP BY YP BD DY me Be KB — IT IS FURTHER ORDERED that QUALIFIED BILLING AND COLLECTIONS, LLC and DIEGO S."
'''


In [126]:
!pip install summa



In [127]:
from summa import summarizer
from summa import keywords

In [155]:
summarizer.summarize(text1,words=0)

'0:" For the foregoing reasons, IT IS ORDERED, as the Decision After Reconsideration of the Workers’ Compensation Appeals Board, that the January 25, 2013 Orders Dismissing Liens for Failure to Pay Activation Fee Prior to Lien Conference are RESCINDED, and jurisdiction of this matter is RETURNED to the trial level for further proceedings and decision consistent with this opinion."\n1:" ffl ffl Hf MT /if ‘td fll ALTAMIRANO, Marco 2 \x0cCo OD DD NW BR WwW PB = Be YP BY BP BY YP BD DY me Be KB — IT IS FURTHER ORDERED that QUALIFIED BILLING AND COLLECTIONS, LLC and DIEGO S."'

In [174]:
!pip install sumy



In [172]:
!pip install summa



In [173]:
from summa import summarizer
from summa import keywords

In [344]:
import pandas as pd
import csv
pk=pd.read_csv("pranjal-batch_13thoct-2022batch.csv", quoting=csv.QUOTE_NONE,on_bad_lines='skip')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [345]:
pk.head()

Unnamed: 0,defandant,appl/0,cases/0,district/0,date,decision/0,raw,filename,code/0,code/1,...,code/180,code/181,code/182,code/183,decision/4,decision/5,decision/6,cases/22,cases/23,cases/24
0,""" 7 || CALIFORNIA TRUCK EQUIPMENT COMPANY; STA...","9 """,Eliza Arriaga,ADJ7628890,(Los Angeles District Office),03/01/2020,For foregoing IS ORDERED Petition Reconsiderat...,"""",,,...,,,,,,,,,,
1,—,,,,,,,,,,...,,,,,,,,,,
2,WORKERS’ COMPENSATION APPEALS BOARD,,,,,,,,,,...,,,,,,,,,,
3,2 STATE OF CALIFORNIA,,,,,,,,,,...,,,,,,,,,,
4,3,,,,,,,,,,...,,,,,,,,,,
