In [2]:
import pandas as pd
from pyodbc import connect
from copy import deepcopy
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from flashtext import KeywordProcessor
from symspellpy import SymSpell,Verbosity
from nltk import edit_distance, corpus
from itertools import chain, combinations
from os import path
import re
import numpy as np
from collections import Counter

# _conn_prepro_pyodbc = connect(f"DSN={'PREPRO_DEV_dbsed4754_al'};UID={'rrtml_alrwx_d'};PWD={'QtrG:&3mAhaNzJ'}")
# _conn_prepro = create_engine("mssql+pyodbc://rrtml_alrwx_d:QtrG:&3mAhaNzJ@PREPRO_DEV_dbsed4754_al")
_conn_dde = create_engine("mssql+pyodbc://rrtml_ddepprwx_p:7NFj=Ajt@PREPRO_PROD_dbsep6458cls_dde")

def remove_hyphen_from_memid(p_memid):
    if '-' in str(p_memid):
        p_memid = str(p_memid).split("-")[0]
    return str(p_memid) if len(p_memid) <= 9 else str(p_memid)[:9]


def gps_zfill(memid):
    if len(str(memid)) < 9:
        memid = str(memid).zfill(9)
    return str(memid)

In [3]:
def filter_dde_data_for_cids(cids, _filter=True):
    dde = get_dde_data_for_cids(cids)
    _unq = dde['CaseID'].nunique()
    if _filter:
        _duplicates = dde.shape[0] - dde['CaseID'].nunique()
        dde = dde.drop_duplicates(subset=['CaseID'], keep=False).reset_index(drop=True)
        return dde, _unq, _duplicates
    return dde, _unq, 0

def _read_sql_table(tablename, where_condition=None, con=None):
    if not where_condition:
        return pd.read_sql_query(f"SELECT * FROM [dbo].[{tablename}] with (nolock)", con)
    return pd.read_sql_query(f"SELECT * FROM [dbo].[{tablename}] with (nolock) WHERE {where_condition}", con)

In [4]:
def search_pattern_in_content(full_content, p_search_pattern, flavour='extra', search_with_block=False):
    if search_with_block:
        if type(p_search_pattern) == list:
            match_list = list(chain(*[re.findall(r'\b' + str(ele) + r'\b', full_content) for ele in p_search_pattern]))
        else:
            match_list = re.findall(r'\b' + str(p_search_pattern) + r'\b', full_content)
        return match_list
    processor = KeywordProcessor()
    if flavour == 'extra':
        if type(p_search_pattern) == list:
            processor.add_keywords_from_list(list(map(lambda x: str(x).lstrip('0'), p_search_pattern)))
        else:
            processor.add_keyword(str(p_search_pattern).lstrip('0'))
    else:
        if type(p_search_pattern) == list:
            processor.add_keywords_from_list(p_search_pattern)
        else:
            processor.add_keyword(str(p_search_pattern))
    match_list = processor.extract_keywords(full_content)
    return match_list

def format_file(file_content, doc_source, _remove='alpha', remove_stopwords=True):
    _remove = _remove.lower()
    if _remove not in ['alpha', 'digit', 'specialchar', 'nothing']:
        _remove = 'specialchar'
    if type(file_content) == list:
        file_content = '\n'.join(file_content)
    if doc_source == 'icare':
        if file_content.count(ocr_page_sep) > 0:
            file_content = file_content.split(ocr_page_sep)[2:]
            file_content = str(ocr_page_sep).join(file_content)
    if _remove == 'alpha':
        file_content = re.sub("[^ 0-9]", ' ', file_content)
        file_content = ' '.join(map(correct_claimid, file_content.split()))
        return file_content
    elif _remove == 'digit':
        file_content = re.sub("[^ a-zA-Z]", ' ', file_content)
        file_content = ' '.join(file_content.split()).lower()
    elif _remove == 'specialchar':
        file_content = re.sub("[^ a-zA-Z0-9]", ' ', file_content)
        file_content = ' '.join(file_content.split()).lower()
    elif _remove == 'nothing':
        file_content = ' '.join(file_content.split()).lower()
    else:
        pass
    if remove_stopwords:
        file_content = ' '.join([word for word in file_content.split() if word not in stopwords_list])
    return file_content

def get_pageno_for_keywords(full_content, key_words, page_sep='page break for ml processing', p_count_starts_from=0):
    pno = []
    for key in key_words:
        if type(key) == tuple:
            pno.append(
                [len(search_pattern_in_content(full_content[:ele.span()[0]], page_sep)) + p_count_starts_from for ele in
                 re.finditer(r'\b' + str(key[0]) + r'\b', full_content)])
        else:
            pno.append(
                [len(search_pattern_in_content(full_content[:ele.span()[0]], page_sep)) + p_count_starts_from for ele in
                 re.finditer(r'\b' + str(key) + r'\b', full_content)])
    return pno

In [5]:
from pickle import load
df = load(open("CaseDetailsDataSet_12.01-01.31.pkl", 'rb')).iloc[:15000, :]
print(df.shape)
df.head()

(15000, 2)


Unnamed: 0,caseid,content
0,CA15FEFA123B6CB.txt,ph from page fax transmission to unitedhealth ...
1,OFF5FEF93A2A4DB.txt,am uhc to fax from sinuva connect patient supp...
2,CA15FEF677C6889.txt,jan from clara kramer uheeler phone faxzero co...
3,CA15FEF60F2BE86.txt,jan nicholas diaz uin ted cara fi fcsxqs uotm ...
4,CA15FEF5627AA2D.txt,fsp pm page fax server fax to company fax phon...


# ______

# Expedited FP cases only

In [6]:
# PC - 2336,2359,2334,2368,23004
# PD - 2310,2311,2338

dataset = df.copy()
dataset = dataset.rename(columns={'content': 'doc_content'})

print(dataset.shape)
dataset.head()

(15000, 2)


Unnamed: 0,caseid,doc_content
0,CA15FEFA123B6CB.txt,ph from page fax transmission to unitedhealth ...
1,OFF5FEF93A2A4DB.txt,am uhc to fax from sinuva connect patient supp...
2,CA15FEF677C6889.txt,jan from clara kramer uheeler phone faxzero co...
3,CA15FEF60F2BE86.txt,jan nicholas diaz uin ted cara fi fcsxqs uotm ...
4,CA15FEF5627AA2D.txt,fsp pm page fax server fax to company fax phon...


###### ______

In [7]:
from nltk import corpus
STOPWORDS = list(set(corpus.stopwords.words('english')))

def filter_stopword(sentence):
    return ' '.join([word for word in sentence.split() if word not in STOPWORDS])

In [8]:
dataset['doc_content_fil'] = dataset['doc_content'].apply(filter_stopword)
print(dataset.shape)
dataset.head()

(15000, 3)


Unnamed: 0,caseid,doc_content,doc_content_fil
0,CA15FEFA123B6CB.txt,ph from page fax transmission to unitedhealth ...,ph page fax transmission unitedhealth care sou...
1,OFF5FEF93A2A4DB.txt,am uhc to fax from sinuva connect patient supp...,uhc fax sinuva connect patient support program...
2,CA15FEF677C6889.txt,jan from clara kramer uheeler phone faxzero co...,jan clara kramer uheeler phone faxzero con rec...
3,CA15FEF60F2BE86.txt,jan nicholas diaz uin ted cara fi fcsxqs uotm ...,jan nicholas diaz uin ted cara fi fcsxqs uotm ...
4,CA15FEF5627AA2D.txt,fsp pm page fax server fax to company fax phon...,fsp pm page fax server fax company fax phone f...


## Using Package

In [13]:
from template import TemplateClassifier
tc  = TemplateClassifier(9)

In [9]:
# ['waiver of liability', 'power of attorney', 'certificate of death', 'appointment of representative']
template_header = ['waiver of liability',]

template_sep='page break ml processing'

template_header = list(map(filter_stopword, template_header))
template_header

['waiver liability']

In [None]:
from template import TemplateClassifier
tc  = TemplateClassifier(9)
tc.fit_for_template(dataset['doc_content_fil'], template_header, tokens_p_template=(10, 25),
                    template_sep='page break ml processing',
                   max_df=0.95, min_df=100, ngram_range=(4, 4))

 |+ Analyzing Contents +|
 |+ Fitting Vectorizer +|


In [22]:
tc.save_model("TemplateClassfier_WOL_v1.0.pkl")

***

### Line by Line - Template Classifier

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df=0.95, min_df=100, ngram_range=(4, 4))
data = cv.fit_transform(dataset['doc_content_fil'])

In [12]:
template_headers = list(chain(*[[feature for feature in cv.get_feature_names() if header in feature] for header in 
                                template_header]))
len(template_headers)

32

In [13]:
data_arr = data.toarray()
data_arr.shape

(15000, 5162)

In [14]:
idx_headers = np.array([cv.get_feature_names().index(head) for head in template_headers])

filter1 = (data_arr[:, idx_headers].sum(axis=1) > 0).astype(int).nonzero()[0]
filter2 = (np.count_nonzero(data_arr, axis=1) > 0).astype(int).nonzero()[0]

filter_index = np.array(sorted(list(set(filter1).intersection(filter2))))
filter_index

array([    0,    11,    22, ..., 14974, 14981, 14987])

In [15]:
data_ = dataset['doc_content_fil'].loc[filter_index].str.split(template_sep.lower())
data_ = data_.apply(pd.Series).stack().reset_index()[0].replace('', np.NaN).dropna().reset_index(drop=True)
data_

0        ph page fax transmission unitedhealth care sou...
1         ph page soaurrhaerm iptaimnn emyaacl chhgsirr...
2         sy ph page elb gr united health care po box h...
3         ph page fsp pm page fax server unitedhealthca...
4         ph page southern pain spine specialists dr am...
5         ph page mugl al wrmse ul ts homithcare ca po ...
6        fax fsp pm dpage fax server confidential unite...
7         fax fsp pm page fax server confidential reque...
8         fa xe septamber dear customer following proof...
9         ax fedex ship manager prin labe sl zel isnovy...
10        fay nl date page order number new level llc t...
11        waiver liability statement enrollee name thec...
12       fax fspp dpage fax server confidential united ...
13        fax fsp dpage fax server confidential request...
14        fax december dear customer tha following proo...
15        fay nn fedex ship manager print labe ows clr ...
16        fax tn nl frsingun date page order number shi.

In [16]:
cv = CountVectorizer(max_df=0.95, min_df=100, ngram_range=(4, 4))
data = cv.fit_transform(data_)

In [18]:
data = data[(data.sum(axis=1) > int(9)).reshape(-1).nonzero()[-1]].toarray()
data.shape

(4517, 647)

In [40]:
temp_page_data = data[data[:, 13] > 0]
temp_page_data = temp_page_data[np.count_nonzero(temp_page_data, axis=1) > 9]

temp_page_data.shape

(338, 647)

In [54]:
templates_ = dict()
filtered_templates = []
tokens_p_template = (20, 25)

for form in [cv.vocabulary_.get(e, None) for e in template_headers]:
    if form is not None:
        form_tokens = list(sorted(zip(range(len(cv.get_feature_names())), np.count_nonzero(temp_page_data, axis=0)),
                                  reverse=True, key=lambda x: x[1]))
        form_tokens = np.array([ele[0] for ele in form_tokens])[:tokens_p_template[1]]
        form_tokens = set(form_tokens).union([form])
        if len(form_tokens) > tokens_p_template[0]:
            templates_[form] = form_tokens
        else:
            filtered_templates.append(form)

In [56]:
templates_

{13: {13,
  43,
  54,
  56,
  80,
  92,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,
  483,
  493,
  518,
  527,
  561,
  570,
  602,
  603,
  605,
  610,
  623},
 24: {13,
  24,
  43,
  54,
  56,
  80,
  92,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,
  483,
  493,
  518,
  527,
  561,
  570,
  602,
  603,
  605,
  610,
  623},
 42: {13,
  42,
  43,
  54,
  56,
  80,
  92,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,
  483,
  493,
  518,
  527,
  561,
  570,
  602,
  603,
  605,
  610,
  623},
 79: {13,
  43,
  54,
  56,
  79,
  80,
  92,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,
  483,
  493,
  518,
  527,
  561,
  570,
  602,
  603,
  605,
  610,
  623},
 100: {13,
  43,
  54,
  56,
  80,
  92,
  100,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,
  483,
  493,
  518,
  527,
  561,
  570,
  602,
  603,
  605,
  610,
  623},
 106: {13,
  43,
  54,
  56,
  80,
  92,
  106,
  167,
  217,
  246,
  325,
  337,
  346,
  364,
  436,


***

In [29]:
list(filter(lambda x: ' waiver ' in x, cv.get_feature_names()))

['appeal waiver of liability',
 'at waiver of liability',
 'attached waiver of liability',
 'case at waiver of',
 'completed waiver and other',
 'completed waiver must be',
 'completed waiver of liability',
 'date pst waiver of',
 'department enclosure waiver of',
 'department enclosures waiver of',
 'enclosure waiver of liability',
 'enclosures waiver of liability',
 'fax server waiver of',
 'faxed completed waiver must',
 'of this waiver does',
 'or completed waiver of',
 'pst waiver of liability',
 'section waiver of fee',
 'section waiver of payment',
 'server waiver of liability',
 'signed the waiver title',
 'signed waiver of liability',
 'signing the waiver fax',
 'signing the waiver phone',
 'signing the waiver you',
 'the attached waiver of',
 'the completed waiver and',
 'the waiver fax number',
 'the waiver of liability',
 'the waiver phone number',
 'the waiver title position',
 'the waiver you may',
 'this waiver does not',
 'unless the waiver of',
 'your appeal waiver of'

In [33]:
[[head for head in cv.get_feature_names() if ele in head] for ele in template_header]

[[], [], []]

# ___

In [23]:
len(tc.cv.vocabulary)

647

In [43]:
tc.cv.vocabulary

{'according paperwork reduction act': 0,
 'act call us learn': 1,
 'act must sign date': 2,
 'act name relative friend': 3,
 'act persons required respond': 4,
 'act representative want someone': 5,
 'additional appeal rights get': 6,
 'additional details including send': 7,
 'additional information previously denied': 8,
 'address followup athenahealth com': 9,
 'address member number reasons': 10,
 'address po box city': 11,
 'address using eligibility search': 12,
 'adj totals prev pd': 13,
 'adjustment code reason code': 14,
 'adjustment information adjustment code': 15,
 'administrative manual provider website': 16,
 'advice pra explanation benefits': 17,
 'aforementioned services payment denied': 18,
 'ags health inc pg': 19,
 'allowed deduct coins grp': 20,
 'already received automatically give': 21,
 'already received give written': 22,
 'also ask copy guidelines': 23,
 'amount hc service date': 24,
 'amount owed contact fax': 25,
 'amount possible comments cpt': 26,
 'amt prov

In [42]:
tc.templates_

{110: {29,
  100,
  102,
  110,
  213,
  364,
  409,
  412,
  413,
  490,
  513,
  514,
  533,
  544,
  565,
  568,
  574,
  615,
  695,
  838},
 140: {12,
  64,
  140,
  142,
  149,
  324,
  387,
  440,
  463,
  541,
  542,
  596,
  628,
  643,
  650,
  707,
  730,
  738,
  753,
  825},
 141: {7,
  64,
  141,
  142,
  148,
  248,
  324,
  332,
  353,
  387,
  541,
  542,
  581,
  596,
  630,
  641,
  707,
  730,
  738,
  753},
 142: {12,
  62,
  64,
  142,
  254,
  324,
  329,
  330,
  372,
  440,
  581,
  596,
  630,
  710,
  738,
  752,
  753,
  807,
  808,
  813},
 143: {12,
  62,
  64,
  142,
  143,
  174,
  254,
  324,
  330,
  335,
  440,
  463,
  581,
  596,
  630,
  632,
  738,
  753,
  808,
  825},
 144: {142,
  144,
  148,
  169,
  321,
  323,
  324,
  330,
  372,
  466,
  541,
  542,
  561,
  581,
  601,
  630,
  633,
  656,
  730,
  753},
 332: {7,
  64,
  142,
  148,
  149,
  324,
  329,
  332,
  353,
  387,
  541,
  542,
  581,
  596,
  630,
  641,
  650,
  730,
  738,
 

In [24]:
print(len(tc.templates_))
tc.get_form_labels()

32


[(13, 'aletter completed waiver liability'),
 (24, 'appeal waiver liability write'),
 (42, 'attached waiver liability form'),
 (79, 'case waiver liability statement'),
 (100, 'complete attached waiver liability'),
 (106, 'completed waiver liability enclosed'),
 (107, 'completed waiver liability wol'),
 (130, 'date pst waiver liability'),
 (150, 'department enclosure waiver liability'),
 (151, 'department enclosures waiver liability'),
 (175, 'enclosure waiver liability statement'),
 (176, 'enclosures waiver liability statement'),
 (211, 'fax server waiver liability'),
 (297, 'letter completed waiver liability'),
 (392, 'page case waiver liability'),
 (463, 'pst case waiver liability'),
 (464, 'pst waiver liability form'),
 (502, 'review appeal waiver liability'),
 (503, 'review unless waiver liability'),
 (520, 'server case waiver liability'),
 (523, 'server waiver liability form'),
 (542, 'signed waiver liability form'),
 (602, 'unless waiver liability wol'),
 (616, 'waiver liability 

In [120]:
tc.get_form_labels(tc.templates_[337])

[(643, 'take longer ask extension'),
 (644, 'taking extra time explain'),
 (197, 'decision might take longer'),
 (646, 'tell taking extra time'),
 (264, 'extra time explain time'),
 (200, 'decision standard appeal within'),
 (72, 'ask extension need information'),
 (714, 'within days get appeal'),
 (654, 'time explain time needed'),
 (720, 'written decision standard appeal'),
 (337, 'important information appeal rights'),
 (530, 'received give written decision'),
 (721, 'written decision within days'),
 (343, 'information appeal rights kinds'),
 (409, 'might take longer ask'),
 (25, 'appeal decision might take'),
 (545, 'records important information appeal'),
 (35, 'appeal give written decision'),
 (42, 'appeal rights kinds appeals'),
 (302, 'get appeal decision might'),
 (623, 'standard appeal give written'),
 (48, 'appeal within days get'),
 (624, 'standard appeal within days'),
 (309, 'give written decision standard'),
 (310, 'give written decision within')]

In [121]:
tc.save_model('TokenVectorizer_dataset_all_t25_notice_extra.pkl', 1)

In [13]:
tc.load_model('TokenVectorizer_dataset_all_t20_notice.pkl', 1)

#### Evaluating

# ____

# Testing with Real time data

In [14]:
from pickle import load

real_data = load(open('NewUrgencyExpCase.pkl', 'rb'))
real_data = pd.DataFrame(real_data, columns=['caseid', 'doc_content', 'ig_page', 'len_exp_kw', 'exp_kw', 'pg_no'])
print(real_data.shape)
real_data.head()

(5135, 6)


Unnamed: 0,caseid,doc_content,ig_page,len_exp_kw,exp_kw,pg_no
0,CA15FE337AC4E59.txt,aisd itovleciasivd in bzobocils x v diacy swil...,[],2,"[('continuation of therapy', 1), ('expedited',...","[[0], [2, 2, 2, 2, 3, 3]]"
1,CA15FE335FB4E27.txt,dec pm no p p a east tennessee medical group a...,[],3,"[('cancer', 2), ('expedited', 4), ('physical t...","[[3, 8], [2, 2, 2, 2], [6, 8, 10]]"
2,CA15FE335DA4E25.txt,we fax a r t p i t serd ere b l f dtj omb appr...,[],2,"[('expedited', 11), ('fast appeal', 1)]","[[2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4], [4]]"
3,CA15FE339BAF561.txt,fsp pm page fax server fax to company fax phon...,[],2,"[('expedited', 1), ('urgent', 2)]","[[2], [2, 2]]"
4,CA15FE338CA3715.txt,page ot provider reconsideration appeal form t...,[],1,"[('end stage renal disease', 1)]",[[0]]


In [15]:
dde = pd.read_excel("dde_newdataset.xlsx")
dde['UrgencyLkup'].value_counts()

801    2310
802    2200
Name: UrgencyLkup, dtype: int64

In [16]:
real_data = real_data[['caseid', 'doc_content', 'pg_no']]
real_data['doc_content'] = real_data['doc_content'].apply(filter_stopword)

In [25]:
real_data['template'] = real_data['doc_content'
                                    ].apply(lambda x: tc.predict_template(x.split('page break ml processing')).tolist())

In [48]:
real_data_f.head()

Unnamed: 0,caseid,doc_content,pg_no,template,URG
0,CA15FE33A10B045.txt,dec p stanford health care fax transmittal cov...,{3},"[-1, -1, 401, -1, -1]",801
1,CA15FE334464DEB.txt,pm adsc page l international multiple sclerosi...,"{0, 4, 5, 7, 9}","[-1, -1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1]",802
2,CA15FE336E9418F.txt,renowin n renown facsimile f health renown ins...,"{3, 4, 5, 8, 9, 11}","[-1, 401, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",802
3,CA15FE333D44DDA.txt,cc fax prod fsp cc fax prod page continuation ...,"{3, 4}","[-1, -1, 401, -1, -1, -1, -1]",802
4,CA15FE334729967.txt,cfax hcahealthcare com dickens scott pm page p...,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11}","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4...",802


In [47]:
real_data_f['doc_content'][0].split('page break ml processing')

['dec p stanford health care fax transmittal cover sheet pages including cover sheet name date department phone branch fax gl q g q u rgent notify sender upon arrival documents oyes ono arthritis rheumatology south bay fax phone comments pa g confidentiofity notice transmisslon ony attached documents may confidentiol ond contaln information protected state federal medkal privacy statutes legolly priviteged intended use addressee intendad reciplent transmission en agent intended reciplent prohthited reeding disclosing printing soving copying bsing otherwise disseminating ony information contalned transmission f recelved tronsmission ercor please accept apologles notify sender via appropriate phone number thonk page rcvd pm central standard time apsep uid ca sfe b csid ',
 ' dec p enrollee information enrollee name z kel date birth j enrollee address l og city ga lq state zipcode g l enrollee plan id number g complete following section person making request enrollee requestor name reques

In [45]:
tc.predict_template(real_data_f['doc_content'][0].split('page break ml processing'))

0     -1
1     -1
2    401
3     -1
4     -1
dtype: int64

In [26]:
real_data_f = real_data[real_data['template'].apply(lambda x: sum(x) > 0)]
real_data_f['pg_no'] = real_data_f['pg_no'].apply(lambda x: set(chain(*x)))
real_data_f.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
tc.get_form_labels()

[(140, 'claim reconsideration request close'),
 (141, 'claim reconsideration request definition'),
 (142, 'claim reconsideration request form'),
 (143, 'claim reconsideration requests may'),
 (144, 'claim reconsideration requests members'),
 (332, 'found claim reconsideration request'),
 (541, 'paper claim reconsideration request'),
 (542, 'paper claim reconsideration requests'),
 (643, 'regarding claim reconsideration request'),
 (710, 'separate claim reconsideration request'),
 (110, 'box health insurance claim'),
 (364, 'health insurance claim form'),
 (401, 'information notice denial payment'),
 (509, 'notice denial payment date')]

In [30]:
# template_map = {'std': [128,129,130,131,469,470,549,602,324],
#                 'skip': [351,444,337,381,436,533,534,555,556,557,558],
#                 'plus1': [351]}
template_map = {'std': [140,141,142,143,144,332,541,542,643,710,110,364],
                'skip': [401,509],
                'plus1': [401,509]}

def determine_urg(exp_pg_no, template):
    for ele in template_map['std']:
        if ele in template:
            return 801
    
    template = list(map(lambda x: x if x in template_map['skip'] else -1, template))
    
    add = []

    for i in range(1, len(template)):
        if template[i-1] in template_map['plus1']:
            add.append((i, template[i-1]))
        elif i>2 and template[i-2] in template_map['plus1']:
            add.append((i, template[i-1]))

    for i, ele in add:
        template[i] = ele
    
    template = (np.array(template) > 0).nonzero()[0]
    
    if len(set(exp_pg_no).difference(template)) > 0:
        return 802
    return 801

In [31]:
real_data_f['URG'] = real_data_f[['pg_no', 'template']].apply(lambda x: determine_urg(x[0], x[1]), axis=1)
real_data_f['URG'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


802    910
801     66
Name: URG, dtype: int64

In [32]:
Counter(list(filter(lambda x: x>0, chain(*real_data_f['template']))))

Counter({110: 16, 140: 8, 142: 3, 144: 2, 364: 11, 401: 965})

# ______

# Evaluate

In [33]:
df = real_data_f.copy()
df['caseid'] = df['caseid'].apply(lambda x: x.split('.')[0])
df = pd.merge(df, dde, left_on='caseid', right_on='UniqueImageID', how='inner')
print(df.shape)
df.head()

(902, 8)


Unnamed: 0,caseid,doc_content,pg_no,template,URG,UniqueImageID,UrgencyLkup,IntakeChannelQueueLkup
0,CA15FE334464DEB,pm adsc page l international multiple sclerosi...,"{0, 4, 5, 7, 9}","[-1, -1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1]",802,CA15FE334464DEB,802,2310
1,CA15FE333D44DDA,cc fax prod fsp cc fax prod page continuation ...,"{3, 4}","[-1, -1, 401, -1, -1, -1, -1]",802,CA15FE333D44DDA,801,2310
2,CA15FE334729967,cfax hcahealthcare com dickens scott pm page p...,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11}","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4...",802,CA15FE334729967,802,2311
3,CA15FE330DC35F1,dec pm pain management p print form wentworth ...,"{3, 5, 6, 7, 9, 10, 11, 12}","[-1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE330DC35F1,801,2310
4,CA15FE32E619875,uneah auu qisd zei slyo diady wil piepuels enu...,"{8, 9, 3}","[-1, 401, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE32E619875,802,2311


In [34]:
df['UrgencyLkup'].value_counts()

802    552
801    350
Name: UrgencyLkup, dtype: int64

In [35]:
df['URG'].value_counts()

802    851
801     51
Name: URG, dtype: int64

In [36]:
print("TP - ", df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 802)].shape[0])
print("FP - ", df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 801)].shape[0])
print("TN - ", df.loc[(df['URG'] == 801) & (df['UrgencyLkup'] == 801)].shape[0])
print("FN - ", df.loc[(df['URG'] == 801) & (df['UrgencyLkup'] == 802)].shape[0])
print()
print("TP - ", df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 802)].shape[0]/df.shape[0]*100)
print("FP - ", df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 801)].shape[0]/df.shape[0]*100)
print("TN - ", df.loc[(df['URG'] == 801) & (df['UrgencyLkup'] == 801)].shape[0]/df.shape[0]*100)
print("FN - ", df.loc[(df['URG'] == 801) & (df['UrgencyLkup'] == 802)].shape[0]/df.shape[0]*100)

TP -  539
FP -  312
TN -  38
FN -  13

TP -  59.756097560975604
FP -  34.58980044345898
TN -  4.212860310421286
FN -  1.441241685144124


In [51]:
df.loc[(df['URG'] == 801) & (df['UrgencyLkup'] == 802)].to_excel('Urg_FN_STD_Form.xlsx', index=False)

In [56]:
df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 801)]

Unnamed: 0,caseid,doc_content,pg_no,template,URG,UniqueImageID,UrgencyLkup,IntakeChannelQueueLkup
1,CA15FE333D44DDA,cc fax prod fsp cc fax prod page continuation ...,"{3, 4}","[-1, -1, 401, -1, -1, -1, -1]",802,CA15FE333D44DDA,801,2310
3,CA15FE330DC35F1,dec pm pain management p print form wentworth ...,"{3, 5, 6, 7, 9, 10, 11, 12}","[-1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE330DC35F1,801,2310
7,CA15FE32867ADCB,p fsp pm page fax server enrollee information ...,"{8, 1, 5, 6}","[-1, -1, -1, -1, 401, -1, -1, -1, -1, -1]",802,CA15FE32867ADCB,801,2310
9,CA15FE32699359F,vineyard primary care fax p fsp pm page fax se...,"{3, 4, 5}","[-1, 401, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE32699359F,801,2310
10,CA15FE32941AA4F,hoag hoag memorial hospital presbyterian neuro...,"{9, 4, 7}","[-1, -1, -1, 401, -1, -1, -1, -1, -1, -1, -1]",802,CA15FE32941AA4F,801,2310
17,CA15FE31A6A32A9,dec p optumrx mohamed khab sonoma st hours ope...,"{2, 3, 5, 6, 7, 8}","[-1, 401, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE31A6A32A9,801,2311
19,CA15FE314BBEFBC,bhs fax server conpany fax phone pages notes p...,"{3, 4, 6, 7, 9, 15, 16}","[-1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE314BBEFBC,801,2311
20,CA15FE315C53305,ermatology woodside executive ct aiken sc phon...,"{9, 3, 6, 7}","[-1, -1, -1, -1, -1, 401, -1, -1, -1, -1, -1]",802,CA15FE315C53305,801,2311
21,CA15FE30A9A3058,qisd evevoeiasivd qin z eaiay w l piepurys uid...,"{1, 4, 6, 10, 17, 18, 20}","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",802,CA15FE30A9A3058,801,2311
24,CA15FE306EDA8F0,dec p g tho fndo ro gaspentrol oy grory ohio g...,"{8, 3, 4, 6}","[-1, -1, 401, -1, -1, -1, -1, -1, -1, -1, -1, ...",802,CA15FE306EDA8F0,801,2310


In [235]:
df.loc[(df['URG'] == 802) & (df['UrgencyLkup'] == 801)]['pg_no'].str.len().sort_values()

482     1
438     1
395     1
93      1
209     1
497     1
577     1
614     1
60      1
667     1
45      1
44      1
426     1
697     1
42      1
425     1
334     1
7       1
908     1
259     1
36      1
248     1
413     2
217     2
406     2
219     2
393     2
365     2
251     2
386     2
       ..
241    10
942    10
550    10
130    10
252    10
215    11
127    11
134    11
325    11
652    11
328    11
464    12
466    12
400    12
227    12
567    12
852    12
844    12
370    12
247    12
673    12
643    12
75     12
822    13
451    13
508    15
806    15
163    16
807    22
658    34
Name: pg_no, Length: 348, dtype: int64