In [1]:
# Library Import
import pandas as pd

In [2]:
# Load the two pickled input frames from the sibling ``data`` folder
# (``..`` steps up to the parent directory before descending into ``data``).
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
cases_path = '../data/df_cases_200906.gzip'
labels_path = '../data/df_label_200906.gzip'
df = pd.read_pickle(cases_path)
label = pd.read_pickle(labels_path)

In [3]:
# Define a function that decides whether each contract is valid or not.
# A named function is used instead of a lambda because, if the condition needs to
# change later (e.g. setting Quality Score <0.71 instead), amending a named function
# is more convenient and straightforward than amending a lambda.

def check_valid(df, min_quality_score=0.81):
    """Return True when a single contract row counts as valid.

    A row is invalid when its ``IsExecuted`` value equals ``False`` or when its
    ``QualityScore`` is strictly below ``min_quality_score``; otherwise it is valid.

    Parameters
    ----------
    df : mapping-like
        ONE row (not a whole frame), e.g. the Series handed over by
        ``DataFrame.apply(check_valid, axis=1)``. It must provide the keys
        ``'IsExecuted'`` and ``'QualityScore'``.
    min_quality_score : float, default 0.81
        Lowest ``QualityScore`` that is still accepted. The default reproduces
        the previously hard-coded threshold.

    Returns
    -------
    bool
    """
    # Explicit ``== False`` (rather than ``not ...``) keeps the original semantics:
    # only a value that compares equal to False marks the contract as not executed.
    not_executed = df['IsExecuted'] == False  # noqa: E712
    # NOTE(review): a NaN score is never ``< threshold``, so an executed row with a
    # missing QualityScore is treated as valid - confirm that this is intended.
    below_threshold = df['QualityScore'] < min_quality_score
    return not (not_executed or below_threshold)

In [4]:
# Attach the per-row validity flag as a new ``Validity`` column.
# ``df.loc[...]`` or a direct ``df['Validity'] = ...`` would be the usual spelling,
# but ``assign`` returns a new frame and therefore avoids SettingWithCopyWarning.
# Reference: https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas/12555510#12555510

df = df.assign(Validity=lambda frame: frame.apply(check_valid, axis=1))

In [5]:
# Collect the information we want, one row per CaseId.


def _files_per_case(frame, is_valid, column_name):
    """Return one row per CaseId holding the list of that case's file names.

    Only the rows of ``frame`` whose ``Validity`` equals ``is_valid`` are used;
    the resulting list column is called ``column_name``.
    """
    selected = frame[frame['Validity'] == is_valid]
    names = selected.groupby(['CaseId'])['FileName'].apply(list)
    return pd.DataFrame(names).rename(columns={'FileName': column_name}).reset_index()


def _as_list(value):
    """Return a copy of ``value`` if it is a list, otherwise an empty list."""
    return list(value) if isinstance(value, list) else []


# For ValidFileNames and InvalidFileNames
# Group the files of every case into valid and invalid files
df_validfile = _files_per_case(df, True, "ValidFileNames")
df_invalidfile = _files_per_case(df, False, "InvalidFileNames")
df_combined = df_invalidfile.merge(df_validfile, on='CaseId', how='outer')
# A case that has files on only one side comes out of the outer merge as NaN on
# the other side; replace that with an empty list (to align with the example shown)
df_combined['InvalidFileNames'] = df_combined['InvalidFileNames'].apply(_as_list)
df_combined['ValidFileNames'] = df_combined['ValidFileNames'].apply(_as_list)


# For OcrText
# Concatenate the OcrText fields of all "VALID" contracts of a case.
# Missing values are skipped so that a NaN/None entry cannot make ``str.join`` raise.
Valid_Ocr = (
    df[df['Validity'] == True][['CaseId', 'OcrText']]
    .groupby(['CaseId'])['OcrText']
    .apply(lambda texts: ' '.join(texts.dropna()))
    .reset_index()
)

# Merge the above dataframes
combined = df_combined.merge(Valid_Ocr, how='outer', on="CaseId")
# Cases without any valid contract have no OcrText row; fill the NaN with a blank
# (to align with the example shown)
combined['OcrText'] = combined['OcrText'].fillna("")

# Merge with labels; the inner join keeps only cases present in both frames
final_dataset = (
    combined.merge(label, how='inner', on='CaseId')
    .sort_values(by=['CaseId'])
    .reset_index(drop=True)
)

In [6]:
# For checking purposes: show the last rows of the result with wide columns.
# NOTE(review): a width of 0 appears to switch truncation off (the rendered output
# shows full file names); the documented "unlimited" value on recent pandas is
# None - confirm against the pandas version in use before changing it.
pd.options.display.max_colwidth = 0

final_dataset.tail()

Unnamed: 0,CaseId,InvalidFileNames,ValidFileNames,OcrText,label_1,label_2
1093,3061230659,[003061230659_72651667_Order form_978-0-661-06636-2.pdf],[],,True,False
1094,3061230710,"[003061230710_80047544_other documents_978-1-02-279791-8.pdf, 003061230710_59256366_Order form_978-1-209-37083-5.pdf]",[],,True,False
1095,3061230728,"[003061230728_79408066_Master contract_978-0-14-891566-4.pdf, 003061230728_56717174_Amendments_978-1-4471-4999-6.pdf]",[003061230728_74076581_Amendments_978-0-14-763189-3.pdf],None attorney spend tend miss appear.,True,False
1096,3061230748,[003061230748_65193716_Contract Documents_978-0-9561206-8-7.pdf],[],,True,False
1097,3061230757,[003061230757_84690982_other documents_978-0-10-551297-4.pdf],[003061230757_72990476_Contract Documents_978-1-4943-5720-7.pdf],Determine go network.,False,False


In [7]:
# Export the final dataset as a gzip-compressed CSV file.
# The row index is written as well, because ``to_csv`` keeps its default ``index=True``.
output_path = "df_final.gzip"
final_dataset.to_csv(output_path, compression="gzip")