In [1]:
import pandas as pd

In [2]:
# Read gzip data
cases = pd.read_pickle('df_cases_200906.gzip')
label = pd.read_pickle('df_label_200906.gzip')

In [3]:
cases.shape

(2069, 8)

In [4]:
cases.head()

Unnamed: 0,CaseId,FileName,Language,StartDate,DocumentType,IsExecuted,OcrText,QualityScore
0,3061226227,003061226227_12045631_Order form_978-1-67767-3...,EN,20191104,Order form,False,Pressure style response character.,0.649292
1,3061226383,003061226383_63912371_Contract Info Pack_978-0...,EN,20190805,Contract Info Pack,False,Soon especially boy thousand traditional.,0.873038
2,3061226383,003061226383_41775424_other documents_978-1-38...,EN,20190805,other documents,True,Tax south say strategy hard between late.,0.99128
3,3061227777,003061227777_96463321_other documents_978-1-96...,EN,20191231,other documents,False,Yet arm six design.,0.607833
4,3061227781,003061227781_62201704_other documents_978-1-04...,EN,20191231,other documents,False,Plan camera actually run push rest.,0.909359


In [5]:
label.shape

(1098, 3)

In [6]:
label.tail()

Unnamed: 0,CaseId,label_1,label_2
1093,3061208632,True,False
1094,3061208849,False,False
1095,3061207708,False,False
1096,3061207200,True,False
1097,3061209028,True,False


In [7]:
# Define a new boolean column 'Invalid' and apply a filter function
cases['Invalid'] = cases.apply(lambda _: '', axis=1)
def isInvalid(x):
    if x['IsExecuted'] == False or x['QualityScore'] < 0.81:
        return True
    return False

cases['Invalid'] = cases.apply(isInvalid, axis=1)

In [8]:
# Create a new dataframe for filenames, group up filenames to list based on 'CaseId' and 'Invalid' column
filenames = cases.groupby(['CaseId','Invalid'])['FileName'].apply(list).reset_index(name='FileNames')
filenames.tail()

Unnamed: 0,CaseId,Invalid,FileNames
1434,3061230728,False,[003061230728_74076581_Amendments_978-0-14-763...
1435,3061230728,True,[003061230728_79408066_Master contract_978-0-1...
1436,3061230748,True,[003061230748_65193716_Contract Documents_978-...
1437,3061230757,False,[003061230757_72990476_Contract Documents_978-...
1438,3061230757,True,[003061230757_84690982_other documents_978-0-1...


In [9]:
# Split the dataframe into two by 'Invalid' and merge by 'CaseId'
invalid = filenames[filenames['Invalid']==True]
valid = filenames[filenames['Invalid']==False]

final = pd.merge(valid, invalid, on='CaseId', how = 'outer')
final.tail()

Unnamed: 0,CaseId,Invalid_x,FileNames_x,Invalid_y,FileNames_y
1093,3061230570,,,True,[003061230570_24982184_other documents_978-1-7...
1094,3061230606,,,True,[003061230606_65835753_Order form_978-1-292-43...
1095,3061230659,,,True,[003061230659_72651667_Order form_978-0-661-06...
1096,3061230710,,,True,[003061230710_80047544_other documents_978-1-0...
1097,3061230748,,,True,[003061230748_65193716_Contract Documents_978-...


In [10]:
# Drop useless columns
final = final.drop(columns=['Invalid_x', 'Invalid_y'])
# Rename filenames columns
final.columns = ['CaseId', 'ValidFileNames', 'InvalidFileNames']
# Fill nan cells with empty list
final = final.fillna('[]')
final.tail()

Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames
1093,3061230570,[],[003061230570_24982184_other documents_978-1-7...
1094,3061230606,[],[003061230606_65835753_Order form_978-1-292-43...
1095,3061230659,[],[003061230659_72651667_Order form_978-0-661-06...
1096,3061230710,[],[003061230710_80047544_other documents_978-1-0...
1097,3061230748,[],[003061230748_65193716_Contract Documents_978-...


In [11]:
# Extract 'CaseId' and 'OcrText' from the cases dataframe and concatentate all OcrText fields for all Valid contracts
ocrtext = cases[cases['Invalid'] == False]
ocrtext = ocrtext[['CaseId', 'OcrText']]
ocrtext = ocrtext.groupby('CaseId')['OcrText'].apply(','.join).reset_index()
ocrtext.tail()

Unnamed: 0,CaseId,OcrText
565,3061230596,"Miss style unit.,High begin purpose interestin..."
566,3061230613,Really certainly might responsibility responsi...
567,3061230657,Cultural light carry past technology finish la...
568,3061230728,None attorney spend tend miss appear.
569,3061230757,Determine go network.


In [12]:
# Apply left join to add 'OcrText' column to the filenames dataframe
final = final.merge(ocrtext, on='CaseId', how='left')
final.tail()

Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames,OcrText
1093,3061230570,[],[003061230570_24982184_other documents_978-1-7...,
1094,3061230606,[],[003061230606_65835753_Order form_978-1-292-43...,
1095,3061230659,[],[003061230659_72651667_Order form_978-0-661-06...,
1096,3061230710,[],[003061230710_80047544_other documents_978-1-0...,
1097,3061230748,[],[003061230748_65193716_Contract Documents_978-...,


In [13]:
# Apply left join on the resulted dataframe and label dataframe
final = final.merge(label, on='CaseId', how='left')
final.tail()

Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames,OcrText,label_1,label_2
1093,3061230570,[],[003061230570_24982184_other documents_978-1-7...,,False,False
1094,3061230606,[],[003061230606_65835753_Order form_978-1-292-43...,,True,False
1095,3061230659,[],[003061230659_72651667_Order form_978-0-661-06...,,True,False
1096,3061230710,[],[003061230710_80047544_other documents_978-1-0...,,True,False
1097,3061230748,[],[003061230748_65193716_Contract Documents_978-...,,True,False


In [14]:
# Save the final result
final.to_csv("df_final.gzip", compression="gzip")

In [15]:
# To check the final dataset
df = df = pd.read_csv('df_final.gzip', compression='gzip')
df.head()

Unnamed: 0.1,Unnamed: 0,CaseId,ValidFileNames,InvalidFileNames,OcrText,label_1,label_2
0,0,3061189006,['003061189006_69176036_Order form_978-1-62414...,[],As difficult behavior her myself help.,True,False
1,1,3061189067,['003061189067_26173467_Order form_978-1-90478...,[],Present can phone form.,True,False
2,2,3061189156,['003061189156_47966765_Contract Documents_978...,[],Section science difference success wish it wide.,False,False
3,3,3061189229,['003061189229_21094545_other documents_978-1-...,['003061189229_69115288_Master contract_978-0-...,Accept sell leader herself if.,False,False
4,4,3061189349,['003061189349_72196625_Other documents_978-0-...,[],Another later everybody large real.,False,False
