In [294]:
import pandas as pd
pd.set_option('display.max_colwidth', 0)
import numpy as np

Loading and understanding the dataset

In [295]:
# Load the case-document table and the per-case label table from pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load these files from a trusted source.
df_cases = pd.read_pickle('df_cases_200906.gzip')
df_label = pd.read_pickle('df_label_200906.gzip')

In [296]:
# Show (rows, columns) of both frames as a quick sanity check on the load.
print(df_cases.shape,df_label.shape)

(2069, 8) (1098, 3)


In [297]:
# Preview the first rows of the case-document table.
df_cases.head()

Unnamed: 0,CaseId,FileName,Language,StartDate,DocumentType,IsExecuted,OcrText,QualityScore
0,3061226227,003061226227_12045631_Order form_978-1-67767-388-9.pdf,EN,20191104,Order form,False,Pressure style response character.,0.649292
1,3061226383,003061226383_63912371_Contract Info Pack_978-0-394-16412-0.pdf,EN,20190805,Contract Info Pack,False,Soon especially boy thousand traditional.,0.873038
2,3061226383,003061226383_41775424_other documents_978-1-388-73116-8.pdf,EN,20190805,other documents,True,Tax south say strategy hard between late.,0.99128
3,3061227777,003061227777_96463321_other documents_978-1-960601-99-5.pdf,EN,20191231,other documents,False,Yet arm six design.,0.607833
4,3061227781,003061227781_62201704_other documents_978-1-04-652792-8.pdf,EN,20191231,other documents,False,Plan camera actually run push rest.,0.909359


In [298]:
# Preview the first rows of the label table.
df_label.head()

Unnamed: 0,CaseId,label_1,label_2
0,3061226227,False,False
1,3061226383,True,False
2,3061227777,False,False
3,3061227781,False,False
4,3061227680,True,True


In [299]:
# Work on copies so the frames loaded from disk are not modified by later column additions.
df_cases_copy, df_label_copy = df_cases.copy(), df_label.copy()

In [300]:
# Number of distinct CaseId values, shown as a one-element shape tuple.
df_cases_copy['CaseId'].unique().shape

(1098,)

Labeling each case to determine whether it is valid or invalid.
It is invalid if ANY of the following conditions occurs:
    1) IsExecuted == False
    2) QualityScore <0.81

In [301]:
def checkValidity(row, min_quality_score=0.81):
    """Classify one document row as "valid" or "invalid".

    A row is "invalid" when either condition holds:
      1) ``row['IsExecuted']`` equals False, or
      2) ``row['QualityScore']`` is strictly below ``min_quality_score``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'IsExecuted' and 'QualityScore', e.g. the
        pandas Series passed by ``DataFrame.apply(checkValidity, axis=1)``.
    min_quality_score : float, default 0.81
        Lowest QualityScore still accepted as valid. The default reproduces the
        previously hard-coded threshold, so existing callers are unaffected.

    Returns
    -------
    str
        "invalid" or "valid".
    """
    # The explicit `== False` comparison is kept (rather than `not ...`) so that a
    # missing IsExecuted value (None/NaN) is not treated as False.
    # `or` short-circuits: QualityScore is only read when IsExecuted is not False.
    # NOTE(review): a NaN QualityScore compares False against the threshold and
    # therefore does not mark the row invalid — confirm this is intended.
    if row['IsExecuted'] == False or row['QualityScore'] < min_quality_score:
        return "invalid"
    return "valid"

In [302]:
# Label every document row; axis=1 passes each row to checkValidity.
df_cases_copy['Validity'] = df_cases_copy.apply(checkValidity,axis=1)

In [303]:
# Number of document rows per validity class.
df_cases_copy['Validity'].value_counts()

invalid    1389
valid      680 
Name: Validity, dtype: int64

In [304]:
# Start from an empty frame so the output columns keep this fixed order
# when they are filled in by the aggregation cell that follows.
df_final = pd.DataFrame(columns=['CaseId','InvalidFileNames','ValidFileNames','OcrText'])

In [305]:
## Concatenating the file names and the OCR text per CaseId and Validity
# Group once and reuse the grouping for both aggregations.
cases_by_validity = df_cases_copy.groupby(['CaseId','Validity'])
validity_classes = ['invalid', 'valid']

# Each cell becomes a one-element list holding the comma-joined file names of one
# (CaseId, Validity) group; unstack() then pivots Validity into columns.
# Reindexing guarantees that both ('FileName', 'invalid') and ('FileName', 'valid')
# exist (all-NaN when a class never occurs), so the lookups below cannot raise KeyError.
grouped_fileName_df = (
    cases_by_validity[['FileName']]
    .agg(lambda names: [', '.join(names)])
    .unstack()
    .reindex(columns=pd.MultiIndex.from_product([['FileName'], validity_classes]))
    .reset_index()
)

df_final['CaseId'] = grouped_fileName_df['CaseId']
# NOTE(review): cases without files in a class get the *string* "[]", while cases
# with files get a one-element list — the column therefore mixes str and list values.
df_final['ValidFileNames'] = grouped_fileName_df['FileName','valid'].fillna("[]")
df_final['InvalidFileNames'] = grouped_fileName_df['FileName','invalid'].fillna("[]")

# Only OcrText is aggregated here; the Validity grouping key is not joined as well,
# since just the ('OcrText', 'valid') column is used.
grouped_OcrText_df = (
    cases_by_validity[['OcrText']]
    .agg(lambda texts: ' '.join(texts))
    .unstack()
    .reindex(columns=pd.MultiIndex.from_product([['OcrText'], validity_classes]))
    .reset_index()
)
# Cases with no valid document get a single space as their OCR text.
df_final['OcrText'] = grouped_OcrText_df['OcrText','valid'].fillna(' ')

In [306]:
# Preview the assembled per-case table.
df_final.head()

Unnamed: 0,CaseId,InvalidFileNames,ValidFileNames,OcrText
0,3061189006,[],[003061189006_69176036_Order form_978-1-62414-909-2.pdf],As difficult behavior her myself help.
1,3061189067,[],[003061189067_26173467_Order form_978-1-904782-31-5.pdf],Present can phone form.
2,3061189156,[],[003061189156_47966765_Contract Documents_978-0-317-34803-3.pdf],Section science difference success wish it wide.
3,3061189229,[003061189229_69115288_Master contract_978-0-11-692367-7.pdf],[003061189229_21094545_other documents_978-1-81560-843-8.pdf],Accept sell leader herself if.
4,3061189242,[003061189242_91658807_other documents_978-0-7143-5356-2.pdf],[],


In [307]:
## Adding the labels
# Default (inner) join on CaseId: only cases present in both frames are kept.
df_final = df_final.merge(df_label_copy, on='CaseId')

In [308]:
## Cross-checking against the given examples
# Positional slice: rows 1093 through 1097 of the merged frame.
df_final.iloc[1093:1098]

Unnamed: 0,CaseId,InvalidFileNames,ValidFileNames,OcrText,label_1,label_2
1093,3061230659,[003061230659_72651667_Order form_978-0-661-06636-2.pdf],[],,True,False
1094,3061230710,"[003061230710_80047544_other documents_978-1-02-279791-8.pdf, 003061230710_59256366_Order form_978-1-209-37083-5.pdf]",[],,True,False
1095,3061230728,"[003061230728_79408066_Master contract_978-0-14-891566-4.pdf, 003061230728_56717174_Amendments_978-1-4471-4999-6.pdf]",[003061230728_74076581_Amendments_978-0-14-763189-3.pdf],None attorney spend tend miss appear.,True,False
1096,3061230748,[003061230748_65193716_Contract Documents_978-0-9561206-8-7.pdf],[],,True,False
1097,3061230757,[003061230757_84690982_other documents_978-0-10-551297-4.pdf],[003061230757_72990476_Contract Documents_978-1-4943-5720-7.pdf],Determine go network.,False,False


In [309]:
## Saving the result as a gzip-compressed CSV
# NOTE(review): despite the '.gzip' name this writes CSV text (not a pickle like the inputs),
# and the row index is written as the first column because `index` is left at its default.

df_final.to_csv('df_final.gzip',compression='gzip')