In [1]:
# if working in jupyter notebook
# %load_ext nb_black
# if working in jupyter lab
# %load_ext lab_black

# 1. Load given data into respective DataFrames

In [2]:
# check that our .gzip files are present
data_path = "../data/"
!ls $data_path

df_cases_200906.gzip  fake-and-real-news-dataset.zip
df_label_200906.gzip  kaggle


In [3]:
import pandas as pd
import numpy as np

# some pandas settings
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 1000)

# specify relative paths
cases_path = "../data/df_cases_200906.gzip"
labels_path = "../data/df_label_200906.gzip"

# load data
cases_df = pd.read_pickle(cases_path)
labels_df = pd.read_pickle(labels_path)

# 2. Process data
- filter out invalid cases
- concatenate OcrText column for groups of contracts
- store lists of valid and invalid file names
**NOTE:** When you enable the `%%timeit` cellmagic in the following cell, will get an error in the next following cell so **COMMENT IT OUT** before moving on

In [4]:
# %%timeit

# get unique case ids
case_ids = np.sort(cases_df["CaseId"].unique(), kind="quicksort")

# use a list of lists
valid_contract_names = []
invalid_contract_names = []
all_ocr_texts = []

for case_id in case_ids:
    # get sub dataframe with unique case id
    current_df = cases_df[cases_df["CaseId"] == case_id]

    # store our sub lists
    some_valid_contracts = []
    some_invalid_contracts = []
    merged_ocr_text = ""
    # number of contracts for current CaseId
    num_rows = current_df.shape[0]
    count = 0

    for _, row in current_df.iterrows():
        # num_rows is number of sentences
        if num_rows > 1:
            if count == 0:
                # if first string for CaseId, capitalize, replace period with spaces
                merged_ocr_text += row["OcrText"].replace(".", " ").lower().capitalize()
            elif count == num_rows - 1:
                # if last string, don't remove period
                merged_ocr_text += row["OcrText"].lower()
            else:
                merged_ocr_text += row["OcrText"].replace(".", " ").lower()

        else:
            # same process, but for CaseId with single contracts
            temp_str = row["OcrText"].replace(".", " ").lower().capitalize().strip()
            merged_ocr_text = temp_str + "."

        # using bitwise operation here because ...
        # https://stackoverflow.com/questions/36921951/truth-value-of-a-series-is-ambiguous-use-a-empty-a-bool-a-item-a-any-o
        # use line directly below if indexing directly from DataFrame object
        # valid_case_bool = (row["QualityScore"] >= 0.81) & (row["IsExecuted"] == True)
        valid_case_bool = row["QualityScore"] >= 0.81 and row["IsExecuted"] == True

        if valid_case_bool:
            some_valid_contracts.append(row["FileName"])
        else:
            some_invalid_contracts.append(row["FileName"])

        count += 1

    # remove any trailing whitespaces
    merged_ocr_text = merged_ocr_text.strip()

    # append to our list of lists
    valid_contract_names.append(some_valid_contracts)
    invalid_contract_names.append(some_invalid_contracts)
    all_ocr_texts.append(merged_ocr_text)

In [5]:
# place our data into df_final_dict
df_final_dict = dict()
df_final_dict["CaseId"] = case_ids
df_final_dict["InvalidFileNames"] = invalid_contract_names
df_final_dict["ValidFileNames"] = valid_contract_names
df_final_dict["OcrText"] = all_ocr_texts

# create df from dict
df_final = pd.DataFrame(df_final_dict, columns=df_final_dict.keys())

In [6]:
# view a specific group of contracts
test_case = "003061227721"
cases_df[cases_df["CaseId"] == test_case]

Unnamed: 0,CaseId,FileName,Language,StartDate,DocumentType,IsExecuted,OcrText,QualityScore
12,3061227721,003061227721_17508131_other documents_978-1-62420-278-0.pdf,EN,20191011,other documents,False,But could quality as foot.,0.839556
19,3061227721,003061227721_87102260_other documents_978-0-9792715-5-7.pdf,EN,20191011,other documents,True,Child pressure play.,0.826007
20,3061227721,003061227721_78410640_other documents_978-0-263-78566-1.pdf,EN,20191011,other documents,True,Simple go land north.,0.696388
21,3061227721,003061227721_48424585_other documents_978-1-80507-303-1.pdf,EN,20191011,other documents,True,Bill thing something level letter team.,0.634707
56,3061227721,003061227721_30304000_Order form_978-0-339-61324-9.pdf,EN,20191011,Order form,True,Day table as item.,0.875236
193,3061227721,003061227721_12522408_other documents_978-0-7727-5129-4.pdf,EN,20191011,other documents,True,Culture enough in team her clearly find.,0.857643
205,3061227721,003061227721_66983064_Order form_978-1-69529-544-5.pdf,EN,20191011,Order form,True,Arm several nor world international central center.,0.871345


In [7]:
# see if they've merged properly
df_final[df_final["CaseId"] == test_case]

Unnamed: 0,CaseId,InvalidFileNames,ValidFileNames,OcrText
1004,3061227721,"[003061227721_17508131_other documents_978-1-62420-278-0.pdf, 003061227721_78410640_other documents_978-0-263-78566-1.pdf, 003061227721_48424585_other documents_978-1-80507-303-1.pdf]","[003061227721_87102260_other documents_978-0-9792715-5-7.pdf, 003061227721_30304000_Order form_978-0-339-61324-9.pdf, 003061227721_12522408_other documents_978-0-7727-5129-4.pdf, 003061227721_66983064_Order form_978-1-69529-544-5.pdf]",But could quality as foot child pressure play simple go land north bill thing something level letter team day table as item culture enough in team her clearly find arm several nor world international central center.


# 3. Merging data
Merge `label_1` and `label_2` into `df_cases_200906.gzip`

In [8]:
# make sure there are no duplicates
labels_df["CaseId"].duplicated().any()

False

In [9]:
# merge DataFrames (horizontal concatenation)
df_final = pd.merge(df_final, labels_df, on="CaseId")

# 4. Final checks and write to `df_final.gzip`

In [10]:
# check for any NaN values
df_final.isna().sum()

CaseId              0
InvalidFileNames    0
ValidFileNames      0
OcrText             0
label_1             7
label_2             7
dtype: int64

In [11]:
# view rows with NaN values
df_final.get(df_final.isna().any(axis=1))

Unnamed: 0,CaseId,InvalidFileNames,ValidFileNames,OcrText,label_1,label_2
407,3061201036,[],[003061201036_95246451_Order form_978-0-617-81829-8.pdf],Case deep idea range.,,
408,3061201079,"[003061201079_50330379_Master contract_978-0-276-80757-2.pdf, 003061201079_77836809_Master contract_978-1-71729-722-8.pdf]",[],Around agree safe camera raise probably never hundred entire culture explain court attorney.,,
413,3061201236,[],[003061201236_49632637_Order form_978-0-399-77978-7.pdf],Consider approach paper eye.,,
421,3061201471,"[003061201471_20672492_Order form_978-0-906033-97-5.pdf, 003061201471_67072767_Amendments_978-0-576-45450-6.pdf]",[],Scientist cut young appear direction term learn difference pick until.,,
422,3061201489,[003061201489_86382854_Order form_978-0-673-35426-6.pdf],"[003061201489_60308818_Master contract_978-0-85050-808-6.pdf, 003061201489_72679708_other documents_978-0-284-11000-8.pdf]",Skill former sure agent three democratic no everyone through claim country since glass rise next probably whatever.,,
444,3061202246,[],[003061202246_30622555_Order form_978-1-995484-06-8.pdf],Hundred on so others whose.,,
447,3061202306,[],[003061202306_89917374_Order form_978-1-84570-036-2.pdf],Leg since this nice administration.,,


In [12]:
# get NaN rows indices
# nan_indices = df_final.get(df_final.isna().any(axis=1)).index
# # update final_df
# df_final = df_final.drop(nan_indices)
# df_final.isna().sum()

# remove rows with NaN values
df_final = df_final.dropna().reset_index()

In [13]:
df_final.isna().sum()

index               0
CaseId              0
InvalidFileNames    0
ValidFileNames      0
OcrText             0
label_1             0
label_2             0
dtype: int64

In [14]:
# check current working directory
!pwd
df_final.to_pickle("./df_final.gzip")

/home/evan/repos/iecoe-sg-technical-interview/submit
