In [None]:
import os
import matplotlib.pyplot as plt
import cv2
import numpy as np
import pandas as pd
import math

In [None]:
def previewImg(basePath, imageFilesDf, cols):
    """Plot one sample image per person in a grid that is ``cols`` wide.

    Parameters
    ----------
    basePath : str
        Directory that the ``relPath`` values are relative to.
    imageFilesDf : pandas.DataFrame
        Must contain the columns ``personId``, ``relPath`` and ``fileName``.
    cols : int
        Number of grid columns; the number of rows is derived from the
        number of distinct ``personId`` values.

    For every person the row with the smallest ``(relPath, fileName)`` is
    shown, resized to 1280x1024 and titled with the person id. Nothing is
    drawn when the frame holds no rows.
    """
    # Pick ONE whole row per person. A column-wise groupby().min() would take
    # the minimum of relPath and fileName independently, which can pair a
    # directory with a file that does not live in it.
    df = (
        imageFilesDf
        .sort_values(["relPath", "fileName"])
        .drop_duplicates(subset="personId")
        .set_index("personId")
        .sort_index()
    )
    if df.empty:
        # plt.subplots(0, cols) raises, so there is nothing sensible to draw.
        return

    rows = math.ceil(len(df) / cols)
    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so axs[row, col] indexing below is always valid.
    f, axs = plt.subplots(rows, cols, sharex=True, sharey=True, squeeze=False)
    f.set_figheight(30)
    f.set_figwidth(80)

    for position, personId in enumerate(df.index):
        row, col = divmod(position, cols)
        record = df.loc[personId]
        img = plt.imread(os.path.join(basePath, record["relPath"], record["fileName"]))
        img = cv2.resize(img, (1280, 1024), interpolation=cv2.INTER_CUBIC)
        axs[row, col].imshow(img, aspect='auto')
        axs[row, col].set_title(personId, fontdict={'fontsize': 30})

    # Hide the unused cells of the last row instead of showing empty axes.
    for ax in axs.flat[len(df):]:
        ax.set_visible(False)

    plt.show()

In [None]:
def _listSignatureFiles(basePath, dirNames, personIdOf):
    """Return one row per file found in each directory of ``dirNames``."""
    records = [
        {"fileName": fileName, "personId": personIdOf(dirName), "relPath": dirName}
        for dirName in dirNames
        for fileName in sorted(os.listdir(os.path.join(basePath, dirName)))
    ]
    # Passing `columns` keeps the schema stable even when no file was found.
    return pd.DataFrame(records, columns=["fileName", "personId", "relPath"])


def getFiles(basePath):
    """Index the image files stored under ``basePath``.

    Every sub-directory whose name contains ``"forg"`` is treated as a forged
    set, every other sub-directory as a genuine set.

    Parameters
    ----------
    basePath : str
        Directory holding one sub-directory per person / forged set.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(df_g, df_f)`` for genuine and forged files. Both frames have the
        columns ``fileName``, ``personId`` and ``relPath`` and a fresh
        0..n-1 index. For forged rows ``personId`` is the directory name with
        ``"_forg"`` removed, while ``relPath`` keeps the real directory name.
    """
    # Only real directories are scanned: plain files sitting in basePath
    # (e.g. a previously exported CSV) would make os.listdir() raise.
    # Sorted because os.listdir() returns entries in arbitrary order.
    dirs = sorted(
        d for d in os.listdir(basePath)
        if os.path.isdir(os.path.join(basePath, d))
    )
    genuine_dir = [d for d in dirs if "forg" not in d]
    forged_dir = [d for d in dirs if "forg" in d]

    df_g = _listSignatureFiles(basePath, genuine_dir, lambda d: d)
    df_f = _listSignatureFiles(basePath, forged_dir, lambda d: d.replace("_forg", ""))

    return (df_g, df_f)

In [None]:
# Training split: index the genuine and forged files, then show the size and
# the first rows of each frame.
data2_path = "/notebooks/capstone/dataset/dataset2/sign_data/train/"
df_g, df_f = getFiles(data2_path)
print(df_g.shape, df_g.head(), sep="\n")
print(df_f.shape, df_f.head(), sep="\n")

In [None]:
# Training split, genuine images: visual check that each person's sample is unique.
previewImg(data2_path, df_g, cols=10)

In [None]:
# Training split, forged images: visual check of their alignment with the genuine images.
previewImg(data2_path, df_f, cols=10)

<b>Testing data</b>

In [None]:
# Test split: index the genuine and forged files, then show the size and the
# first rows of each frame.
data2_path_test = "/notebooks/capstone/dataset/dataset2/sign_data/test/"
df_g_test, df_f_test = getFiles(data2_path_test)
print(df_g_test.shape, df_g_test.head(), sep="\n")
# Author's note on the forged shape: 25% of train data, which is good.
print(df_f_test.shape, df_f_test.head(), sep="\n")

In [None]:
# Test split, genuine images: visual check that each person's sample is unique.
previewImg(data2_path_test, df_g_test, cols=10)

In [None]:
# Test split, forged images: visual check of their alignment with the genuine images.
previewImg(data2_path_test, df_f_test, cols=10)

<b>Prepare metadata CSV files</b>

In [None]:
# Export the genuine-file metadata of both splits as CSV.
# Each CSV is written into the same directory that getFiles() lists for that split.
df_g.to_csv(f"{data2_path}/train.csv")
df_g_test.to_csv(f"{data2_path_test}/test.csv")

In [None]:
# Export the forged-file metadata of both splits as CSV.
# Each CSV is written into the same directory that getFiles() lists for that split.
df_f.to_csv(f"{data2_path}/train_f.csv")
df_f_test.to_csv(f"{data2_path_test}/test_f.csv")