In [2]:
from pathlib import Path
import os
from tqdm import tqdm
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from fastai.data.all import get_image_files
# from fastai.vision.all import *
from statsmodels.stats.contingency_tables import mcnemar
from PIL import Image
import numpy as np
import joblib

In [3]:
image_file_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')

def is_image_path_valid(path: Path):
    return path.is_file() and path.suffix in image_file_extensions

def load_image_file(path):
    return Image.open(path)

from utils import load_images_recursively

image_dir = Path("./data/new_crops")

def load_image_paths(path: Path):
    fns = get_image_files(path)
    
    return fns

before_paths = load_image_paths(image_dir / 'before')
after_paths = load_image_paths(image_dir / 'after')



mean_rgb = (131.0912, 103.8827, 91.4953)
image_shape = (224,224,3)

def load_image_for_feature_extraction(path='', shape=image_shape):
    '''
    Referenced from VGGFace2 Paper:
    Q. Cao, L. Shen, W. Xie, O. M. Parkhi, and A. Zisserman, “VGGFace2: A dataset for recognising faces across pose and age,” arXiv:1710.08092 [cs], May 2018
    '''
    short_size = 224.0
    crop_size = shape
    img = Image.open(path)
    im_shape = np.array(img.size)    # in the format of (width, height, *)
    img = img.convert('RGB')

    ratio = float(short_size) / np.min(im_shape)
    img = img.resize(size=(int(np.ceil(im_shape[0] * ratio)),   # width
                           int(np.ceil(im_shape[1] * ratio))),  # height
                     resample=Image.BILINEAR)

    x = np.array(img)  # image has been transposed into (height, width)
    newshape = x.shape[:2]
    h_start = (newshape[0] - crop_size[0])//2
    w_start = (newshape[1] - crop_size[1])//2
    x = x[h_start:h_start+crop_size[0], w_start:w_start+crop_size[1]]
    
    # normalize colors to prevent overfitting on color differences 
    x = x - mean_rgb
    
    # returns transformed image, and original image
    return x

In [7]:
log_model = joblib.load("./saved_model/lasso_pol_dat_us.joblib")

In [8]:
from saved_model.prepare_resnet50 import prepare_resnet_model

resnet_model = prepare_resnet_model("./saved_model/resnet50_ft_weight.pkl")

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def full_pipeline(x):
    x = torch.Tensor(x.transpose(0, 3, 1, 2))  # nx3x224x224
    x = x.to(device)
    x = resnet_model(x).detach().cpu().numpy()
    return log_model.predict(x)

In [10]:
results = pd.DataFrame({'sample_paths': before_paths})

In [11]:
results['results_1'] = np.nan
results['results_2'] = np.nan

In [12]:
def preprocess(path):
    return np.expand_dims(load_image_for_feature_extraction(path), 0)

In [13]:
for i, (before_path, after_path) in tqdm(enumerate(zip(before_paths, after_paths)), total=len(before_paths)):
    if before_path.stem != after_path.stem:
        print(f"Before and after don't match for index {i}, before: {before_path}, after: {after_path}")
        break
        
    results.loc[i,'results_1'] = full_pipeline(preprocess(before_path))
    results.loc[i,'results_2'] = full_pipeline(preprocess(after_path))

100%|███████████████████████████████████████| 1292/1292 [01:33<00:00, 13.85it/s]


In [15]:
results.head()

Unnamed: 0,sample_paths,results_1,results_2
0,data/new_crops/before/haircut face before after 3_89.jpg,1.0,1.0
1,data/new_crops/before/makeup before after 3_108.jpg,0.0,0.0
2,data/new_crops/before/makeup before after 3_55.jpg,1.0,0.0
3,data/new_crops/before/makeup before after arabic_51.jpg,1.0,1.0
4,data/new_crops/before/beard before after 2_30.jpg,0.0,0.0


In [18]:
def get_blank_df():
    df = pd.DataFrame(index=cat_names)
    df['total'] = np.nan
    df['before liberal'] = np.nan
    df['before conservative'] = np.nan
    df['after liberal'] = np.nan
    df['after conservative'] = np.nan
    
    return df

In [31]:
def get_analysis(df: pd.DataFrame = None, before_colname = "results_1", after_colname = "results_2"):
    
    if df == None:
        df = get_blank_df()
        
    for cat in cat_names:
        pred_slice = results[categories == cat]
        total = len(pred_slice)
        before_lib = len(pred_slice[pred_slice[before_colname] == 0])
        before_con = len(pred_slice[pred_slice[before_colname] == 1])
        after_lib = len(pred_slice[pred_slice[after_colname] == 0])
        after_con = len(pred_slice[pred_slice[after_colname] == 1])

        df.loc[[cat], ['total']] = total
        df.loc[[cat], ['before liberal']] = before_lib
        df.loc[[cat], ['before conservative']] = before_con
        df.loc[[cat], ['after liberal']] = after_lib
        df.loc[[cat], ['after conservative']] = after_con

    total = df.sum()
    total.name = 'total'
    df = df.append(total.transpose())

    return df

In [22]:
fns = results['sample_paths']

In [23]:
# extract filename without extension from path
fns = fns.map(lambda fn: Path(fn).stem)

In [24]:
categories = fns.map(lambda fn: fn.split()[0])

In [25]:
def get_cat(stem):
    cat_dict = {
        'makeupe': 'makeup',
        'hiardoo': 'hairdoo',
        'hairdoocut': 'haircut'
    }
    
    first_word = stem.split()[0]
    
    if first_word in cat_dict.keys():
        return cat_dict[stem]
    else:
        return first_word

In [26]:
categories = categories.map(get_cat)

In [27]:
idx = np.where(categories == 'images')[0]

In [28]:
categories[idx] = fns[idx].map(lambda s: s.split()[2])

In [29]:
cat_names = categories.unique()
cat_names

array(['haircut', 'makeup', 'beard', 'drag', 'hairdoo', 'glasses'],
      dtype=object)

In [32]:
log_analysis_df = get_analysis()

In [33]:
log_analysis_df

Unnamed: 0,total,before liberal,before conservative,after liberal,after conservative
haircut,287.0,166.0,121.0,188.0,99.0
makeup,386.0,206.0,180.0,260.0,126.0
beard,357.0,207.0,150.0,229.0,128.0
drag,168.0,121.0,47.0,121.0,47.0
hairdoo,68.0,41.0,27.0,40.0,28.0
glasses,26.0,18.0,8.0,17.0,9.0
total,1292.0,759.0,533.0,855.0,437.0


In [34]:
log_analysis_df['total']

haircut     287.0
makeup      386.0
beard       357.0
drag        168.0
hairdoo      68.0
glasses      26.0
total      1292.0
Name: total, dtype: float64

In [35]:
log_analysis_df.to_csv("./results/before_after_log.csv")

In [36]:
def get_contingency_table(df: pd.DataFrame = None, before_colname = "results_1", after_colname = "results_2"):
    
    if df == None:
        df = pd.DataFrame(index=cat_names)
        
    for cat in cat_names:
        pred_slice = results[categories == cat]
        total = len(pred_slice)
        blal = len(pred_slice[(pred_slice[before_colname] == 0) & (pred_slice[after_colname] == 0)])
        blac = len(pred_slice[(pred_slice[before_colname] == 0) & (pred_slice[after_colname] == 1)])
        bcac = len(pred_slice[(pred_slice[before_colname] == 1) & (pred_slice[after_colname] == 1)])
        bcal = len(pred_slice[(pred_slice[before_colname] == 1) & (pred_slice[after_colname] == 0)])

        df.loc[[cat], ['total']] = total
        df.loc[[cat], ['blal']] = blal
        df.loc[[cat], ['blac']] = blac
        df.loc[[cat], ['bcac']] = bcac
        df.loc[[cat], ['bcal']] = bcal

    total = df.sum()
    total.name = 'total'
    df = df.append(total.transpose())

    return df

In [37]:
log_contingency_df = get_contingency_table()

In [38]:
log_contingency_df

Unnamed: 0,total,blal,blac,bcac,bcal
haircut,287.0,132.0,34.0,65.0,56.0
makeup,386.0,160.0,46.0,80.0,100.0
beard,357.0,161.0,46.0,82.0,68.0
drag,168.0,90.0,31.0,16.0,31.0
hairdoo,68.0,30.0,11.0,17.0,10.0
glasses,26.0,16.0,2.0,7.0,1.0
total,1292.0,589.0,170.0,267.0,266.0


In [39]:
log_contingency_df.to_csv("./results/log_Contingency.csv")

In [41]:
log_contingency_df.iloc[-1]

total    1292.0
blal      589.0
blac      170.0
bcac      267.0
bcal      266.0
Name: total, dtype: float64