In [2]:
%pip install statsmodels --quiet

Note: you may need to restart the kernel to use updated packages.


In [8]:
from pathlib import Path
import os
from tqdm import tqdm
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from fastai.data.all import get_image_files
# from fastai.vision.all import *
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# Lower-case suffixes recognised as image files.
image_file_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')

def is_image_path_valid(path: Path) -> bool:
    """Return True when ``path`` is an existing file with a known image suffix.

    The suffix is compared case-insensitively so that files such as
    ``photo.JPG`` are accepted just like ``photo.jpg``.

    Args:
        path: Filesystem path to check.

    Returns:
        True if ``path`` points to a regular file whose suffix is listed in
        ``image_file_extensions``; False otherwise (including for directories
        and paths that do not exist).
    """
    return path.is_file() and path.suffix.lower() in image_file_extensions

def load_image_file(path):
    """Open the image at ``path`` with Pillow and return the image object.

    Args:
        path: Path (or path string) of the image file to open.

    Returns:
        The object returned by ``PIL.Image.open``. Pillow opens files lazily,
        so the caller is responsible for closing the image when done.
    """
    # The notebook-level `from PIL import Image` lives in a later cell;
    # importing here keeps this helper usable regardless of cell order.
    from PIL import Image

    return Image.open(path)

In [3]:
from utils import load_images_recursively

In [4]:
# Root folder of the image pairs; later cells read its `before/` and `after/` subfolders.
image_dir = Path("./data/new_crops")

In [5]:
# fns = get_image_files(image_dir / 'before')
# failed = verify_images(fns)

In [6]:
def load_image_paths(path: Path):
    """Return the image files that ``get_image_files`` finds under ``path``."""
    return get_image_files(path)

In [9]:
# List both halves of every image pair. The inference loop further down zips
# these two listings together and compares file stems, so they are expected to
# come back in matching order.
before_paths, after_paths = (
    load_image_paths(image_dir / split) for split in ('before', 'after')
)

In [10]:
from PIL import Image
import numpy as np

# Per-channel offset subtracted from every pixel after cropping
# (presumably the VGGFace2 training-set channel means -- confirm against the paper).
mean_rgb = (131.0912, 103.8827, 91.4953)
# Default crop size as (height, width, channels); only the first two entries are used.
image_shape = (224,224,3)

def load_image_for_feature_extraction(path='', shape=image_shape):
    '''
    Load one image and turn it into a mean-centred, centre-cropped array.

    Referenced from VGGFace2 Paper:
    Q. Cao, L. Shen, W. Xie, O. M. Parkhi, and A. Zisserman, “VGGFace2: A dataset for recognising faces across pose and age,” arXiv:1710.08092 [cs], May 2018

    Steps: convert to RGB, resize (bilinear) so the shorter side is 224 px,
    centre-crop to ``shape[0]`` x ``shape[1]``, then subtract ``mean_rgb``
    from each pixel.

    Args:
        path: Location of the image file to open.
        shape: Crop size; ``shape[0]`` is the crop height and ``shape[1]``
            the crop width.

    Returns:
        A float NumPy array in (height, width, channel) layout holding the
        cropped, mean-subtracted image. Only this array is returned; the
        original image is not.
    '''
    short_size = 224.0
    crop_size = shape

    # Use a context manager so the file handle is released as soon as the
    # pixels are decoded; `convert` loads the data and returns a new
    # in-memory image that stays valid after the file is closed.
    with Image.open(path) as source:
        im_shape = np.array(source.size)    # in the format of (width, height)
        img = source.convert('RGB')

    # Scale factor that maps the shorter side onto `short_size`.
    ratio = float(short_size) / np.min(im_shape)
    img = img.resize(size=(int(np.ceil(im_shape[0] * ratio)),   # width
                           int(np.ceil(im_shape[1] * ratio))),  # height
                     resample=Image.BILINEAR)

    x = np.array(img)  # array layout is (height, width, channel)
    newshape = x.shape[:2]
    # Centre crop: split the surplus rows/columns evenly on both sides.
    h_start = (newshape[0] - crop_size[0])//2
    w_start = (newshape[1] - crop_size[1])//2
    x = x[h_start:h_start+crop_size[0], w_start:w_start+crop_size[1]]

    # Centre the colour channels; this also promotes the uint8 pixels to float.
    x = x - mean_rgb

    return x

In [12]:
import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
lasso_model = joblib.load("./saved_model/lasso_pol_dat_us.joblib")

In [13]:
from saved_model.prepare_resnet50 import prepare_resnet_model

resnet_model = prepare_resnet_model("./saved_model/resnet50_ft_weight.pkl")

In [6]:
from saved_model.binary_classifier import load_pretrained_classifier

binary_classifier = load_pretrained_classifier('./saved_model/weights-2.pth')

In [14]:
# Run the torch models on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def full_lime_pipeline(x):
    """Predict class labels for a batch of preprocessed images with the lasso model.

    Args:
        x: NumPy array of images in (n, height, width, channel) layout, as
            produced by ``preprocess``.

    Returns:
        The output of ``lasso_model.predict`` on the ResNet features, one
        label per image.
    """
    # Channels-last -> channels-first (n x 3 x H x W); torch.Tensor casts to float32.
    batch = torch.Tensor(x.transpose(0, 3, 1, 2)).to(device)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        features = resnet_model(batch)

    return lasso_model.predict(features.detach().cpu().numpy())

In [39]:
def full_nn_pipeline(x):
    """Predict hard 0/1 labels for a batch of images with the neural-net classifier.

    Args:
        x: NumPy array of images in (n, height, width, channel) layout, as
            produced by ``preprocess``.

    Returns:
        NumPy array holding the sigmoid output of ``binary_classifier``
        (applied to the ResNet features) rounded with ``torch.round``.
    """
    # Channels-last -> channels-first (n x 3 x H x W); torch.Tensor casts to float32.
    batch = torch.Tensor(x.transpose(0, 3, 1, 2)).to(device)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        probabilities = torch.sigmoid(binary_classifier(resnet_model(batch)))
        labels = torch.round(probabilities)

    return labels.detach().cpu().numpy()

In [18]:
results = pd.DataFrame({'sample_paths': before_paths})

In [19]:
# Placeholder columns filled in by the inference loop below: the `_1` suffix
# holds the prediction for the "before" image and `_2` for the "after" image.
# Rows the loop never reaches stay NaN.
results['lasso_results_1'] = np.nan
results['nn_results_1'] = np.nan
results['lasso_results_2'] = np.nan
results['nn_results_2'] = np.nan

In [20]:
from tqdm import tqdm
import math

In [21]:
np.expand_dims(load_image_for_feature_extraction(before_paths[0]), 0).shape

(1, 224, 224, 3)

In [9]:
def preprocess(path):
    """Load and normalise one image and return it as a batch of size one."""
    image = load_image_for_feature_extraction(path)
    # Prepend the batch axis: (H, W, C) -> (1, H, W, C).
    return image[np.newaxis, ...]

In [22]:
# Run both models on every before/after pair and store the predictions row by row.
for i, (before_path, after_path) in tqdm(enumerate(zip(before_paths, after_paths)), total=len(before_paths)):
    # The two listings must line up pair by pair; stop at the first mismatch.
    # Rows from that index onwards keep their NaN placeholders.
    if before_path.stem != after_path.stem:
        print(f"Before and after don't match for index {i}, before: {before_path}, after: {after_path}")
        break

    # Decode and normalise each image once and feed the same array to both
    # models, instead of re-reading the file for every prediction.
    before_x = preprocess(before_path)
    after_x = preprocess(after_path)

    results.loc[i,'lasso_results_1'] = full_lime_pipeline(before_x)
    results.loc[i,'lasso_results_2'] = full_lime_pipeline(after_x)
    results.loc[i,'nn_results_1'] = full_nn_pipeline(before_x).squeeze()
    results.loc[i,'nn_results_2'] = full_nn_pipeline(after_x).squeeze()

100%|███████████████████████████████████████| 1292/1292 [00:48<00:00, 26.59it/s]


In [83]:
results

Unnamed: 0,sample_paths,lasso_results_1,nn_results_1,lasso_results_2,nn_results_2
0,data/new_crops/before/haircut face before after 3_89.jpg,1.0,1.0,0.0,1.0
1,data/new_crops/before/makeup before after 3_108.jpg,0.0,0.0,0.0,1.0
2,data/new_crops/before/makeup before after 3_55.jpg,0.0,0.0,0.0,0.0
3,data/new_crops/before/makeup before after arabic_51.jpg,1.0,1.0,1.0,1.0
4,data/new_crops/before/beard before after 2_30.jpg,0.0,0.0,0.0,0.0
...,...,...,...,...,...
1287,data/new_crops/before/makeup before after 3_39.jpg,1.0,0.0,1.0,1.0
1288,data/new_crops/before/glasses before after 1_11.jpg,0.0,0.0,0.0,0.0
1289,data/new_crops/before/beard before after 1_32.jpg,0.0,0.0,0.0,0.0
1290,data/new_crops/before/haircut face before after 1_69.jpg,0.0,0.0,0.0,1.0


In [38]:
# Summarise how often each model changed its prediction between the "before"
# and "after" image of a pair (0 = liberal, 1 = conservative).
total = len(results)
print(f"Total images: {total}")

lasso_1, lasso_2 = results['lasso_results_1'], results['lasso_results_2']
nn_before, nn_after = results['nn_results_1'], results['nn_results_2']

# NaN != NaN evaluates to True, so rows without predictions would be counted
# as flips in the undirected totals; surface them instead of discarding the check.
prediction_cols = ['lasso_results_1', 'lasso_results_2', 'nn_results_1', 'nn_results_2']
missing = int(results[prediction_cols].isna().any(axis=1).sum())
if missing:
    print(f"Warning: {missing} rows have missing predictions and are counted as flips below")

lasso_flipped = (lasso_1 != lasso_2).sum()
print(f"Predictions that flipped between liberal and conservative using logistic regression model: {lasso_flipped}")

nn_flipped = (nn_before != nn_after).sum()
print(f"Predictions that flipped between liberal and conservative using neural net: {nn_flipped}")

# Directed flips: count rows matching each (before, after) label combination.
lasso_lib_to_con = ((lasso_1 == 0) & (lasso_2 == 1)).sum()
print(f"Predictions that flipped from liberal to conservative using logistic regression model: {lasso_lib_to_con}")

lasso_con_to_lib = ((lasso_1 == 1) & (lasso_2 == 0)).sum()
print(f"Predictions that flipped from conservative to liberal using logistic regression model: {lasso_con_to_lib}")

nn_lib_to_con = ((nn_before == 0) & (nn_after == 1)).sum()
print(f"Predictions that flipped from liberal to conservative using neural network: {nn_lib_to_con}")

nn_con_to_lib = ((nn_before == 1) & (nn_after == 0)).sum()
print(f"Predictions that flipped from conservative to liberal using neural network: {nn_con_to_lib}")

Total images: 1292
Predictions that flipped between liberal and conservative using logistic regression model: 200
Predictions that flipped between liberal and conservative using neural net: 428
Predictions that flipped from liberal to conservative using logistic regression model: 92
Predictions that flipped from conservative to liberal using logistic regression model: 108
Predictions that flipped from liberal to conservative using neural network: 231
Predictions that flipped from conservative to liberal using neural network: 197


In [5]:
results_path = Path("./results/batch_preds.csv")

In [105]:
# to_csv does not create missing folders, so make sure the output directory exists.
results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_path)

In [6]:
pred_results = pd.read_csv(results_path, index_col=0)

In [6]:
pred_results

Unnamed: 0,sample_paths,lasso_results_1,nn_results_1,lasso_results_2,nn_results_2
0,data/new_crops/before/haircut face before after 3_89.jpg,1.0,1.0,0.0,1.0
1,data/new_crops/before/makeup before after 3_108.jpg,0.0,0.0,0.0,1.0
2,data/new_crops/before/makeup before after 3_55.jpg,0.0,0.0,0.0,0.0
3,data/new_crops/before/makeup before after arabic_51.jpg,1.0,1.0,1.0,1.0
4,data/new_crops/before/beard before after 2_30.jpg,0.0,0.0,0.0,0.0
...,...,...,...,...,...
1287,data/new_crops/before/makeup before after 3_39.jpg,1.0,0.0,1.0,1.0
1288,data/new_crops/before/glasses before after 1_11.jpg,0.0,0.0,0.0,0.0
1289,data/new_crops/before/beard before after 1_32.jpg,0.0,0.0,0.0,0.0
1290,data/new_crops/before/haircut face before after 1_69.jpg,0.0,0.0,0.0,1.0


In [7]:
lasso_1 = pred_results['lasso_results_1']
lasso_2 = pred_results['lasso_results_2']
nn_1 = pred_results['nn_results_1']
nn_2 = pred_results['nn_results_2']
fns = pred_results['sample_paths']

In [8]:
# extract filename without extension from path
fns = fns.map(lambda fn: Path(fn).stem)

In [9]:
categories = fns.map(lambda fn: fn.split()[0])

In [10]:
def get_cat(stem):
    """Return the category encoded in a filename stem.

    The category is the first whitespace-separated word of ``stem``. Known
    misspellings of that word are mapped to their canonical category name.

    Args:
        stem: Filename without extension, e.g. ``"makeup before after 3_55"``.

    Returns:
        The canonical category string, or ``stem`` unchanged when it contains
        no words at all.
    """
    cat_dict = {
        'makeupe': 'makeup',
        'hiardoo': 'hairdoo',
        'hairdoocut': 'haircut'
    }

    words = stem.split()
    if not words:
        return stem

    first_word = words[0]

    # Look up the first word, not the whole stem: indexing the dict with the
    # full stem raised KeyError for any multi-word stem with a typo prefix.
    return cat_dict.get(first_word, first_word)

In [11]:
categories = categories.map(get_cat)

In [12]:
idx = np.where(categories == 'images')[0]

In [13]:
# For stems whose first word is 'images', use the third word as the category instead.
# These replacement values are not passed through get_cat.
# NOTE(review): `idx` holds positions from np.where but is used here as index
# labels; presumably equivalent because the index is the default 0..n-1 -- confirm.
categories[idx] = fns[idx].map(lambda s: s.split()[2])

In [14]:
cat_names = categories.unique()
cat_names

array(['haircut', 'makeup', 'beard', 'drag', 'hairdoo', 'glasses'],
      dtype=object)

In [15]:
haircut_preds = pred_results[categories == 'haircut']

In [66]:
print(f"Number of images before and after haircuts: {len(haircut_preds)}")

Number of images before and after haircuts: 287


In [72]:
print(f"Haircut before predicted liberal: {len(haircut_preds[haircut_preds['lasso_results_1'] == 0])}")

Haircut before predicted liberal: 234


In [73]:
print(f"Haircut before predicted conservative: {len(haircut_preds[haircut_preds['lasso_results_1'] == 1])}")

Haircut before predicted conservative: 53


In [17]:
def get_blank_df():
    """Build an all-NaN summary table with one row per entry of ``cat_names``."""
    summary_columns = [
        'total',
        'before liberal',
        'before conservative',
        'after liberal',
        'after conservative',
    ]
    return pd.DataFrame(np.nan, index=cat_names, columns=summary_columns)

In [18]:
def get_analysis(df: pd.DataFrame = None, before_colname = "", after_colname = ""):
    """Count liberal/conservative predictions per category, before and after.

    Reads the notebook-level ``pred_results``, ``categories`` and
    ``cat_names``. Label 0 is counted as liberal and label 1 as conservative.

    Args:
        df: Optional table to fill in (modified in place, one row per
            category). A blank table from ``get_blank_df`` is used when omitted.
        before_colname: Column of ``pred_results`` with the "before" predictions.
        after_colname: Column of ``pred_results`` with the "after" predictions.

    Returns:
        A new DataFrame with one row per category plus a final ``'total'``
        row holding the column sums.
    """
    # `df == None` compares element-wise on a DataFrame and makes `if` raise
    # "truth value is ambiguous"; an identity test is what is intended.
    if df is None:
        df = get_blank_df()

    for cat in cat_names:
        pred_slice = pred_results[categories == cat]
        before = pred_slice[before_colname]
        after = pred_slice[after_colname]

        df.loc[cat, 'total'] = len(pred_slice)
        df.loc[cat, 'before liberal'] = (before == 0).sum()
        df.loc[cat, 'before conservative'] = (before == 1).sum()
        df.loc[cat, 'after liberal'] = (after == 0).sum()
        df.loc[cat, 'after conservative'] = (after == 1).sum()

    totals = df.sum()
    totals.name = 'total'
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
    return pd.concat([df, totals.to_frame().T])

In [32]:
lasso_analysis_df = get_analysis(before_colname="lasso_results_1", after_colname="lasso_results_2")

In [33]:
lasso_analysis_df

Unnamed: 0,total,before liberal,before conservative,after liberal,after conservative
haircut,287.0,234.0,53.0,236.0,51.0
makeup,386.0,332.0,54.0,336.0,50.0
beard,357.0,320.0,37.0,339.0,18.0
drag,168.0,146.0,22.0,141.0,27.0
hairdoo,68.0,60.0,8.0,59.0,9.0
glasses,26.0,24.0,2.0,21.0,5.0
total,1292.0,1116.0,176.0,1132.0,160.0


In [66]:
lasso_analysis_df.to_csv('./results/lasso_retest_analysis.csv')

In [29]:
nn_analysis_df = get_analysis(before_colname="nn_results_1", after_colname="nn_results_2")

In [30]:
nn_analysis_df

Unnamed: 0,total,before liberal,before conservative,after liberal,after conservative
haircut,287.0,142.0,145.0,134.0,153.0
makeup,386.0,205.0,181.0,206.0,180.0
beard,357.0,175.0,182.0,152.0,205.0
drag,168.0,76.0,92.0,69.0,99.0
hairdoo,68.0,24.0,44.0,27.0,41.0
glasses,26.0,15.0,11.0,15.0,11.0
total,1292.0,637.0,655.0,603.0,689.0


In [31]:
nn_analysis_df.to_csv('./results/nn_retest_analysis.csv')

In [34]:
from scipy.stats import chi2_contingency

In [49]:
lasso_total_preds = lasso_analysis_df.loc[['total']].values[:,1:]
lasso_total_preds

array([[1116.,  176., 1132.,  160.]])

In [51]:
# Build the 2x2 table from the totals extracted in the previous cell rather
# than re-typing the numbers: rows are before/after, columns liberal/conservative.
chi2_contingency(lasso_total_preds.reshape(2, 2))

(0.7697318251143873,
 0.380300027154185,
 1,
 array([[1124.,  168.],
        [1124.,  168.]]))

In [52]:
lasso_analysis_df['chi_squared'] = np.nan

In [64]:
def get_chi_squared(row):
    """Run ``chi2_contingency`` on the 2x2 before/after table stored in ``row``.

    Returns the first two elements of the result (test statistic, p-value).
    """
    before_counts = [row['before liberal'], row['before conservative']]
    after_counts = [row['after liberal'], row['after conservative']]

    outcome = chi2_contingency([before_counts, after_counts])
    return outcome[:2]
    

lasso_analysis_df['chi_squared'] = lasso_analysis_df.apply(get_chi_squared, axis=1)

In [65]:
lasso_analysis_df

Unnamed: 0,total,before liberal,before conservative,after liberal,after conservative,chi_squared
haircut,287.0,234.0,53.0,236.0,51.0,"(0.0117430441898527, 0.9137059038318389)"
makeup,386.0,332.0,54.0,336.0,50.0,"(0.1000115154306771, 0.7518158155181155)"
beard,357.0,320.0,37.0,339.0,18.0,"(6.3825631121534006, 0.011524688103102226)"
drag,168.0,146.0,22.0,141.0,27.0,"(0.38227974116475855, 0.5363856757737189)"
hairdoo,68.0,60.0,8.0,59.0,9.0,"(0.0, 1.0)"
glasses,26.0,24.0,2.0,21.0,5.0,"(0.6603174603174603, 0.4164478902261446)"
total,1292.0,1116.0,176.0,1132.0,160.0,"(0.7697318251143873, 0.380300027154185)"


In [66]:
lasso_analysis_df.to_csv('./results/lasso_retest_analysis.csv')

In [67]:
nn_analysis_df['chi_squared'] = nn_analysis_df.apply(get_chi_squared, axis=1)

In [68]:
nn_analysis_df

Unnamed: 0,total,before liberal,before conservative,after liberal,after conservative,chi_squared
haircut,287.0,142.0,145.0,134.0,153.0,"(0.3419657620854003, 0.5586967386040109)"
makeup,386.0,205.0,181.0,206.0,180.0,"(0.0, 1.0)"
beard,357.0,175.0,182.0,152.0,205.0,"(2.7307683189910628, 0.09843192132463244)"
drag,168.0,76.0,92.0,69.0,99.0,"(0.4367575374616357, 0.5086916201104175)"
hairdoo,68.0,24.0,44.0,27.0,41.0,"(0.12549019607843137, 0.7231545670823014)"
glasses,26.0,15.0,11.0,15.0,11.0,"(0.0, 1.0)"
total,1292.0,637.0,655.0,603.0,689.0,"(1.68849366359447, 0.19379965652517878)"


In [69]:
nn_analysis_df.to_csv('./results/nn_retest_analysis.csv')

In [59]:
def get_contingency_table(df: pd.DataFrame = None, before_colname = "", after_colname = ""):
    """Count paired before/after outcomes per category.

    Reads the notebook-level ``pred_results``, ``categories`` and
    ``cat_names``. Label 0 is liberal ("l") and label 1 conservative ("c"), so
    e.g. ``blac`` is "before liberal, after conservative".

    Args:
        df: Optional table to fill in (modified in place, one row per
            category). A blank table is created when omitted.
        before_colname: Column of ``pred_results`` with the "before" predictions.
        after_colname: Column of ``pred_results`` with the "after" predictions.

    Returns:
        A new DataFrame with columns ``total, blal, blac, bcac, bcal``, one
        row per category plus a final ``'total'`` row of column sums.
    """
    # `df == None` compares element-wise on a DataFrame and makes `if` raise
    # "truth value is ambiguous"; an identity test is what is intended.
    if df is None:
        df = pd.DataFrame(np.nan, index=cat_names,
                          columns=['total', 'blal', 'blac', 'bcac', 'bcal'])

    for cat in cat_names:
        pred_slice = pred_results[categories == cat]
        before = pred_slice[before_colname]
        after = pred_slice[after_colname]

        df.loc[cat, 'total'] = len(pred_slice)
        df.loc[cat, 'blal'] = ((before == 0) & (after == 0)).sum()
        df.loc[cat, 'blac'] = ((before == 0) & (after == 1)).sum()
        df.loc[cat, 'bcac'] = ((before == 1) & (after == 1)).sum()
        df.loc[cat, 'bcal'] = ((before == 1) & (after == 0)).sum()

    totals = df.sum()
    totals.name = 'total'
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
    return pd.concat([df, totals.to_frame().T])

In [85]:
lasso_contingency = get_contingency_table(before_colname="lasso_results_1", after_colname="lasso_results_2")

In [61]:
lasso_contingency

Unnamed: 0,total,blal,blac,bcac,bcal
haircut,287.0,210.0,24.0,27.0,26.0
makeup,386.0,306.0,26.0,24.0,30.0
beard,357.0,310.0,10.0,8.0,29.0
drag,168.0,124.0,22.0,5.0,17.0
hairdoo,68.0,54.0,6.0,3.0,5.0
glasses,26.0,20.0,4.0,1.0,1.0
total,1292.0,1024.0,92.0,68.0,108.0


In [84]:
def get_mcnemar(row, target='statistic'):
    """Apply ``mcnemar`` to the paired 2x2 table stored in ``row``.

    The table rows are the "before" label (liberal, conservative) and the
    columns the "after" label. ``target`` is accepted but not used by the body.

    Returns:
        Tuple ``(statistic, pvalue)`` taken from the test result.
    """
    paired_table = [
        [row['blal'], row['blac']],
        [row['bcal'], row['bcac']],
    ]
    outcome = mcnemar(paired_table)
    return outcome.statistic, outcome.pvalue

In [87]:
lasso_contingency.apply(get_mcnemar, axis=1)

haircut        (24.0, 0.887724827340783)
makeup        (26.0, 0.6888797607233395)
beard      (10.0, 0.0033778479119064286)
drag          (17.0, 0.5223973804968411)
hairdoo                       (5.0, 1.0)
glasses                     (1.0, 0.375)
total         (92.0, 0.2888204971747663)
dtype: object

In [91]:
# NOTE: the key below is the single tuple ('statistic', 'pvalue'), so this adds ONE
# column whose cells are (statistic, pvalue) tuples, not two separate columns.
# Later cells slice this frame positionally and rely on it being one trailing column.
lasso_contingency['statistic','pvalue'] = lasso_contingency.apply(get_mcnemar, axis=1)

In [92]:
lasso_contingency

Unnamed: 0,total,blal,blac,bcac,bcal,"(statistic, pvalue)"
haircut,287.0,210.0,24.0,27.0,26.0,"(24.0, 0.887724827340783)"
makeup,386.0,306.0,26.0,24.0,30.0,"(26.0, 0.6888797607233395)"
beard,357.0,310.0,10.0,8.0,29.0,"(10.0, 0.0033778479119064286)"
drag,168.0,124.0,22.0,5.0,17.0,"(17.0, 0.5223973804968411)"
hairdoo,68.0,54.0,6.0,3.0,5.0,"(5.0, 1.0)"
glasses,26.0,20.0,4.0,1.0,1.0,"(1.0, 0.375)"
total,1292.0,1024.0,92.0,68.0,108.0,"(92.0, 0.2888204971747663)"


In [93]:
lasso_contingency.to_csv('./results/lasso_mcnemar.csv')

In [94]:
nn_contingency = get_contingency_table(before_colname="nn_results_1", after_colname="nn_results_2")

In [95]:
nn_contingency['statistic','pvalue'] = nn_contingency.apply(get_mcnemar, axis=1)

In [96]:
nn_contingency

Unnamed: 0,total,blal,blac,bcac,bcal,"(statistic, pvalue)"
haircut,287.0,91.0,51.0,102.0,43.0,"(43.0, 0.47049211489037085)"
makeup,386.0,140.0,65.0,115.0,66.0,"(65.0, 1.0)"
beard,357.0,106.0,69.0,136.0,46.0,"(46.0, 0.03975050512654258)"
drag,168.0,39.0,37.0,62.0,30.0,"(30.0, 0.46381761789686937)"
hairdoo,68.0,18.0,6.0,35.0,9.0,"(6.0, 0.6072387695312499)"
glasses,26.0,12.0,3.0,8.0,3.0,"(3.0, 1.0)"
total,1292.0,406.0,231.0,458.0,197.0,"(197.0, 0.11057922806690416)"


In [97]:
nn_contingency.to_csv('./results/nn_mcnemar.csv')

### Simple matching coefficient (SMC) for Logistic Regression

In [22]:
def get_smc(row):
    """Return the simple matching coefficient for one contingency row.

    The coefficient is the share of pairs whose prediction did not change:
    ``(blal + bcac) / (blal + blac + bcac + bcal)``.

    Args:
        row: Mapping or Series with the four cell counts ``blal``, ``blac``,
            ``bcac`` and ``bcal``. Any other entries (such as ``total``) are
            ignored.

    Returns:
        The coefficient as a float between 0 and 1.
    """
    # Use the argument itself; the previous body read the notebook-level
    # `total` variable and silently ignored whatever row was passed in.
    agreeing = row['blal'] + row['bcac']
    changed = row['blac'] + row['bcal']
    return agreeing / (agreeing + changed)

In [2]:
lasso_contingency = pd.read_csv('./results/lasso_mcnemar.csv', index_col=0)

In [29]:
total = lasso_contingency.iloc[-1, 1:-1]

In [31]:
total

blal    1024.0
blac      92.0
bcac      68.0
bcal     108.0
Name: total, dtype: object

In [33]:
smc = get_smc(total)

In [34]:
smc

0.8452012383900929

In [38]:
total = lasso_contingency.iloc[-1, :-1]

In [39]:
total['blal'] / total['total']

0.7925696594427245

### SMC for Neural Network

In [42]:
nn_contingency = pd.read_csv('./results/nn_mcnemar.csv', index_col=0)

In [44]:
total = nn_contingency.iloc[-1, :-1]

In [47]:
total['blal'] / total['total']

0.3142414860681115

In [48]:
total['bcac'] / total['total']

0.3544891640866873

In [49]:
total = nn_contingency.iloc[-1, 1:-1]

In [50]:
# Reuse the helper defined for the logistic-regression case instead of
# repeating the formula inline; `total` holds the four neural-net cell counts.
smc = get_smc(total)

In [51]:
smc

0.6687306501547987

### Without "drag" sample

In [14]:
# Select by label instead of hard-coded positions, so the right rows are
# removed even if the category order changes: drop the "drag" category and the
# old totals row, and keep only the count columns.
df = lasso_contingency.drop(index=['drag', 'total'])[['total', 'blal', 'blac', 'bcac', 'bcal']]

In [16]:
# Recompute the totals row for the reduced table.
total = df.sum()
total.name = "total"
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
df = pd.concat([df, total.to_frame().T])

In [17]:
df

Unnamed: 0,total,blal,blac,bcac,bcal
haircut,287.0,210.0,24.0,27.0,26.0
makeup,386.0,306.0,26.0,24.0,30.0
beard,357.0,310.0,10.0,8.0,29.0
hairdoo,68.0,54.0,6.0,3.0,5.0
glasses,26.0,20.0,4.0,1.0,1.0
total,1124.0,900.0,70.0,63.0,91.0


In [18]:
df.to_csv('results/before_after_contingency_wo_drag.csv')

In [24]:
total = df.iloc[-1, 1:]

In [25]:
get_smc(total)

0.8567615658362989

In [33]:
def full_lasso_pipeline_proba(x):
    """Return the lasso model's class probabilities for a batch of images.

    Args:
        x: NumPy array of images in (n, height, width, channel) layout, as
            produced by ``preprocess``.

    Returns:
        The output of ``lasso_model.predict_proba`` on the ResNet features:
        one row per image, one column per class.
    """
    # Channels-last -> channels-first (n x 3 x H x W); torch.Tensor casts to float32.
    batch = torch.Tensor(x.transpose(0, 3, 1, 2)).to(device)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        features = resnet_model(batch)

    return lasso_model.predict_proba(features.detach().cpu().numpy())

In [25]:
def full_nn_pipeline_proba(x):
    """Return the neural-net classifier's sigmoid score for a batch of images.

    Args:
        x: NumPy array of images in (n, height, width, channel) layout, as
            produced by ``preprocess``.

    Returns:
        NumPy array with the sigmoid of ``binary_classifier`` applied to the
        ResNet features (values between 0 and 1, not rounded).
    """
    # Channels-last -> channels-first (n x 3 x H x W); torch.Tensor casts to float32.
    batch = torch.Tensor(x.transpose(0, 3, 1, 2)).to(device)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        scores = torch.sigmoid(binary_classifier(resnet_model(batch)))

    return scores.detach().cpu().numpy()

In [44]:
full_lime_pipeline(preprocess("./sample_images/alex.jpg"))

array([0])

In [43]:
# The probability helper defined above is `full_lasso_pipeline_proba`; the old
# name `full_lime_pipeline_proba` is not defined anywhere in the visible notebook
# and raises NameError on a fresh run.
full_lasso_pipeline_proba(preprocess("./sample_images/alex.jpg"))

array([[0.59232954, 0.40767046]])

In [20]:
full_lasso_pipeline_proba(preprocess("./sample_images/alex-drag-1.png"))

array([[0.5597691, 0.4402309]])

In [51]:
full_nn_pipeline_proba(preprocess("./sample_images/alex.jpg"))

array([[0.35277236]], dtype=float32)

In [52]:
full_nn_pipeline_proba(preprocess("./sample_images/alex-drag-3.jpg"))

array([[0.26840058]], dtype=float32)

In [30]:
full_nn_pipeline_proba(preprocess("./sample_images/mustabeen_no_makeup.jpeg"))

array([[0.5075296]], dtype=float32)

In [31]:
full_nn_pipeline_proba(preprocess("./sample_images/mustabeen_makeup.jpeg"))

array([[0.8909782]], dtype=float32)

In [34]:
full_lasso_pipeline_proba(preprocess("./sample_images/mustabeen_no_makeup.jpeg"))

array([[0.74059344, 0.25940656]])

In [35]:
full_lasso_pipeline_proba(preprocess("./sample_images/mustabeen_makeup.jpeg"))

array([[0.91779061, 0.08220939]])