### RQ2: What makes a bystander? Can it be described by high-level concepts (e.g., whether the person is posing or not)?
- Look at the survey to see what people report as the reasons for their decisions (what they think they think)
- Empirically try to regress / classify bystander from high level concepts
- Compare

**Note**: look into the Hosmer–Lemeshow test for goodness of fit.

In [2]:
# generic imports
import sys
import os
sys.path.append('../')
#import helper
from definitions import *

import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

# data structure
import itertools
from collections import Counter,defaultdict

# stats and ml
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm #https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.LogitResults.html
from scipy import stats
from sklearn import feature_selection
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
from sklearn.metrics import roc_curve

from sklearn.metrics import matthews_corrcoef

In [3]:
# Record the interpreter version this notebook was run with.
from platform import python_version

print(python_version())

3.6.0


In [4]:
# Survey columns whose cells hold the comma-separated reasons a participant selected.
text_response_cols = ['why_subject', 'why_bystander', 'why_neither']

# Load the pickled survey artefacts. Each file is opened in a `with` block so
# its handle is closed as soon as the object has been read.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that were produced by this project.
with open(os.path.join(survey_path, 'photo_df.pkl'), 'rb') as fh:
    photo_df = pickle.load(fh)
# NOTE(review): this path is built by plain concatenation (unlike the other
# two), so it only resolves inside survey_path if that ends with a separator.
with open(survey_path +'mappings_pilot2','rb') as fh:
    mapping = pickle.load(fh)
with open(os.path.join(survey_path, 'high-feature-df.pkl'), 'rb') as fh:
    feature_df = pickle.load(fh)

# Displayed as the cell result: feature table shape and number of distinct photo ids.
feature_df.shape, len(set(photo_df.index.values))

((4080, 14), 4080)

In [5]:
'''Insert image level concepts in photo_df'''
# Copy every column named in img_level_concepts from feature_df into photo_df,
# one column per iteration.
for c in img_level_concepts:
    photo_df[c]=feature_df[c]

# Participants' responses

In [6]:
def show_unique_texts(photo_df, show_res=True, response_cols=None):
    '''
    Count how often each selectable reason occurs in the response columns.

    Every non-null cell of a response column is split on ',' and the pieces
    are tallied per column.

    photo_df      -- dataframe holding the response columns.
    show_res      -- when True, print each column name followed by its (up to
                     100) most common reasons as "reason (count)".
    response_cols -- columns to tally; defaults to the notebook-level
                     `text_response_cols`.

    Returns a dict mapping column name -> collections.Counter of reasons.
    A column without any answer yields an empty Counter.

    Note: the free-text companion columns (`<column>_text`) are not tallied;
    earlier code split them but discarded the result, so that pass is gone.
    '''
    if response_cols is None:
        response_cols = text_response_cols

    unique_text_responses = dict()
    for col in response_cols:
        # Work on the column as a Series: a row-wise DataFrame.apply returns a
        # DataFrame when no row is left, which would make the tally count the
        # characters of the column name instead of reasons.
        answered = photo_df[col].dropna()
        unique_text_responses[col] = Counter(
            itertools.chain.from_iterable(answer.split(',') for answer in answered)
        )

    if show_res:
        for key, counts in unique_text_responses.items():
            print(key)
            for text, count in counts.most_common(100):
                print('{} ({})'.format(text, count))
            print()
    return unique_text_responses
from numpy import std, mean, sqrt

# Cohen's d using a pooled standard deviation, i.e. appropriate when the two
# groups are expected to share the same population S.D.
def cohen_d(x,y):
    '''Return (mean(x) - mean(y)) divided by the pooled sample standard deviation.'''
    size_x, size_y = len(x), len(y)
    # Sample variances (ddof=1), each weighted by its own degrees of freedom.
    weighted_x = (size_x - 1) * np.std(x, ddof=1) ** 2
    weighted_y = (size_y - 1) * np.std(y, ddof=1) ** 2
    pooled_variance = (weighted_x + weighted_y) / (size_x + size_y - 2)
    mean_gap = np.mean(x) - np.mean(y)
    return mean_gap / np.sqrt(pooled_variance)


In [None]:
'''For all responses'''
# Print the per-question tallies for the whole survey; the returned dict is discarded.
_ = show_unique_texts(photo_df=photo_df)

In [8]:
# Option texts for the 'why_subject' question. These strings are compared for
# equality against the comma-split survey answers further down, so they must
# stay verbatim (including any typos in the survey wording).
sub_responses = ['This photo is focused on this person.',
     'This person is taking a large space in the photo.',
     'This photo is about what this person was doing.',
     'This is the only person in the photo.',
     'This person was doing the same activity as other subject(s) in this photo.',
     'This person was interacting with other subject(s) in this photo.',
     'The appearance of this person is similar to other subject(s) of this photo.']

# Option texts for the 'why_bystander' question; same verbatim requirement.
bystander_responses = ['This photo is not focused on this person.',
    'This person just happened to be there when the photo was taken.',
    'Object(s) other than people are the subject(s) of this photo.',
    'The activity of this person is similar to other bystander(s) in this photo.',
    'Appearance of this person is similar to other bystanders in this photo.',
    'There is no specific subject in this photo.',
    'This person is interacting with other bystander(s).',
    'This person is blocked by other people/object.',
    'The activity of this person is different than other subjects(s) in this photo.',
    'Appearance of this person is different that other subjects in this photo.']

In [9]:

def get_response_matrix(photo_df, unique_responses, question):
    '''
    Tabulate, for every response, how often each other response accompanies it.

    For each entry of `unique_responses` the rows of `photo_df` whose
    `question` cell is a string containing that entry (substring test) are
    selected, the reasons in those rows are tallied with show_unique_texts,
    and the tallies of all *other* reasons (top 100) are stored under it.

    Returns a mapping response -> {co-occurring response: count}.
    Note: need to balance for number of times a photo was shown.
    '''
    def mentions(row, wanted):
        answer = row[question]
        return isinstance(answer, str) and wanted in answer

    matrix = defaultdict(lambda: defaultdict(int))
    for response in unique_responses:
        chosen = photo_df.apply(lambda row: mentions(row, response), axis=1)
        tallies = show_unique_texts(photo_df=photo_df[chosen], show_res = False)
        matrix[response] = {
            text: count
            for text, count in tallies[question].most_common(100)
            if text != response
        }
    return matrix

def get_response_dataframe(photo_df, unique_responses, question):
    '''
    Build a dataframe where each column is for one unique text response with binary
    value indicating if that text is in each of the responses in photo_df.

    photo_df         -- dataframe with the `question` column.
    unique_responses -- option texts; one 0/1 output column is made per entry.
    question         -- column holding comma-separated option texts.

    Rows whose `question` cell is null are dropped; the remaining index is
    preserved. When no row has an answer the result is an empty frame that
    still carries one column per response.

    Note: need to balance for number of times a photo was shown.
    '''
    # Split on the Series itself rather than with a row-wise DataFrame.apply:
    # the latter returns a DataFrame when no rows remain, which produced a
    # spurious NaN row labelled with the column name.
    answers = photo_df[question].dropna().map(lambda text: text.split(','))

    out = dict()
    for response in unique_responses:
        # Exact membership in the split list, not a substring test.
        out[response] = answers.map(lambda options, wanted=response: int(wanted in options))

    return pd.DataFrame(out)

def get_response_dataframe_with_high_columns(photo_df, unique_responses, features, question):
    '''
    Build a dataframe with one binary column per unique text response (1 if
    that text is among the comma-separated answers of the row) followed by the
    requested feature columns.

    photo_df         -- dataframe with `question`, 'subject_bystander_num' and
                        every column named in `features`.
    unique_responses -- option texts; one 0/1 column is made per entry.
    features         -- names of the feature columns to carry over.
    question         -- column holding comma-separated option texts.

    Only rows where the question, 'subject_bystander_num' and all features are
    non-null are kept. Feature columns are copied once, after the response
    columns, so they are present even when `unique_responses` is empty.

    Note: need to balance for number of times a photo was shown.
    '''
    features = list(features)

    texts = photo_df[[question,'subject_bystander_num']+features]
    texts = texts[~texts.isnull().any(axis=1)]

    # Split once on the Series; this also behaves when no complete row is left.
    answers = texts[question].map(lambda text: text.split(','))

    out = dict()
    for response in unique_responses:
        out[response] = answers.map(lambda options, wanted=response: int(wanted in options))
    for c in features:
        out[c] = texts[c]

    return pd.DataFrame(out)

In [None]:
# One 0/1 indicator frame per question: which listed reasons each answer contains.
response_df_subject =get_response_dataframe(photo_df, sub_responses, question='why_subject')
response_df_bystander =get_response_dataframe(photo_df, bystander_responses, question='why_bystander')

In [None]:
#helper.plot_corr_matrix(mat=response_df_subject.corr(method='kendall'))

In [None]:
#helper.plot_corr_matrix(mat=response_df_bystander.corr(method='kendall'))

In [None]:
# responses = get_response_matrix(photo_df, sub_responses, 'why_subject')
# df = pd.DataFrame(responses).loc[sub_responses][sub_responses]
# df

In [None]:
#helper.plot_corr_matrix(mat=df)

In [None]:
# responses = get_response_matrix(photo_df, bystander_responses, 'why_bystander')
# df = pd.DataFrame(responses).loc[bystander_responses][bystander_responses]
# df

In [None]:
#helper.plot_corr_matrix(mat=df)

### Correlation between high level concepts and selected text responses

In [11]:
# Indicator columns for the listed reasons combined with the numeric
# high-level concepts and the image-level concepts, one frame per question.
all_response_df_subject =get_response_dataframe_with_high_columns(
            photo_df, sub_responses, question='why_subject', features = high_level_concepts_num+img_level_concepts
                )
all_response_df_bystander =get_response_dataframe_with_high_columns(
            photo_df, bystander_responses, question='why_bystander', features = high_level_concepts_num+img_level_concepts
)

In [12]:
# Preview the first rows of the combined subject frame.
all_response_df_subject.head()#shape,all_response_df_bystander.shape

Unnamed: 0_level_0,The appearance of this person is similar to other subject(s) of this photo.,This is the only person in the photo.,This person is taking a large space in the photo.,This person was doing the same activity as other subject(s) in this photo.,This person was interacting with other subject(s) in this photo.,This photo is about what this person was doing.,This photo is focused on this person.,comfort_num,num_people,person_distance_axes_norm,person_size,photo_place_num,photographer_intention_num,posing_num,replacable_num,was_aware_num,will_num
photo_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
180,0,0,0,0,0,0,1,-2.0,1,0.200314,0.247632,-2.0,-1.0,0.0,-2.0,-2.0,-1.0
169,0,0,0,0,1,0,0,-2.0,5,0.36719,0.261564,-1.0,0.0,-2.0,-2.0,-1.0,-1.0
178,0,1,0,0,0,0,0,-1.0,1,0.226073,0.042239,-1.0,-2.0,-2.0,-1.0,-2.0,-1.0
179,0,0,0,0,0,0,1,2.0,3,0.047418,0.037202,-2.0,1.0,1.0,0.0,2.0,2.0
170,0,0,0,0,0,0,1,1.0,1,0.323908,0.016181,-1.0,2.0,1.0,0.0,2.0,0.0


In [None]:
# Look up 'subject_bystander_num' for five hand-picked photo ids (string index labels).
feature_df.loc[['180','169','178','179','170']][['subject_bystander_num']]

In [20]:
import math  # kept at cell level: the bystander analysis cell also calls math.fabs

# For every subject reason, keep the features that are both correlated with
# choosing that reason (Spearman) and differ between rows that chose it and
# rows that did not (Welch t-test plus Cohen's d). A feature is kept only when
# |r| >= .1, |d| >= .2 and both p-values are below .001.
candidate_features = high_level_concepts_num + img_level_concepts

sub_dfs=dict()
for response in sub_responses:
    picked = all_response_df_subject[response]==1
    not_picked = all_response_df_subject[response]==0
    kept = []
    for feature in candidate_features:
        values = all_response_df_subject[feature]
        rho, rho_p = stats.spearmanr(values, all_response_df_subject[response])
        with_reason = values[picked]
        without_reason = values[not_picked]
        t_stat, welch_p = stats.ttest_ind(with_reason, without_reason, equal_var=False)
        d = cohen_d(with_reason, without_reason)

        correlated = math.fabs(rho)>=.1 and rho_p<.001
        separated = math.fabs(d)>=.2 and welch_p<.001
        if correlated and separated:
            kept.append({'Feature': feature,
                         'correlation coefficient (r)': rho,
                         'effect-size (d)': d})
    # Reasons without any qualifying feature get no table at all.
    if kept:
        sub_dfs[response]=pd.DataFrame(kept)
keys=list(sub_dfs.keys())

In [21]:
# Print one LaTeX table per subject reason: features become the row index,
# are renamed through high_level_concepts_name, and values are rounded to 2 decimals.
for k in keys:
    print(k)
    print(sub_dfs[k].set_index('Feature').T.rename(columns=high_level_concepts_name).T.round(2).to_latex())
    print('\n\n')

This photo is focused on this person.
\begin{tabular}{lrr}
\toprule
{} &  correlation coefficient (r) &  effect-size (d) \\
Feature          &                              &                  \\
\midrule
Awareness        &                         0.17 &             0.36 \\
Pose             &                         0.19 &             0.42 \\
Comfort          &                         0.15 &             0.30 \\
Willingness      &                         0.15 &             0.30 \\
Replaceable      &                        -0.20 &            -0.39 \\
Size             &                         0.35 &             0.69 \\
Distance         &                        -0.29 &            -0.63 \\
Number of people &                        -0.37 &            -0.82 \\
\bottomrule
\end{tabular}




This person is taking a large space in the photo.
\begin{tabular}{lrr}
\toprule
{} &  correlation coefficient (r) &  effect-size (d) \\
Feature          &                              &                  \\
\

In [22]:
# Same screening as for the subject reasons, applied to the bystander reasons:
# a feature is kept when |r| >= .1, |d| >= .2 and both p-values are below .05.
# Relies on `math` having been imported by an earlier cell.
by_candidate_features = high_level_concepts_num + img_level_concepts

by_dfs=dict()
for response in bystander_responses:
    picked = all_response_df_bystander[response]==1
    not_picked = all_response_df_bystander[response]==0
    kept = []
    for feature in by_candidate_features:
        values = all_response_df_bystander[feature]
        rho, rho_p = stats.spearmanr(values, all_response_df_bystander[response])
        with_reason = values[picked]
        without_reason = values[not_picked]
        t_stat, welch_p = stats.ttest_ind(with_reason, without_reason, equal_var=False)
        d = cohen_d(with_reason, without_reason)

        correlated = math.fabs(rho)>=.1 and rho_p<.05
        separated = math.fabs(d)>=.2 and welch_p<.05
        if correlated and separated:
            kept.append({'Feature': feature,
                         'correlation coefficient (r)': rho,
                         'effect-size (d)': d})
    # Reasons without any qualifying feature get no table at all.
    if kept:
        by_dfs[response]=pd.DataFrame(kept)
by_keys=list(by_dfs.keys())

In [23]:
# Print one LaTeX table per bystander reason: features become the row index,
# are renamed through high_level_concepts_name, and values are rounded to 2 decimals.
for k in by_keys:
    print(k)
    print(by_dfs[k].set_index('Feature').T.rename(columns=high_level_concepts_name).T.round(2).to_latex())
    print('\n\n')

This photo is not focused on this person.
\begin{tabular}{lrr}
\toprule
{} &  correlation coefficient (r) &  effect-size (d) \\
Feature     &                              &                  \\
\midrule
Awareness   &                        -0.25 &            -0.59 \\
Pose        &                        -0.31 &            -0.77 \\
Comfort     &                        -0.25 &            -0.49 \\
Willingness &                        -0.26 &            -0.52 \\
Replaceable &                         0.16 &             0.31 \\
Photo place &                        -0.22 &            -0.52 \\
Size        &                        -0.20 &            -0.44 \\
Distance    &                         0.21 &             0.46 \\
\bottomrule
\end{tabular}




This person just happened to be there when the photo was taken.
\begin{tabular}{lrr}
\toprule
{} &  correlation coefficient (r) &  effect-size (d) \\
Feature     &                              &                  \\
\midrule
Awareness   &           

### How the presence of other people influences the decision

In [None]:
'''Number of photos with subjects/bystander across total number of people'''

# Distinct index labels of photo_df, then a preview of the feature table.
photos = set(photo_df.index.values)
feature_df.head()

In [None]:
# Load the reverse map, where key = image ID and the value is the list of
# photo numbers used in the survey. The length of the list indicates how many
# actual people are in the photo.
# Files are opened in `with` blocks so the handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that were produced by this project.
with open(os.path.join(survey_path, 'reverse-map.pkl'), 'rb') as fh:
    reverse_map = pickle.load(fh)

# Load another dict where keys are the number of people and values are lists
# of image IDs containing that many people.
with open(os.path.join(survey_path, 'photo-by-people.pkl'), 'rb') as fh:
    photo_by_people = pickle.load(fh)

In [None]:
'''
In a photo with x number of people, how many of them are subject(s)?
'''
# For each people-count from 1 to 5, build a histogram of how many of the
# people in each photo have label == 1, then print counts and percentages.
# Note: `indices` and `df` stay defined after the loop (values from the last
# photo processed) and are visible to later cells.
for num_people in range(1, 6):
    num_subject = [0]*(num_people+1) # num_subject[k] = number of photos in which exactly k people are labelled subject
    for photoId in photo_by_people[num_people][:]: #for each photo
        #print(photoId,reverse_map[photoId])
        # NOTE(review): `id` shadows the builtin inside this comprehension.
        indices =[str(id) for id in reverse_map[photoId]] #find indices of this photo in survey dataframe
        #print(indices)
        df = feature_df.loc[indices] # find out the survey data for the photo indices
        sub_count = len(df[df.label==1]) # how many of them are subject
        #print(sub_count)
        num_subject[sub_count]+=1 # increase count
        
    #print(num_subject)
    print('Total photos with {} people: {}'.format(num_people, len(photo_by_people[num_people])))
    for s in range(num_people+1):
        print('\t{} subject: {} ({:.2f}%)'.format(s, num_subject[s], num_subject[s]*100/len(photo_by_people[num_people])))
    print()

In [None]:
# NOTE(review): `indices` is whatever an earlier-executed cell left behind
# (in top-to-bottom order, the last photo of the loop above) — confirm intent.
photo_df.loc[indices][['subject_bystander_num','why_subject']]

In [None]:
'''
When only one person in a 2-person photo are categorized as subjects,
what are the reasons for this categorization?
'''
# Collect the survey indices of every 2-person photo in which exactly one
# person has label == 1, then tally the reasons given for those rows.
indices = []
for photoId in photo_by_people[2]: #for each photo
    idx =[str(id) for id in reverse_map[photoId]] #find indices of this photo in survey dataframe
    df = feature_df.loc[idx] # find out the survey data for the photo indices
    sub_count = len(df[df.label==1]) # how many of them are subject
    if sub_count==1:
        indices += idx # keep all people of this photo, not only the subject
print(len(indices))
_ = show_unique_texts(photo_df=photo_df.loc[indices])

In [None]:
'''
When both persons in a 2-person photo are categorized as subjects,
what are the reasons for this categorization?
'''
# Collect the survey indices of every 2-person photo in which both people
# have label == 1, then tally the reasons given for those rows.
indices = []
for photoId in photo_by_people[2]: #for each photo
    idx =[str(id) for id in reverse_map[photoId]] #find indices of this photo in survey dataframe
    df = feature_df.loc[idx] # find out the survey data for the photo indices
    sub_count = len(df[df.label==1]) # how many of them are subject
    if sub_count==2:
        indices += idx # keep all people of this photo
print(len(indices))
_ = show_unique_texts(photo_df=photo_df.loc[indices])

In [None]:
'''
Some sample 2-persons photos, where only one of them was categorized as 'subject'.
'''
# NOTE(review): `import helper` is commented out at the top of the notebook;
# this call only works if `helper` is provided some other way (e.g. by
# `from definitions import *`) — confirm.
helper.draw_photos_from_path(photo_paths=[survey_photo_path+str(i)+'.jpg' for i in [3065,2428,638,3399,3294,3180,3252,3654]], col_size=3)

### Note:
In some 2-person photos, when one of the two people was categorized, that person's relationship with the other was selected as a reason (e.g. looking at or interacting with another subject). However, in many of these photos the other person was categorized as a bystander. This is probably because the two photos (one per person) were annotated by different sets of participants. Still, the relationships among people are likely to be important.

In [None]:
'''
When only one person in a 3-person photo are categorized as subjects,
what are the reasons for this categorization?
'''
# Collect the survey indices of every 3-person photo in which exactly one
# person has label == 1, then tally the reasons given for those rows.
indices = []
for photoId in photo_by_people[3]: #for each photo
    idx =[str(id) for id in reverse_map[photoId]] #find indices of this photo in survey dataframe
    df = feature_df.loc[idx] # find out the survey data for the photo indices
    sub_count = len(df[df.label==1]) # how many of them are subject
    if sub_count==1:
        indices += idx # keep all people of this photo, not only the subject
print(len(indices))
_ = show_unique_texts(photo_df=photo_df.loc[indices])

In [None]:
# The bare `indices` line has no effect (only the last expression of a cell is
# displayed); the cell shows the reasons for the first ten collected rows.
indices
photo_df.loc[indices[:10]][['why_subject','why_bystander']]

## Classify subject/bystander from high level concepts

In [None]:
# Mutual information between a single predictor and the target; the predictor
# with the highest MI can then be selected to train a classifier.
#
# The loop walks the numeric predictor columns directly instead of indexing
# them by the length of the (separate) high_level_concepts list, and every
# score is printed (the former `True or mi[0]>=.2` guard never filtered).
# NOTE(review): random_state=None is kept, so scores may vary slightly between
# runs — pass a fixed seed if reproducible numbers are needed.
for concept in high_level_concepts_num:
    mi = feature_selection.mutual_info_classif(feature_df[concept].values.reshape(-1,1),
                                               feature_df.label.values,
                                               discrete_features='auto', n_neighbors=3, copy=True, random_state=None)
    print('{}: {:.2f}'.format(concept, mi[0]))

In [None]:
def test_logit(data, predictors, label='label', normalize=True):
    '''
    Fit a statsmodels logistic regression of `label` on `predictors`.

    data       -- dataframe containing the predictor and label columns.
    predictors -- list of predictor column names.
    label      -- name of the outcome column.
    normalize  -- when True, z-score every predictor column before fitting.

    An intercept column is added with sm.add_constant. Returns the fitted
    result of sm.Logit(...).fit with convergence output suppressed.
    '''
    design = data[predictors].apply(stats.zscore) if normalize else data[predictors]
    design = sm.add_constant(design)
    logit = sm.Logit(endog=data[label], exog=design)
    return logit.fit(disp = False)

def print_chisq(model):
    '''Print the model's likelihood-ratio statistic (`llr`) and its p-value (`llr_pvalue`), both to 2 decimals.'''
    chisq, p_value = model.llr, model.llr_pvalue
    line = '\nChisq:{:.2f}, p:{:.2f}\n'.format(chisq, p_value)
    print(line)
    
def get_Rsq(model):
    '''Return the likelihood-ratio statistic divided by -2 times the null log-likelihood (a pseudo R^2).'''
    null_deviance = - 2*model.llnull
    return model.llr / null_deviance

def get_model_summary(model):
    '''
    Collect the headline fit statistics of a fitted model in a dict:
    'Chi^2' (likelihood-ratio statistic), 'p(Chi^2)' (its p-value) and
    'R^2' (llr / (-2 * llnull)).
    '''
    chisq = model.llr
    return {
        'Chi^2': chisq,
        'p(Chi^2)': model.llr_pvalue,
        'R^2': chisq / (- 2*model.llnull),
    }

def get_OR(model):
    '''
    Return the exponentiated coefficients of `model` with the exponentiated
    bounds from model.conf_int(), as a dataframe with columns
    'OR', '2.5%' and '97.5%' (one row per model term).
    '''
    table = model.conf_int()
    table.columns = ['2.5%', '97.5%']
    # Put the point estimate in front so the column order is OR, lower, upper.
    table.insert(0, 'OR', model.params)
    return np.exp(table)

In [None]:
# Keep only the rows labelled 1 or -1 and recode the label to 1 / 0
# (1 stays 1, everything else becomes 0) for the logistic regressions.
# .copy() makes binary_data an independent frame, so writing the recoded
# column (and the columns added later) does not assign into a slice of
# feature_df. The recoding is done on the column, which also works when the
# filter leaves no rows.
binary_data = feature_df[(feature_df.label==1) | (feature_df.label==-1)].copy()
binary_data['label'] = (binary_data['label']==1).astype('int64')

In [None]:
def get_null_rows(df):
    '''Return the rows of `df` that contain a null, restricted to the columns that contain a null.'''
    missing = df.isnull()
    columns_with_gaps = df.columns[missing.any()]
    rows_with_gaps = missing.any(axis=1)
    return df[rows_with_gaps][columns_with_gaps]

In [None]:
# The first line is a bare expression with no effect; only the correlation
# matrix on the last line is displayed.
binary_data[['photographer_intention_num','label']]
feature_df.corr()

In [None]:
'''Use all high level concepts'''
# Fit, print the statsmodels summary and the chi-square / R^2 line, then
# display the odds ratios with their interval bounds.
model = test_logit(binary_data, high_level_concepts_num)
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
'''Use high level concepts found useful from the factor analysis'''
# Two-predictor model: posing and replaceability.
model = test_logit(binary_data, ['posing_num','replacable_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
'''Use high level concepts found useful from the factor analysis combined with predictors
with low factor loading'''
# Four-predictor model: posing, replaceability, photographer intention, photo place.
model = test_logit(binary_data, ['posing_num','replacable_num','photographer_intention_num',
                                'photo_place_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Pairwise correlations among the three image-level measurements.
feature_df[['person_size','person_distance','num_people']].corr()

In [None]:
'''Use high level concepts found useful from the factor analysis combined with the
image level predictors'''
# Four-predictor model: posing, replaceability, person size, number of people.
model = test_logit(binary_data, [ 'posing_num','replacable_num',
                                 'person_size','num_people'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
'''Use factors (PCA) instead of predictors'''

'''NOTE: PCA gives worse result when all the variables are used.'''
# z-score the seven high-level concept columns, then fit a 2-component PCA.
normalized_data = binary_data[['was_aware_num',
 'posing_num',
 'comfort_num',
 'will_num',
 'photographer_intention_num',
 'replacable_num'
 ,'photo_place_num'
 ]].apply(stats.zscore)
n_comp = 2
pca = PCA(n_components=n_comp).fit(normalized_data.values)

# NOTE(review): labelling the component columns with high_level_concepts
# assumes that list has the same length and order as the seven columns
# selected above — confirm against definitions.
pca_df = pd.DataFrame(pca.components_, columns=high_level_concepts)
# Scale each component by the square root of its explained variance.
component_matrix = pca.explained_variance_**.5 * pca_df.T
component_matrix

In [None]:
# Project the normalized data onto the two PCA components and use those
# scores (plus an intercept) as the predictors of a logistic regression.
component_data = pca.transform(normalized_data.values)
component_data = pd.DataFrame(component_data, columns=['F1',"F2"])

X1 = sm.add_constant(component_data)
y = binary_data['label'].values
model = sm.Logit(endog=y, exog=X1).fit(disp = False)

print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Preview the PCA design matrix (intercept plus F1, F2).
X1.head()

In [None]:
# NOTE(review): no cell in this notebook assigns a global `X` (it only exists
# as a local inside test_logit); this cell raises NameError unless `X` comes
# from elsewhere (e.g. `from definitions import *`) — confirm or remove.
X.head()

In [None]:
'''Use factors (FA) instead of predictors'''

# z-score the seven high-level concept columns.
normalized_data = binary_data[['was_aware_num',
 'posing_num',
 'comfort_num',
 'will_num',
 'photographer_intention_num',
 'replacable_num'
 ,'photo_place_num'
 ]].apply(stats.zscore)

n_comp = 2

# Fit a 2-factor model with a fixed random_state.
fa = FactorAnalysis(random_state=0, svd_method ='lapack', 
                    n_components=n_comp).fit(normalized_data)

# Factor scores per row, used below as predictors.
factor_data = fa.transform(normalized_data)
factor_data = pd.DataFrame(factor_data, columns=['F1',"F2"])

# Logistic regression of the binary label on the two factor scores plus intercept.
X2 = sm.add_constant(factor_data)
y = binary_data['label'].values
model = sm.Logit(endog=y, exog=X2).fit(disp = False)

print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)


### Other tests

In [None]:
# Use predictors one by one: fit a single-predictor logistic regression for
# every numeric high-level concept, print its summary, and collect the fit
# statistics together with the predictor's odds ratio and interval bounds.
report_columns = ['Predictor', 'OR', '2.5%', '97.5%','Chi^2', 'p(Chi^2)', 'R^2']

dicts = []
for pred in high_level_concepts_num:
    model = test_logit(binary_data, [pred])
    print(model.summary())
    odds = get_OR(model).loc[pred]
    d = get_model_summary(model)
    d.update({'Predictor': pred,
              'OR': odds.OR,
              '2.5%': odds['2.5%'],
              '97.5%': odds['97.5%']})
    dicts.append(d)
# One row per predictor, rounded to 3 decimals.
pd.DataFrame(dicts)[report_columns].set_index("Predictor").round(3)

In [None]:
# Model with posing and photographer intention.
model = test_logit(binary_data, [ 'posing_num', 'photographer_intention_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with posing, replaceability and photographer intention.
model = test_logit(binary_data, [ 'posing_num','replacable_num', 'photographer_intention_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with awareness, replaceability and photographer intention.
model = test_logit(binary_data, [ 'was_aware_num','replacable_num', 'photographer_intention_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with photographer intention and replaceability.
model = test_logit(binary_data, ['photographer_intention_num', 'replacable_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with photographer intention, replaceability and awareness
# (same predictors as an earlier cell, listed in a different order).
model = test_logit(binary_data, ['photographer_intention_num', 'replacable_num',
                                 'was_aware_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with photographer intention, replaceability and willingness.
model = test_logit(binary_data, ['photographer_intention_num', 'replacable_num',
                                 'will_num'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Add squared and square-root transforms of person distance and person size.
# Computed on whole columns instead of row by row: same values for
# non-negative inputs, and it also works when binary_data has no rows
# (a row-wise apply returns a DataFrame there and the assignment fails).
binary_data['dist_sqr'] = binary_data['person_distance'] ** 2
binary_data['dist_sqrt'] = binary_data['person_distance'] ** .5
binary_data['size_sqr'] = binary_data['person_size'] ** 2
binary_data['size_sqrt'] = binary_data['person_size'] ** .5
binary_data.head()

In [None]:
# Single-predictor model on the square root of person size.
model = test_logit(binary_data, [ 'size_sqrt'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Model with raw person distance and the square root of person size.
model = test_logit(binary_data, [ 'person_distance', 'size_sqrt'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Single-predictor model on squared person distance.
model = test_logit(binary_data, [ 'dist_sqr'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Single-predictor model on the square root of person distance.
model = test_logit(binary_data, [ 'dist_sqrt'])
print(model.summary())
print(pd.DataFrame([get_model_summary(model)]))
get_OR(model)

In [None]:
# Sum of five hard-coded counts.
# NOTE(review): presumably the number of photos per people-count (1..5) from
# the breakdown printed earlier — confirm the source of these numbers.
np.sum([1307, 615, 318, 206, 137])