# Step 6. Descriptive statistics and quality checks

In [None]:
import os
import datetime

import numpy as np
import scipy
import fiona
import statistics
import math

import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString, shape, mapping, Point, Polygon, MultiPolygon
from shapely.ops import cascaded_union, transform
import pyproj

import matplotlib.pyplot as plt
from matplotlib import colors, cm, style
import matplotlib.patches as mpatches
# from descartes import PolygonPatch

import osmnx as ox
import networkx as nx

import rasterio
from rasterio import MemoryFile
from rasterio.plot import show
from rasterio.mask import mask
import json

import contextily as cx
import folium
from folium.features import DivIcon

import random

In [None]:
from getpass import getpass

import requests
from requests import Request, Session

import hashlib
import hmac
import base64
import urllib.parse as urlparse

from datetime import date
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
from scipy import stats
import seaborn as sns
from tabulate import tabulate

## Define cities and other settings

In [None]:
# Global switch for the exploratory cells: code wrapped in `if print_and_plot:`
# further down only runs when this is set to True.
print_and_plot = False

In [None]:
# Cities to analyse; each name is substituted into the preprocessed file names
# when the data is loaded.
place_names = ['Rotterdam', 'Barcelona', 'Goteborg']

In [None]:
# Root folder of the survey data; '~' is expanded to the current user's home directory.
confidential_folder = os.path.expanduser('~/confidential_folder')

In [None]:
# Seed Python's `random` module so that calls such as `random.choice` are reproducible.
# NOTE(review): pandas/geopandas `.sample()` does not use this seed; pass
# `random_state=random_state` to `.sample()` if row sampling must be reproducible too.
random_state = 42
random.seed(random_state)

In [None]:
# Selects the `radius_<radius>` sub-folder of the preprocessed data
# (unit not visible here - presumably metres; TODO confirm).
radius = 15

## Read and preprocess data

In [None]:
# start with one empty frame per dataset; the per-city data is appended to these below
(perceptions,
 activity_perceptions,
 greenness_perceptions,
 activity_perception_points,
 greenness_perception_points) = (gpd.GeoDataFrame() for _ in range(5))

In [None]:
for place_name in place_names:

    # all preprocessed files for the chosen radius live in the same folder
    source_folder = os.path.join(confidential_folder, 'preprocessed', 'radius_{}'.format(radius))

    def read_place_file(file_name_template):
        """Read one preprocessed file of the current city from the radius folder."""
        return gpd.read_file(os.path.join(source_folder, file_name_template.format(place_name)))

    perceptions_place = read_place_file('perceptions_{}.csv')
    activity_perceptions_place = read_place_file('activity_perceptions_{}.geojson')
    greenness_perceptions_place = read_place_file('greenness_perceptions_{}.geojson')
    activity_perception_points_place = read_place_file('activity_perception_points_{}.geojson')
    greenness_perception_points_place = read_place_file('greenness_perception_points_{}.geojson')

    # tag every row with the city it belongs to
    for frame_place in (perceptions_place,
                        activity_perceptions_place,
                        greenness_perceptions_place,
                        activity_perception_points_place,
                        greenness_perception_points_place):
        frame_place['place_name'] = place_name

    # append the city's rows to the combined frames and renumber the index
    perceptions = pd.concat([perceptions, perceptions_place]).reset_index(drop=True)
    activity_perceptions = pd.concat([activity_perceptions, activity_perceptions_place]).reset_index(drop=True)
    greenness_perceptions = pd.concat([greenness_perceptions, greenness_perceptions_place]).reset_index(drop=True)
    activity_perception_points = pd.concat([activity_perception_points, activity_perception_points_place]).reset_index(drop=True)
    greenness_perception_points = pd.concat([greenness_perception_points, greenness_perception_points_place]).reset_index(drop=True)

In [None]:
if print_and_plot:
    # number of perception points before the reference locations are removed
    for point_frame in (activity_perception_points, greenness_perception_points):
        print(len(point_frame))

In [None]:
# keep the rows of the reference locations in separate frames
activity_perceptions_forreference = activity_perceptions.loc[activity_perceptions['place_category'].eq('for_reference')]
greenness_perceptions_forreference = greenness_perceptions.loc[greenness_perceptions['place_category'].eq('for_reference')]
activity_perception_points_forreference = activity_perception_points.loc[activity_perception_points['place_category'].eq('for_reference')]
greenness_perception_points_forreference = greenness_perception_points.loc[greenness_perception_points['place_category'].eq('for_reference')]

In [None]:
# Drop the reference locations from the working frames.
# `.copy()` makes each result an independent frame: the cells below assign columns and
# call `replace(..., inplace=True)` on them, which on a filtered selection triggers
# pandas' chained-assignment (SettingWithCopy) warnings.
activity_perceptions = activity_perceptions[activity_perceptions.place_category!='for_reference'].copy()
greenness_perceptions = greenness_perceptions[greenness_perceptions.place_category!='for_reference'].copy()
activity_perception_points = activity_perception_points[activity_perception_points.place_category!='for_reference'].copy()
greenness_perception_points = greenness_perception_points[greenness_perception_points.place_category!='for_reference'].copy()

In [None]:
if print_and_plot:
    # number of perception points that remain after removing the reference locations
    for point_frame in (activity_perception_points, greenness_perception_points):
        print(len(point_frame))

In [None]:
num_fields = ['clicks']
# convert via float first, then to integer
# (presumably the values were read as text such as '3.0' - TODO confirm)
perceptions[num_fields] = perceptions[num_fields].astype(float).astype(int)

In [None]:
num_fields = ['clicks', 'physical_rating_num', 'social_rating_num', 'relax_rating_num', 'commute_rating_num', 'children_rating_num']
# numeric columns: convert via float first, then to integer
activity_perceptions[num_fields] = activity_perceptions[num_fields].astype(float).astype(int)
activity_perceptions['ndvi_median'] = activity_perceptions['ndvi_median'].astype(float)

# the proximity flags hold the strings 'True'/'False'; map them to real booleans
near_flags = ['near_regular_greenspace', 'near_pocket_greenspace', 'near_square', 'near_playspace', 'near_street']
activity_perceptions.replace(
    {flag: {'True': True, 'False': False} for flag in near_flags},
    inplace=True)

In [None]:
num_fields = ['clicks', 'greenness_rating_num']
# numeric columns: convert via float first, then to integer
greenness_perceptions[num_fields] = greenness_perceptions[num_fields].astype(float).astype(int)
greenness_perceptions['ndvi_median'] = greenness_perceptions['ndvi_median'].astype(float)

# the proximity flags hold the strings 'True'/'False'; map them to real booleans
near_flags = ['near_regular_greenspace', 'near_pocket_greenspace', 'near_square', 'near_playspace', 'near_street']
greenness_perceptions.replace(
    {flag: {'True': True, 'False': False} for flag in near_flags},
    inplace=True)

In [None]:
num_fields = ['clicks', 'physical_rating_num', 'social_rating_num', 'relax_rating_num', 'commute_rating_num', 'children_rating_num']
# numeric columns: convert via float first, then to integer
activity_perception_points[num_fields] = activity_perception_points[num_fields].astype(float).astype(int)
activity_perception_points['ndvi_median'] = activity_perception_points['ndvi_median'].astype(float)

# the proximity flags hold the strings 'True'/'False'; map them to real booleans
near_flags = ['near_regular_greenspace', 'near_pocket_greenspace', 'near_square', 'near_playspace', 'near_street']
activity_perception_points.replace(
    {flag: {'True': True, 'False': False} for flag in near_flags},
    inplace=True)

In [None]:
num_fields = ['clicks', 'greenness_rating_num']
# numeric columns: convert via float first, then to integer
greenness_perception_points[num_fields] = greenness_perception_points[num_fields].astype(float).astype(int)
greenness_perception_points['ndvi_median'] = greenness_perception_points['ndvi_median'].astype(float)

# the proximity flags hold the strings 'True'/'False'; map them to real booleans
near_flags = ['near_regular_greenspace', 'near_pocket_greenspace', 'near_square', 'near_playspace', 'near_street']
greenness_perception_points.replace(
    {flag: {'True': True, 'False': False} for flag in near_flags},
    inplace=True)

## Agreement

In [None]:
# median greenness rating per panorama (gsv_pano_id), stored under a descriptive column name
greenness_perceptions_perplace = (
    greenness_perceptions
    .groupby('gsv_pano_id')[['greenness_rating_num']]
    .median()
    .rename(columns={'greenness_rating_num': 'greenness_rating_num_medianperplace'})
)

In [None]:
# attach the place-median to every individual rating ...
individual_ratings = greenness_perceptions[['prolific_id', 'gsv_pano_id', 'greenness_rating_num']]
place_medians = greenness_perceptions_perplace[['greenness_rating_num_medianperplace']]
greenness_perceptions_potentialoutliers = individual_ratings.merge(place_medians, on='gsv_pano_id', how='left')

# ... and store the signed and the absolute deviation from that median
deviation_from_median = (
    greenness_perceptions_potentialoutliers['greenness_rating_num']
    - greenness_perceptions_potentialoutliers['greenness_rating_num_medianperplace']
)
greenness_perceptions_potentialoutliers['greenness_rating_num_difference'] = deviation_from_median
greenness_perceptions_potentialoutliers['greenness_rating_num_absdifference'] = deviation_from_median.abs()

In [None]:
if print_and_plot:
    # histogram of the signed deviations from the place-median
    greenness_perceptions_potentialoutliers['greenness_rating_num_difference'].hist()

In [None]:
# total number of individual greenness ratings (denominator for the shares printed below)
n_total = len(greenness_perceptions_potentialoutliers)

In [None]:
# count the ratings per band of absolute deviation from the place-median
abs_deviation = greenness_perceptions_potentialoutliers['greenness_rating_num_absdifference']
n_diff_0 = int((abs_deviation == 0).sum())
n_diff_1 = int(((abs_deviation > 0) & (abs_deviation <= 1)).sum())
n_diff_2 = int(((abs_deviation > 1) & (abs_deviation <= 2)).sum())
n_diff_more = int((abs_deviation > 2).sum())

In [None]:
# share of ratings per deviation band, as a percentage of all ratings
share_messages = [
    (n_diff_0, '{}% of perceptions equal to place-median'),
    (n_diff_1, '{}% of perceptions not equal, but up to 1. from place-median'),
    (n_diff_2, '{}% of perceptions up to 2. (excluding 1.) from place-median'),
    (n_diff_more, '{}% of perceptions more than 2. from place-median'),
]
for band_count, message in share_messages:
    print(message.format(round(100*band_count/n_total, 3)))

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_greenness = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'greenness_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in greenness_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_greenness.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_physical = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'physical_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in activity_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_physical.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_social = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'social_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in activity_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_social.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_relax = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'relax_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in activity_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_relax.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_commute = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'commute_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in activity_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_commute.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# Agreement table: one row per panorama, one column per rating value (0-4),
# each cell holding how many participants gave that rating.
agreement_table_children = pd.DataFrame(columns=['0', '1', '2', '3', '4'])
col = 'children_rating_num'

# Visit every panorama exactly once. Iterating over the raw gsv_pano_id column would
# rebuild the same row once per rating; sort=False keeps first-appearance order.
for gsv_pano_id, group in activity_perceptions.groupby('gsv_pano_id', sort=False)[col]:
    ratings = group.tolist()
    # ignore gsv_pano_ids with less than 2 ratings
    if len(ratings)>=2:
        agreement_table_children.loc[gsv_pano_id] = [ratings.count(value) for value in range(5)]

In [None]:
# folder that receives the exported agreement tables
export_sub_folder = os.path.join(confidential_folder, 'preprocessed', 'agreement')

In [None]:
# Create the export folder if needed. `exist_ok=True` avoids the check-then-create race,
# and `makedirs` also creates missing parent folders, which `os.mkdir` would fail on.
os.makedirs(export_sub_folder, exist_ok=True)

In [None]:
# write the greenness agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'greenness_ratings_agreement_table.xlsx')
agreement_table_greenness.to_excel(output_file)

In [None]:
# write the physical-activity agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'physical_ratings_agreement_table.xlsx')
agreement_table_physical.to_excel(output_file)

In [None]:
# write the social-activity agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'social_ratings_agreement_table.xlsx')
agreement_table_social.to_excel(output_file)

In [None]:
# write the relaxation agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'relax_ratings_agreement_table.xlsx')
agreement_table_relax.to_excel(output_file)

In [None]:
# write the commuting agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'commute_ratings_agreement_table.xlsx')
agreement_table_commute.to_excel(output_file)

In [None]:
# write the children's-activity agreement table (rows indexed by gsv_pano_id) to an Excel file
output_file = os.path.join(export_sub_folder, 'children_ratings_agreement_table.xlsx')
agreement_table_children.to_excel(output_file)

## Demographics

In [None]:
# empty strings become missing values; the nullable 'Int64' dtype keeps them as <NA>
# while all other ages become integers
perceptions['Age_Prolific'] = (
    perceptions['Age_Prolific']
    .replace({'': np.nan})
    .astype(float)
    .astype('Int64')
)

In [None]:
if print_and_plot:
    # participants without a known age
    age_missing = perceptions['Age_Prolific'].isna()
    print(perceptions.loc[age_missing, 'prolific_id'])

In [None]:
if print_and_plot:
    fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20,4))

    # one panel per demographic: (column, bar colour, title, extra hist options)
    panels = [
        ('pre_gender', '#beaed4', 'Participant genders', {'xrot': 90}),
        ('Age_Prolific', '#fdc086', 'Participant ages', {}),
        ('pre_country', '#7fc97f', 'Participant countries of residence', {'xrot': 90}),
    ]
    for ax, (column, color, title, hist_options) in zip(axs, panels):
        values = perceptions[column]
        # one bin per distinct value
        values.hist(ax=ax, color=color, bins=len(values.unique()), **hist_options)
        ax.set_title(title)

    plt.show()

In [None]:
if print_and_plot:
    # count and percentage of participants per gender
    n_participants = len(perceptions)
    for gender in perceptions.pre_gender.unique():
        n_gender = len(perceptions[perceptions.pre_gender==gender])
        print('{}, n={}, {}%'.format(gender, n_gender, round(100*n_gender/n_participants, 2)))

In [None]:
if print_and_plot:
    # count and percentage of participants per self-reported age value
    n_participants = len(perceptions)
    for age in perceptions.pre_age.unique():
        n_age = len(perceptions[perceptions.pre_age==age])
        print('{}, n={}, {}%'.format(age, n_age, round(100*n_age/n_participants, 2)))

In [None]:
if print_and_plot:
    # Number of distinct countries of residence. A bare expression inside an `if` block
    # is not displayed by the notebook, so the value has to be printed explicitly.
    print(len(perceptions.pre_country.unique()))

In [None]:
# check demographics
if print_and_plot:
    n = len(perceptions)
    # (survey column, message) - each share is the percentage of 'Yes' answers
    demographic_checks = [
        ('post_city', '{}% of participants live in a city'),
        ('post_children', '{}% of participants have children'),
        ('post_profession_built', '{}% of participants are built environment professional'),
        ('post_profession_health', '{}% of participants are health professional'),
    ]
    for column, message in demographic_checks:
        n_yes = len(perceptions[perceptions[column]=='Yes'])
        print(message.format(round(100*n_yes/n, 2)))

In [None]:
if print_and_plot:
    n = len(perceptions)
    # (survey column, answer that is counted, message)
    survey_checks = [
        ('survey_questions_clear', 'Yes', '{}% of participants found questions clear'),
        ('survey_questions_panned', 'Yes', '{}% of participants panned panoramas around'),
        ('survey_questions_walked', 'No', '{}% of participants did not walk in the panoramas'),
        ('survey_questions_knewplaces', 'No', '{}% of participants did not know places from personal experience'),
    ]
    for column, counted_answer, message in survey_checks:
        n_matching = len(perceptions[perceptions[column]==counted_answer])
        print(message.format(round(100*n_matching/n, 2)))

## Ratings

In [None]:
if print_and_plot:
    # 140 places per city (excluding the 3 reference-locations per city)
    n_total = 140 * len(place_names)
    # number of distinct panoramas with at least one activity rating
    n_rated = activity_perceptions['gsv_pano_id'].nunique()
    print('{}/{} panoramas were not rated on activity by anyone'.format(n_total-n_rated, n_total))

In [None]:
if print_and_plot:
    # number of activity ratings per panorama, summarised
    ratings_per_location = activity_perceptions.groupby('gsv_pano_id')['prolific_id'].count()
    print('Activity-ratings per location:\nMean: {}\nMedian: {}\nMin: {}\nMax: {}'.format(
        round(ratings_per_location.mean(), 2),
        round(ratings_per_location.median(), 2),
        round(ratings_per_location.min(), 2),
        round(ratings_per_location.max(), 2)))

In [None]:
if print_and_plot:
    # expected number of panoramas: 140 per city
    n_total = 140 * len(place_names)
    # number of distinct panoramas with at least one greenness rating
    n_rated = greenness_perceptions['gsv_pano_id'].nunique()
    print('{}/{} panoramas were not rated on greenness by anyone'.format(n_total-n_rated, n_total))

In [None]:
if print_and_plot:
    # number of greenness ratings per panorama, summarised
    ratings_per_location = greenness_perceptions.groupby('gsv_pano_id')['prolific_id'].count()
    print('Greenness-ratings per location:\nMean: {}\nMedian: {}\nMin: {}\nMax: {}'.format(
        round(ratings_per_location.mean(), 2),
        round(ratings_per_location.median(), 2),
        round(ratings_per_location.min(), 2),
        round(ratings_per_location.max(), 2)))

In [None]:
if print_and_plot:
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20,5), sharey=True)

    # histogram of rows per iframe, one bin per distinct iframe
    panels = [
        (greenness_perceptions, 'Greenness rating'),
        (activity_perceptions, 'Physical activity rating'),
    ]
    for ax, (frame, title) in zip(axs, panels):
        frame.iframe.hist(ax=ax, bins=len(frame.iframe.unique()))
        ax.set_title(title)
        # hide the x axis (the tick labels would be the iframe values)
        ax.get_xaxis().set_visible(False)

    plt.show()

In [None]:
if print_and_plot:
    # One histogram per rating type, with the overall median in the title.
    fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(20,5), sharey=True)

    # bin edges centred on the integer rating values 0..4
    bins = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]

    # the greenness question uses a different answer scale than the activity questions
    greenness_labels = ['not at all (0)', 'a little (1)', 'neutral (2)', 'fairly (3)', 'very (4)']
    activity_labels = ['never (0)', 'rarely (1)', 'sometimes (2)', 'often (3)', 'always (4)']

    panels = [
        (greenness_perceptions.greenness_rating_num, 'Greenness rating', greenness_labels),
        (activity_perceptions.physical_rating_num, 'Physical activity rating', activity_labels),
        (activity_perceptions.social_rating_num, 'Social activity rating', activity_labels),
        (activity_perceptions.relax_rating_num, 'Relaxation activity rating', activity_labels),
        (activity_perceptions.commute_rating_num, 'Commuting activity rating', activity_labels),
        (activity_perceptions.children_rating_num, 'Children\'s activity rating', activity_labels),
    ]

    for ax, (values, title, labels) in zip(axs, panels):
        # fix the tick positions first so the text labels below line up with the ratings
        ax.xaxis.set_ticks([0, 1, 2, 3, 4])
        values.hist(ax=ax, bins=bins, rwidth=0.9, xrot=90)
        # every median is rounded to the same precision
        ax.set_title('{}\nMedian: {}'.format(title, round(values.median(), 1)))
        ax.set_xticklabels(labels)

    plt.show()

In [None]:
# are the ratings normally distributed?
if print_and_plot:
    rating_series = [
        ('greenness', greenness_perceptions.greenness_rating_num),
        ('physical', activity_perceptions.physical_rating_num),
        ('social', activity_perceptions.social_rating_num),
        ('relax', activity_perceptions.relax_rating_num),
        ('commute', activity_perceptions.commute_rating_num),
        ('children', activity_perceptions.children_rating_num),
    ]
    for label, values in rating_series:
        # Compare against a normal distribution with the sample's own mean and standard
        # deviation. Without `args`, kstest uses the standard normal N(0, 1), which
        # ratings on a 0-4 scale fail regardless of their shape.
        # NOTE(review): with parameters estimated from the same sample the p-values are
        # approximate (Lilliefors situation); treat them as indicative.
        result = stats.kstest(values, 'norm', args=(values.mean(), values.std()))
        print('Kolmogorov-Smirnov test, {}: '.format(label), result)
    # NOTE(review): the earlier conclusion was "none normally distributed";
    # re-confirm it against the fitted-parameter results

#### Check ratings of reference-locations

In [None]:
if print_and_plot:

    fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(20,5), sharey=True)

    # bin edges centred on the integer rating values 0..4
    bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]

    # ratings given to the reference locations, one panel per rating type
    panels = [
        (greenness_perceptions_forreference.greenness_rating_num, 'Greenness rating'),
        (activity_perceptions_forreference.physical_rating_num, 'Physical activity rating'),
        (activity_perceptions_forreference.social_rating_num, 'Social activity rating'),
        (activity_perceptions_forreference.relax_rating_num, 'Relaxation activity rating'),
        (activity_perceptions_forreference.commute_rating_num, 'Commuting activity rating'),
        (activity_perceptions_forreference.children_rating_num, 'Children\'s activity rating'),
    ]
    for ax, (values, title) in zip(axs, panels):
        values.hist(ax=ax, bins=bins, rwidth=0.9)
        ax.set_title(title)

    plt.show()

#### Ratings in relation to demographics

In [None]:
if print_and_plot:
    # difference in ratings between male and female?
    women = perceptions[perceptions.pre_gender=='Female'].prolific_id.unique()
    men = perceptions[perceptions.pre_gender=='Male'].prolific_id.unique()

    # greenness ratings: x = female participants, y = male participants
    x = greenness_perceptions[greenness_perceptions.prolific_id.isin(women)]
    y = greenness_perceptions[greenness_perceptions.prolific_id.isin(men)]

    col = 'greenness_rating_num'
    print('Female vs. male, {} - {}'.format(col, stats.mannwhitneyu(x[col], y[col])))

    # activity ratings: same split, one Mann-Whitney U test per rating column
    x = activity_perceptions[activity_perceptions.prolific_id.isin(women)]
    y = activity_perceptions[activity_perceptions.prolific_id.isin(men)]

    for col in ['physical_rating_num', 'social_rating_num', 'relax_rating_num', 'commute_rating_num', 'children_rating_num']:
        print('Female vs. male, {} - {}'.format(col, stats.mannwhitneyu(x[col], y[col])))

In [None]:
# explore in further detail: how do male vs. female participants rate relaxation and commuting?
# (uses x = female and y = male activity ratings from the previous cell)
if print_and_plot:
    # the first summary ends with a blank line to separate the two reports
    detail_reports = [
        ('relax_rating_num', 'male {}: median {}, mean {}\n'),
        ('commute_rating_num', 'male {}: median {}, mean {}'),
    ]
    for col, male_message in detail_reports:
        statistic, pvalue = stats.mannwhitneyu(x[col], y[col])
        # only report medians and means when the difference is significant at the 1% level
        if pvalue > 0.01:
            continue
        print('Significant difference {} between male and female participants'.format(col))
        print('female {}: median {}; mean {}'.format(col, x[col].median(), x[col].mean()))
        print(male_message.format(col, y[col].median(), y[col].mean()))

In [None]:
if print_and_plot:
    # difference in ratings between parents and non-parents?
    has_children = perceptions.post_children
    parents = perceptions[has_children=='Yes'].prolific_id.unique()
    nonparents = perceptions[has_children=='No'].prolific_id.unique()

    # greenness ratings: x = parents, y = non-parents
    col = 'greenness_rating_num'
    x = greenness_perceptions[greenness_perceptions.prolific_id.isin(parents)]
    y = greenness_perceptions[greenness_perceptions.prolific_id.isin(nonparents)]
    greenness_result = stats.mannwhitneyu(x[col], y[col])
    print('Parent vs. non-parent, {} - {}'.format(col, greenness_result))

    # suitability for children's activities, same split
    col = 'children_rating_num'
    x = activity_perceptions[activity_perceptions.prolific_id.isin(parents)]
    y = activity_perceptions[activity_perceptions.prolific_id.isin(nonparents)]
    children_result = stats.mannwhitneyu(x[col], y[col])
    print('Parent vs. non-parent, {} - {}'.format(col, children_result))

In [None]:
if print_and_plot:
    # difference in ratings between city dwellers and non-city dwellers?
    citydweller = perceptions[perceptions.post_city=='Yes'].prolific_id.unique()
    noncitydweller = perceptions[perceptions.post_city=='No'].prolific_id.unique()

    # greenness ratings: x = city dwellers, y = non-city dwellers
    x = greenness_perceptions[greenness_perceptions.prolific_id.isin(citydweller)]
    y = greenness_perceptions[greenness_perceptions.prolific_id.isin(noncitydweller)]

    col = 'greenness_rating_num'
    print('City dweller vs. non-city dweller, {} - {}'.format(col, stats.mannwhitneyu(x[col], y[col])))

    # activity ratings: same split, one Mann-Whitney U test per rating column
    x = activity_perceptions[activity_perceptions.prolific_id.isin(citydweller)]
    y = activity_perceptions[activity_perceptions.prolific_id.isin(noncitydweller)]

    for col in ['physical_rating_num', 'social_rating_num', 'relax_rating_num', 'commute_rating_num', 'children_rating_num']:
        print('City dweller vs. non-city dweller, {} - {}'.format(col, stats.mannwhitneyu(x[col], y[col])))

In [None]:
# explore in further detail: how do city vs. non-city dwelling participants rate
# physical activity and commuting?
# (uses x = city dweller and y = non-city dweller activity ratings from the previous cell)
if print_and_plot:
    # the first summary ends with a blank line to separate the two reports
    detail_reports = [
        ('physical_rating_num', 'non-city dweller {}: median {}, mean {}\n'),
        ('commute_rating_num', 'non-city dweller {}: median {}, mean {}'),
    ]
    for col, noncity_message in detail_reports:
        statistic, pvalue = stats.mannwhitneyu(x[col], y[col])
        # only report medians and means when the difference is significant at the 1% level
        if pvalue > 0.01:
            continue
        print('Significant difference {} between city and non-city dwelling participants'.format(col))
        print('city dweller {}: median {}; mean {}'.format(col, x[col].median(), x[col].mean()))
        print(noncity_message.format(col, y[col].median(), y[col].mean()))

In [None]:
if print_and_plot:
    # difference in ratings between age groups
    age = perceptions.Age_Prolific
    age_a = perceptions[age<30].prolific_id.unique()
    age_b = perceptions[(age>=30) & (age<40)].prolific_id.unique()
    age_c = perceptions[(age>=40) & (age<50)].prolific_id.unique()
    age_d = perceptions[(age>=50) & (age<60)].prolific_id.unique()
    age_e = perceptions[age>=60].prolific_id.unique()

    # greenness ratings of the participants in each age group
    a, b, c, d, e = (
        greenness_perceptions[greenness_perceptions.prolific_id.isin(group_ids)]
        for group_ids in (age_a, age_b, age_c, age_d, age_e)
    )

    col = 'greenness_rating_num'
    name_cols = ['18-29', '30-39', '40-49', '50-59', '60+']
    data_cols = [group[col] for group in (a, b, c, d, e)]

    # overall test across all five groups
    print('Various age groups, {} - {}'.format(col, stats.kruskal(*data_cols)))

    # pairwise Mann-Whitney U statistics as a lower-triangular table:
    # '*' marks p <= 0.05, '**' marks p <= 0.01, 'X' marks the diagonal
    headers = [''] + name_cols
    rows = []
    for position, (name_a, data_a) in enumerate(zip(name_cols, data_cols)):
        row = [name_a]
        # compare only with the groups listed before the current one
        for data_b in data_cols[:position]:
            statistic, pvalue = stats.mannwhitneyu(data_a, data_b)
            statistic = round(statistic, 3)
            if pvalue <= 0.05:
                statistic = str(statistic) + ('**' if pvalue <= 0.01 else '*')
            row.append(statistic)
        row.append('X')
        rows.append(row)
    print(tabulate(rows, headers=headers))

## Open answers

#### Random sample check of some open answers: quality and sensibility in relation to iframe

In [None]:
# For the pilot, GPTZero checks of 10 randomly selected answers each resulted in:
# 'your text is likely to be written entirely by a human'

In [None]:
# open-answer columns from which print_random_reason_iframe() picks one at random
cols = ['physical_reason', 'social_reason', 'relax_reason', 'commute_reason', 'children_reason']

In [None]:
def print_random_reason_iframe():
    """Print one randomly chosen open answer from the activity ratings.

    Picks a random column from the global ``cols`` and a random row of the global
    ``activity_perceptions`` that has a real answer in that column, then prints the
    column name, the answer and the row's ``iframe`` value.

    Both random draws use the ``random`` module, so they follow the seed set with
    ``random.seed`` (pandas' ``.sample()`` would not). Rows without an answer are
    skipped, whether stored as a missing value or as the string 'nan'.
    """
    col = random.choice(cols)
    answers = activity_perceptions[col]
    candidates = activity_perceptions[answers.notna() & (answers != 'nan')]

    # nothing to show: report it instead of failing on an empty selection
    if candidates.empty:
        print('No open answers available for {}'.format(col))
        return

    one_entry = candidates.iloc[random.randrange(len(candidates))]

    print(col)
    print(one_entry[col])
    print(one_entry['iframe'])

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
# from here on, the random spot checks draw from the greenness open answers
cols = ['greenness_reason']

In [None]:
def print_random_reason_iframe():
    """Print one randomly chosen open answer from the greenness ratings.

    Picks a random column from the global ``cols`` and a random row of the global
    ``greenness_perceptions`` that has a real answer in that column, then prints the
    column name, the answer and the row's ``iframe`` value.

    Both random draws use the ``random`` module, so they follow the seed set with
    ``random.seed`` (pandas' ``.sample()`` would not). Rows without an answer are
    skipped, whether stored as a missing value or as the string 'nan'.
    """
    col = random.choice(cols)
    answers = greenness_perceptions[col]
    candidates = greenness_perceptions[answers.notna() & (answers != 'nan')]

    # nothing to show: report it instead of failing on an empty selection
    if candidates.empty:
        print('No open answers available for {}'.format(col))
        return

    one_entry = candidates.iloc[random.randrange(len(candidates))]

    print(col)
    print(one_entry[col])
    print(one_entry['iframe'])

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

In [None]:
if print_and_plot:
    # manual spot check: print one random open answer together with its iframe
    print_random_reason_iframe()

#### All open answers sensibility check

In [None]:
# maximum number of open answers printed per question in the cells below
n_max = 20

In [None]:
col = 'physical_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in activity_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))

In [None]:
col = 'social_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in activity_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))

In [None]:
col = 'relax_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in activity_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))

In [None]:
col = 'commute_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in activity_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))

In [None]:
col = 'children_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in activity_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))

In [None]:
col = 'greenness_reason'
if print_and_plot:
    # Keep real text answers only: missing values (None/NaN) have no `.replace()` and
    # would crash the loop, and the string 'nan' is the placeholder for "no answer".
    answers = [answer for answer in greenness_perceptions[col]
               if isinstance(answer, str) and answer != 'nan']
    # print at most n_max answers, each flattened onto a single line
    for answer in answers[:n_max]:
        print(answer.replace('\n', ''))