# Description

This script takes the visual change dataset and provides the f1-scores for different classifiers estimating visual change on various subsets of the dataset.

### Imports

In [None]:
# Modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
import os.path
from collections import defaultdict

# Custom modules
from classifier import Classifier

# Settings

In [None]:
# Settings
# Observations whose overlap width or height does not exceed this extent are withdrawn
min_obs_extent = 32
# Feature used as baseline, e.g., 'n_grams_jaccard' (must not be dropped by features_drop or features_filter)
baseline_feature_name = 'pixel_diff_count_bgr'

# Features that are always dropped
features_drop = [
    'bag_of_words_vocabulary_size',
    'optical_flow_angle_min',
    'optical_flow_angle_max',
    'optical_flow_magnitude_min',
]

# Features to filter for; not applied if empty (all features but dropped ones are then considered).
# Uncomment entries to restrict the considered features.
features_filter = [
    #'edge_change_fraction',
    #'mssim_b',
    #'mssim_g',
    #'mssim_r',
    #'pixel_diff_acc_b',
    #'pixel_diff_acc_bgr',
    #'pixel_diff_acc_g',
    #'pixel_diff_acc_gray',
    #'pixel_diff_acc_hue',
    #'pixel_diff_acc_lightness',
    #'pixel_diff_acc_r',
    #'pixel_diff_acc_saturation',
    #'pixel_diff_count_b',
    #'pixel_diff_count_bgr',
    #'pixel_diff_count_g',
    #'pixel_diff_count_gray',
    #'pixel_diff_count_hue',
    #'pixel_diff_count_lightness',
    #'pixel_diff_count_r',
    #'pixel_diff_count_saturation',
    #'psnr',
    #'sift_match',
    #'sift_match_0',
    #'sift_match_16',
    #'sift_match_256',
    #'sift_match_4',
    #'sift_match_512',
    #'sift_match_64',
    #'sift_match_distance_max',
    #'sift_match_distance_mean',
    #'sift_match_distance_min',
    #'sift_match_distance_stddev',
    #'sift_match_spatial'
]

# Defines
dataset_visual_change_dir = r'C:/StimuliDiscoveryData/Dataset_visual_change' # SET ME!
participants = ['p' + str(number) for number in range(1, 5)]

# Categories, each mapping to the sites that belong to it
categories = {
    'shopping': ['walmart', 'amazon', 'steam'],
    'news': ['reddit', 'cnn', 'guardian'],
    'health': ['nih', 'webmd', 'mayo'],
    'cars': ['gm', 'nissan', 'kia'],
}

# Per-category aliases (the very same list objects as stored in the dictionary)
shopping = categories['shopping']
news = categories['news']
health = categories['health']
cars = categories['cars']

# Session

In [None]:
# Session class holds one site visit by a participant
class Session:
    """One visit of a site by a participant, loaded from the visual change dataset.

    Members:
        p, s: participant and site the session belongs to.
        f_df: features (after dropping / filtering columns and too small observations).
        mf_df: meta data of the features (same rows as f_df).
        l1_df: labels of the first labeling (same rows as f_df).
        l2_df: labels of an additional labeling by an external person; an empty
            dataframe if no such labeling exists on disk.
        m_df: meta data of the session.
        skipped: count of observations dropped because of a too small overlap extent.
    """

    # Constructor
    def __init__(self, participant, site):
        """Load all files of the session from `dataset_visual_change_dir`."""

        # Store some members of general interest
        self.p = participant
        self.s = site

        # All files of one session share this path prefix
        prefix = dataset_visual_change_dir + '/' + participant + '/' + site

        # Load dataset (header of features dataset has extra comma)
        self.f_df = pd.read_csv(prefix + '_features.csv')
        self.mf_df = pd.read_csv(prefix + '_features_meta.csv')
        self.l1_df = pd.read_csv(prefix + '_labels-l1.csv', header=None, names=['label'])
        self.m_df = pd.read_csv(prefix + '_meta.csv')

        # Load additional labeling by external person if available. An empty dataframe
        # *instance* marks the absence, so that `.empty` is a real boolean
        l2_path = prefix + '_labels-l2.csv'
        if os.path.exists(l2_path):
            self.l2_df = pd.read_csv(l2_path, header=None, names=['label'])
        else:
            self.l2_df = pd.DataFrame()

        # Drop columns of non-interest
        self.f_df = self.f_df.drop(features_drop, axis=1)

        # Filter for columns of interest
        if len(features_filter) > 0:
            self.f_df = self.f_df.filter(items=features_filter, axis=1)

        # Drop observations whose overlap is not larger than a certain extent in width or height
        too_small = (self.mf_df['overlap_width'] <= min_obs_extent) | (self.mf_df['overlap_height'] <= min_obs_extent)
        drop_idxs = self.mf_df[too_small].index
        self.f_df = self.f_df.drop(drop_idxs, axis=0)
        self.mf_df = self.mf_df.drop(drop_idxs, axis=0)
        self.l1_df = self.l1_df.drop(drop_idxs, axis=0)
        if not self.l2_df.empty:
            self.l2_df = self.l2_df.drop(drop_idxs, axis=0)
        self.skipped = len(drop_idxs)

    # Compare labelings
    def compare_labeling(self):
        """Print agreement of the first and the additional labeling.

        Returns True if an additional labeling exists and has been reported, False otherwise.
        """

        # Nothing to compare without additional labeling
        if self.l2_df.empty:
            return False

        report = classification_report(self.l1_df.values, self.l2_df.values, output_dict=True)
        print('-> L1 VS L2 for ' + self.p + ', ' + self.s)
        print('Cohen Kappa Score: ' + str(cohen_kappa_score(self.l1_df.values, self.l2_df.values))) # 70% is good

        # Classes (look up both before printing anything about them)
        per_class = [('Class 0', report['0.0']), ('Class 1', report['1.0'])]

        # Values
        for title, metrics in per_class:
            print(f"{title}: "
                  f"Precision: {metrics['precision']:.2f}, "
                  f"Recall: {metrics['recall']:.2f}, "
                  f"F1-Score: {metrics['f1-score']:.2f}")

        # Return information about success
        return True

# General

In [None]:
# Import sessions of each category (category by category, site by site, participant by participant)
sessions = [
    Session(p, site)
    for sites in categories.values()
    for site in sites
    for p in participants
]

# Aggregate counts and durations over all sessions
labeled_obs_count = 0 # observations for which features are computed and that are not skipped
skipped_pixel_perfect_obs_count = 0
skipped_overlap_extent_count = 0
total_obs_count = 0
diff_count = 0 # labels with a value other than zero
screencast_seconds = 0.0
frame_count = 0
for session in sessions:
    meta = session.m_df # only the first row of each meta column is read
    screencast_seconds += meta['screencast_seconds'].values[0]
    frame_count += meta['screencast_frame_total_count'].values[0]
    total_obs_count += meta['observation_total_count'].values[0]
    skipped_pixel_perfect_obs_count += meta['observation_count_skipped'].values[0]
    skipped_overlap_extent_count += session.skipped
    labeled_obs_count += len(session.f_df.index)
    diff_count += np.count_nonzero(session.l1_df.values)

# Print the aggregated values
for caption, value in [
        ('Screencast seconds: ', screencast_seconds),
        ('Screencast minutes: ', screencast_seconds/60),
        ('Count of labeled observations: ', labeled_obs_count),
        ('Count of skipped observations (because pixel-perfect similar): ', skipped_pixel_perfect_obs_count),
        ('Count of skipped observations (because overlap too small): ', skipped_overlap_extent_count),
        ('Count of total observations: ', total_obs_count),
        ('Count of observations labeled as visual different: ', diff_count),
        ('Count of frames in screen casts: ', frame_count)]:
    print(caption + str(value))

# Compare the labelings of sessions that have been labeled twice
print('\n### Compare different labelings\n')
for session in sessions:
    if session.compare_labeling():
        print()

# Features Computation Times

In [None]:
# Import sessions of each category
sessions = []
for category, sites in categories.items():
    for site in sites:
        for p in participants:
            sessions.append(Session(p, site))

# Printed label -> column of the features meta data that holds the computation time
timing_columns = {
    'Sift Match:   ': 'sift_match_features [ms]',
    'Pixel Diff:   ': 'pixel_diff_features [ms]',
    'Optical Flow: ': 'optical_flow_features [ms]',
    'OCR:          ': 'ocr_descriptors [ms]',
    'Histogram:    ': 'histogram_descriptors [ms]',
    'Edge:         ': 'edge_change_ratio_features [ms]',
    'PSNR:         ': 'psnr_features [ms]',
    'MSSIM:        ': 'mssim_features [ms]',
    'N-Grams:      ': 'n_grams_features [ms]',
}

# Collect all entries across sessions and flatten them per kind of feature
timings = {}
for label, column in timing_columns.items():
    timings[label] = np.concatenate([session.mf_df[column].values for session in sessions]).ravel()

# Print information (mean and standard deviation per kind of feature)
print('### Feature Computation Timings in Milliseconds')
timings_mean_sum = 0.0 # deliberately not named `sum`, which would shadow the builtin for later cells
for label, values in timings.items():
    timings_mean = np.mean(values)
    timings_mean_sum += timings_mean
    print(label + f'{timings_mean:.2f}' + '±' + f'{np.std(values):.2f}')

# Print sum of the mean computation times
print('Sum: '+ f'{timings_mean_sum:.2f}')

# Learner

In [None]:
# Base class of learners
class Learner:
    """Base class of learners that train classifiers on sessions and report about the predictions.

    Members:
        importances: dictionary with feature name as key and list of importance values as value.
        avg_class0_f1, avg_class1_f1: f1-scores of the 'forest' predictions per class,
            averaged over the predictions given to `analyze_predictions`.
        visual_changes_rf: per call of `compute`, the count of nonzero 'forest' predictions
            on the data of all sessions.
        classifier_metrics: nested dictionary classifier -> metric -> [mean, stddev];
            only available after `report` or `report_paper` has been called.
    """

    # Constructor
    def __init__(self):
        self.importances = defaultdict(list) # this is a dictionary with feature_name as key and list of importance values as value
        self.avg_class0_f1 = 0.0
        self.avg_class1_f1 = 0.0
        self.visual_changes_rf = [] # visual changes as computed by random forest classifier

    # Replace infinity datapoints (encoded as -1) of one column with the maximum of the training data
    @staticmethod
    def _replace_encoded_infinity(f_train_df, frames, column='optical_flow_magnitude_max'):
        """Replace -1 in `column` of every dataframe in `frames` (in place).

        The replacement value is the maximum of `column` in `f_train_df`, estimated before
        anything is replaced. Only `column` is written; all other columns stay untouched.
        Nothing happens if `f_train_df` does not contain `column`.
        """
        if column not in f_train_df.columns:
            return
        max_value = f_train_df[column].max() # maximum from complete training data
        for frame in frames:
            frame.loc[frame[column] == -1, column] = max_value

    # Collect metrics per classifier from classification reports
    @staticmethod
    def _summarize(reports_by_classifier, metric_names):
        """Return classifier -> '<metric>_<class>' -> [mean, stddev] over the reports of a classifier.

        Classes are looked up in the reports under the keys '0.0' and '1.0'.
        """
        summary = {}
        for classifier, classifier_reports in reports_by_classifier.items():
            metrics = {}
            for class_suffix, class_key in (('0', '0.0'), ('1', '1.0')):
                for metric_name in metric_names:
                    values = [report[class_key][metric_name] for report in classifier_reports]
                    metrics[metric_name + '_' + class_suffix] = [np.mean(values), np.std(values)]
            summary[classifier] = metrics
        return summary

    # Compute learning on sessions. Returns dictionary with predictions from different classifiers
    def compute(self, sessions, idxs_test):
        """Train on all sessions but the ones at `idxs_test` and predict labels of those.

        Parameters:
            sessions: list of sessions, providing features as `f_df` and labels as `l1_df`.
            idxs_test: indices into `sessions` that make up the test set.

        Returns the dictionary provided by `Classifier.apply` for the test data, extended by
        the ground truth under the key 'truth'. As a side effect, the count of nonzero 'forest'
        predictions on the data of all sessions is appended to `self.visual_changes_rf`.
        """

        # Compose training set (sorted for a deterministic order of the training observations)
        idxs_train = sorted(set(range(len(sessions))) - set(idxs_test))
        f_train_df = pd.concat([sessions[idx].f_df for idx in idxs_train], ignore_index=True)
        l1_train_df = pd.concat([sessions[idx].l1_df for idx in idxs_train], ignore_index=True)

        # Compose test set
        f_test_df = pd.concat([sessions[idx].f_df for idx in idxs_test], ignore_index=True)
        l1_test_df = pd.concat([sessions[idx].l1_df for idx in idxs_test], ignore_index=True)

        # Compose the entire input data (which includes training data)
        f_complete_df = pd.concat([session.f_df for session in sessions], ignore_index=True)

        # Replace infinity datapoints in 'optical_flow_magnitude_max' with maximum value (encoded as -1).
        # The maximum is estimated on the complete training set and applied to every dataframe
        # that is handed to the classifier, so that all of them are preprocessed alike
        self._replace_encoded_infinity(f_train_df, [f_train_df, f_test_df, f_complete_df])

        # Convert pandas dataframe to numpy array
        X_train = f_train_df.values
        y_train = l1_train_df.values.flatten()
        X_test = f_test_df.values
        y_test = l1_test_df.values.flatten()

        # Use machine to predict labels of test data
        classifier = Classifier()
        idx_baseline = f_train_df.columns.get_loc(baseline_feature_name)
        pred = classifier.apply(X_train, y_train, X_test, idx_baseline)
        pred['truth'] = y_test # adding ground truth to the prediction

        # Also apply the classifier on the entire input data as test data (which includes training data)
        # This allows us to estimate the number of computed shot boundaries across the entire input data and compare
        # to related work in scene segmentation
        pred_complete = classifier.apply(X_train, y_train, f_complete_df.values, idx_baseline)
        self.visual_changes_rf.append(np.count_nonzero(pred_complete['forest']))

        # Store predictions
        # TODO: split computed labels by test sessions
        # TODO: provide kind of test as method parameter (e.g., same-site)
        # pd.DataFrame(pred['forest']).to_csv(dataset_dir + participant + '/' + site + '_l_forest.csv')

        # Return predictions
        return pred

    # Compute some members
    def analyze_predictions(self, preds, feature_names):
        """Collect feature importances and average the f1-scores of the 'forest' predictions.

        Parameters:
            preds: list of prediction dictionaries as returned by `compute`.
            feature_names: names of the features, in the order of the importance values.
        """

        # Go over predictions
        for pred in preds:

            # Store feature importance
            for i, importance in enumerate(pred['importance']):
                self.importances[feature_names[i]].append(importance)

            # Store f1-score
            score = f1_score(pred['truth'], pred['forest'], average=None) # f1-scores for class 0 and class 1
            self.avg_class0_f1 += score[0]
            self.avg_class1_f1 += score[1]

        # Average the scores
        self.avg_class0_f1 /= len(preds)
        self.avg_class1_f1 /= len(preds)

    # Report about learning
    def report(self, preds, feature_names):
        """Print mean and stddev of precision, recall and f1-score per classifier and class.

        Next to the metrics, the features are printed ranked by their mean importance (at most
        15 of them). The metrics are also stored in `self.classifier_metrics`.
        """

        # Report metrics by collecting all reports across idxs_test, per classifier
        reports_by_classifier = {}
        for classifier in ['logreg', 'svc', 'forest', 'baseline']:
            reports_by_classifier[classifier] = [
                classification_report(pred['truth'], pred[classifier], output_dict=True) for pred in preds]

        # Collect the importance values per feature (by position) across predictions
        importances = []
        for pred in preds:
            for i, importance in enumerate(pred['importance']):
                if i >= len(importances):
                    importances.append([])
                importances[i].append(importance)

        # Comprehend importances and sort list of feature names accordingly
        importances = [np.mean(x) for x in importances]
        important_features = sorted(zip(importances, feature_names), reverse=True) # the higher, the more important

        # Collect all metrics in a nested dictionary classifier -> metrics
        self.classifier_metrics = self._summarize(reports_by_classifier, ['precision', 'recall', 'f1-score'])

        # Print report of mean and stddev of metrics
        metric_strings = ['precision_0', 'recall_0', 'f1-score_0', 'precision_1', 'recall_1', 'f1-score_1']

        # Print header
        print(''.ljust(14), end='')
        for classifier in self.classifier_metrics.keys():
            print(classifier.ljust(11), end='')
        print('Ranked Important Features (by mean importance)')

        # Print values, one metric and one important feature per row
        for row, metric_string in enumerate(metric_strings):
            print(metric_string.rjust(12) + '  ', end='')
            for metrics in self.classifier_metrics.values():
                metric = metrics[metric_string]
                print(f'{metric[0]:.2f}' + '±' + f'{metric[1]:.2f}', end='  ')
            if row < len(important_features):
                importance, feature_name = important_features[row]
                print(feature_name.ljust(32) + str(importance), end='')
            print()

        # Print some more important features (up to 15 in total), indented below the feature column
        for importance, feature_name in important_features[len(metric_strings):15]:
            print('                                                          ', end='')
            print(feature_name.ljust(32) + str(importance))

    # Report about learning for paper (latexish code)
    def report_paper(self, preds, feature_names):
        """Print mean and stddev of the f1-scores per classifier and class as latexish table cells.

        Note that `self.classifier_metrics` then holds the f1-scores only. The parameter
        `feature_names` is not used; it mirrors the signature of `report`.
        """

        # Report metrics by collecting all reports across idxs_test, per classifier
        reports_by_classifier = {}
        for classifier in ['svc', 'forest', 'baseline']:
            reports_by_classifier[classifier] = [
                classification_report(pred['truth'], pred[classifier], output_dict=True) for pred in preds]

        # Collect all metrics in a nested dictionary classifier -> metrics
        self.classifier_metrics = self._summarize(reports_by_classifier, ['f1-score'])

        # Print report of mean and stddev of metrics
        metric_strings = ['f1-score_0', 'f1-score_1']

        # Print header
        print(''.ljust(14), end='')
        for classifier in self.classifier_metrics.keys():
            print(classifier.ljust(11), end='')
        print()

        # Print values (in percent, rounded to integers)
        for metric_string in metric_strings:
            print(metric_string.rjust(12) + '  ', end='')
            for metrics in self.classifier_metrics.values():
                metric = metrics[metric_string]
                print('$' + str(int(round(100*metric[0]))).zfill(2)  + '\\pm' + str(int(round(100*metric[1]))).zfill(2), end='$ & ')
            print()

## Specific Learners

In [None]:
# One-Session Learner
class OneSessionLearner(Learner):
    """Learner that trains on one session of a site and tests on the other sessions of that site."""

    # Constructor
    def __init__(self, site):

        # Super
        super().__init__()

        # Import the session of each participant on the site
        self.sessions = [Session(p, site) for p in participants]

        # One fold per session: that session is used for training, all the others for test
        session_count = len(self.sessions)
        self.preds = [] # list of predictions
        for idx_train in range(session_count):
            idxs_test = [idx for idx in range(session_count) if idx != idx_train]
            self.preds.append(self.compute(self.sessions, idxs_test))

        # Analyze predictions
        self.analyze_predictions(self.preds, self._feature_names())

    # Names of the features, taken from the first session
    def _feature_names(self):
        return list(self.sessions[0].f_df.columns.values)

    # Print report
    def print_report(self):

        # Report about predictions (self.report_paper takes the same arguments and prints latexish code instead)
        self.report(self.preds, self._feature_names())

In [None]:
# Same-Site Learner
class SameSiteLearner(Learner):
    """Learner that tests on one session of a site and trains on the other sessions of that site."""

    # Constructor
    def __init__(self, site):

        # Super
        super().__init__()

        # Import the session of each participant on the site
        self.sessions = [Session(p, site) for p in participants]

        # One fold per session: that session is left out of training and used for test
        self.preds = [] # list of predictions
        for idx_test, _ in enumerate(self.sessions):
            self.preds.append(self.compute(self.sessions, [idx_test]))

        # Analyze predictions
        self.analyze_predictions(self.preds, self._feature_names())

    # Names of the features, taken from the first session
    def _feature_names(self):
        return list(self.sessions[0].f_df.columns.values)

    # Print report
    def print_report(self):

        # Report about predictions (self.report_paper takes the same arguments and prints latexish code instead)
        self.report(self.preds, self._feature_names())

In [None]:
# Same-Category Learner
class SameCategoryLearner(Learner):
    """Learner that tests on all sessions of one site and trains on the sessions of the other given sites."""

    # Constructor
    def __init__(self, sites):

        # Super
        super().__init__()

        # Import sessions site by site, thus, the sessions of one site are adjacent in the list
        self.sessions = [Session(p, site) for site in sites for p in participants]

        # One fold per site: the block of sessions of that site is left out of training and used for test
        sessions_per_site = len(participants)
        self.preds = [] # list of predictions
        for site_idx in range(len(sites)):
            first = site_idx*sessions_per_site
            idxs_test = range(first, first + sessions_per_site)
            self.preds.append(self.compute(self.sessions, idxs_test))

        # Analyze predictions
        self.analyze_predictions(self.preds, self._feature_names())

    # Names of the features, taken from the first session
    def _feature_names(self):
        return list(self.sessions[0].f_df.columns.values)

    # Print report
    def print_report(self):

        # Report about predictions (self.report_paper takes the same arguments and prints latexish code instead)
        self.report(self.preds, self._feature_names())

In [None]:
# Across-Category Learner
class AcrossCategoryLearner(Learner):
    """Learner that tests on all sessions of one category and trains on the sessions of the other categories."""

    # Constructor
    def __init__(self, categories, test_category_name):

        # Super
        super().__init__()

        # Import sessions of each category and remember the positions of those from the test category
        self.sessions = []
        idxs_test = []
        for category, sites in categories.items():
            is_test_category = (test_category_name == category)
            for site in sites:
                for p in participants:
                    session = Session(p, site)
                    if is_test_category:
                        idxs_test.append(len(self.sessions)) # position the session gets in the list
                    self.sessions.append(session)

        # Perform learning (no stddev to expect as there is a single fold only)
        self.preds = [self.compute(self.sessions, idxs_test)] # list of predictions

        # Analyze predictions
        self.analyze_predictions(self.preds, self._feature_names())

    # Names of the features, taken from the first session
    def _feature_names(self):
        return list(self.sessions[0].f_df.columns.values)

    # Print report
    def print_report(self):

        # Report about predictions (self.report_paper takes the same arguments and prints latexish code instead)
        self.report(self.preds, self._feature_names())

# Application

In [None]:
def print_ranked_importances(raw_importances):
    """Print the features ranked by mean importance, one line per feature.

    `raw_importances` maps a feature name to the list of its importance values. Each line
    shows mean and stddev of those values, multiplied by 100 (contribution in percentage).
    """
    # https://stackoverflow.com/questions/15810339/how-are-feature-importances-in-randomforestclassifier-determined
    ranked = sorted(
        ([np.mean(values), np.std(values), name] for name, values in raw_importances.items()),
        reverse=True) # the higher, the more important
    for mean, std, name in ranked:
        print(f'{name.ljust(32)}: {100*mean:.2f}±{100*std:.2f}')
    
# One-Session learner
print('One-Session-Learning (4-fold cross valiation, one session as training, three sessions as test)')
one_session_importances = defaultdict(list)
one_session_forest_precisions_1 = []
one_session_forest_recalls_1 = []
one_session_avg_visual_changes = []
avg_class0_f1 = 0.0
avg_class1_f1 = 0.0
i = 0 # count of sites that have been learned on
for name, sites in categories.items():
    print()
    print(f'#------------------- {name.center(14)} -------------------#')
    print()
    for site in sites:
        print(f'                  <<<{site.center(14)}>>>                  ')

        # Learn and print detailed report (classifier_metrics are available after the report only)
        one_session_learner = OneSessionLearner(site)
        one_session_learner.print_report()
        i += 1

        # Collect f1-scores
        avg_class0_f1 += one_session_learner.avg_class0_f1
        avg_class1_f1 += one_session_learner.avg_class1_f1

        # Collect precision and recall of the forest (index zero holds the mean, index one the std)
        one_session_forest_metrics = one_session_learner.classifier_metrics['forest']
        one_session_forest_precisions_1.append(one_session_forest_metrics['precision_1'][0])
        one_session_forest_recalls_1.append(one_session_forest_metrics['recall_1'][0])

        # Collect importances
        for feature_name, importances in one_session_learner.importances.items():
            one_session_importances[feature_name].extend(importances)

        # Collect count of visual changes (there is one per fold, thus, take the average across folds)
        one_session_avg_visual_changes.append(np.mean(one_session_learner.visual_changes_rf))
        print()

# Normalize f1-scores by the count of sites and report
avg_class0_f1 = avg_class0_f1 / i
avg_class1_f1 = avg_class1_f1 / i
print('Averaged F1-Score of class 0: ' + str(avg_class0_f1))
print('Averaged F1-Score of class 1: ' + str(avg_class1_f1))
print()

print('--- Ranked Feature Importances (already in percentage) ---')
print_ranked_importances(one_session_importances)

# Precision and recall for visual change, as mean and std across sites
print('--- Overall Precision and Recall (already in percentage) ---')
one_session_forest_precisions_1_mean = np.mean(one_session_forest_precisions_1)
one_session_forest_precisions_1_std = np.std(one_session_forest_precisions_1)
one_session_forest_recalls_1_mean = np.mean(one_session_forest_recalls_1)
one_session_forest_recalls_1_std = np.std(one_session_forest_recalls_1)
print(f'Overall precision: {100*one_session_forest_precisions_1_mean:.2f}±{100*one_session_forest_precisions_1_std:.2f}')
print(f'Overall recall:    {100*one_session_forest_recalls_1_mean:.2f}±{100*one_session_forest_recalls_1_std:.2f}')
print()

print('--- Number of Visual Changes Recognized by Random Forest Classifier (averaged over folds) ---')
print(np.sum(one_session_avg_visual_changes)) # sum over all pages
print()

print('#################################################################')
print()

# Same-Site learner
print('Same-Site-Learning (4-fold cross valiation, three sessions as training, one session as test)')
same_site_importances = defaultdict(list)
same_site_forest_precisions_1 = []
same_site_forest_recalls_1 = []
avg_class0_f1 = 0.0
avg_class1_f1 = 0.0
i = 0 # count of sites that have been learned on
for name, sites in categories.items():
    print()
    print(f'#------------------- {name.center(14)} -------------------#')
    print()
    for site in sites:
        print(f'                  <<<{site.center(14)}>>>                  ')

        # Learn and print detailed report (classifier_metrics are available after the report only)
        same_site_learner = SameSiteLearner(site)
        same_site_learner.print_report()
        i += 1

        # Collect f1-scores
        avg_class0_f1 += same_site_learner.avg_class0_f1
        avg_class1_f1 += same_site_learner.avg_class1_f1

        # Collect precision and recall of the forest (index zero holds the mean, index one the std)
        same_site_forest_metrics = same_site_learner.classifier_metrics['forest']
        same_site_forest_precisions_1.append(same_site_forest_metrics['precision_1'][0])
        same_site_forest_recalls_1.append(same_site_forest_metrics['recall_1'][0])

        # Collect importances
        for feature_name, importances in same_site_learner.importances.items():
            same_site_importances[feature_name].extend(importances)
        print()

# Normalize f1-scores by the count of sites and report
avg_class0_f1 = avg_class0_f1 / i
avg_class1_f1 = avg_class1_f1 / i
print('Averaged F1-Score of class 0: ' + str(avg_class0_f1))
print('Averaged F1-Score of class 1: ' + str(avg_class1_f1))
print()

print('--- Ranked Feature Importances (already in percentage) ---')
print_ranked_importances(same_site_importances)
print()

# Precision and recall for visual change, as mean and std across sites
print('--- Overall Precision and Recall (already in percentage) ---')
same_site_forest_precisions_1_mean = np.mean(same_site_forest_precisions_1)
same_site_forest_precisions_1_std = np.std(same_site_forest_precisions_1)
same_site_forest_recalls_1_mean = np.mean(same_site_forest_recalls_1)
same_site_forest_recalls_1_std = np.std(same_site_forest_recalls_1)
print(f'Overall precision: {100*same_site_forest_precisions_1_mean:.2f}±{100*same_site_forest_precisions_1_std:.2f}')
print(f'Overall recall:    {100*same_site_forest_recalls_1_mean:.2f}±{100*same_site_forest_recalls_1_std:.2f}')
print()

print('#################################################################')
print()

# Same-Category learner
print('Same-Category-Learning (4-fold cross valiation, three sites as training, one site as test)')
same_category_importances = defaultdict(list)
for name, sites in categories.items():
    print()
    print(f'#------------------- {name.center(14)} -------------------#')

    # Learn on the sites of the category and print detailed report
    same_category_learner = SameCategoryLearner(sites)
    same_category_learner.print_report()

    # Collect importances
    for feature_name, importances in same_category_learner.importances.items():
        same_category_importances[feature_name].extend(importances)
print()

# Rank the importances collected across categories
print('--- Ranked Feature Importances (already in percentage) ---')
print_ranked_importances(same_category_importances)

print('#################################################################')
print()
    
# Across-Category learner
print('Across-Category-Learning (three categories as training, one category as test)')
across_category_importances = defaultdict(list)
for name in categories:
    print()
    print(f'#------------------- {name.center(14)} -------------------#')

    # Learn with the named category as test and print detailed report
    across_category_learner = AcrossCategoryLearner(categories, name)
    across_category_learner.print_report()

    # Collect importances
    for feature_name, importances in across_category_learner.importances.items():
        across_category_importances[feature_name].extend(importances)

# Rank the importances collected across test categories
print('--- Ranked Feature Importances (already in percentage) ---')
print_ranked_importances(across_category_importances)