In [1]:
import csv
import random
import time


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


In [3]:
def feet_inches_to_inches(string):
    """Convert a height written like '5ft 10in' into a total number of inches.

    The text before the first 'ft' is read as whole feet. From the text that
    follows it, everything up to the first 'in' is read as whole inches.
    Whitespace around either number is ignored.

    Args:
        string: Height text containing an 'ft' marker, e.g. '5ft 10in'.

    Returns:
        int: 12 * feet + inches.

    Raises:
        IndexError: If `string` contains no 'ft' marker.
        ValueError: If the feet or inches text is not an integer literal.
    """
    pieces = string.split('ft')
    feet_text = pieces[0].strip()
    # Keep only what precedes the 'in' unit in the segment after 'ft'.
    inches_text = pieces[1].strip().split('in')[0].strip()
    return int(feet_text) * 12 + int(inches_text)


In [4]:
def get_day_time_and_group_size(user, location):
    """Return a user's usual day, time and group size for one survey location.

    Args:
        user: Object exposing `<place>_day_of_week`, `<place>_time_of_day` and
            `<place>_group_size` attributes for the requested place.
        location: Survey label of the location. A single trailing period is
            ignored.

    Returns:
        tuple: (day_of_week, time_of_day, group_size) exactly as stored on `user`.

    Raises:
        Exception: If `location` is not one of the known survey labels.
    """
    label = location[:-1] if location.endswith('.') else location  # Remove the period.

    # Survey label -> attribute prefix on the user object. Both university
    # spellings resolve to the same attributes.
    prefix_by_label = {
        "Bar / Nightclub": "bar",
        "Sit-down restaurant": "restaurant",
        "University": "university",
        "School / University": "university",
        "Workplace": "workplace",
        "Community event (block-party, social club, hangout, potluck, etc.)": "community",
        "Cafe / Coffee shop": "cafe",
        "Attraction (museum, concert, movie theater, shopping mall, amusement park, etc.)": "attraction",
        "Outdoor activity (walking/hiking trail, biking, park, neighborhood, dog park, etc.)": "outdoor",
        "Gym": "gym",
    }

    prefix = prefix_by_label.get(label)
    if prefix is None:
        raise Exception("Invalid location.")

    return (
        getattr(user, prefix + "_day_of_week"),
        getattr(user, prefix + "_time_of_day"),
        getattr(user, prefix + "_group_size"),
    )


In [5]:
def clean_clothing_list(clothing_list):
    """Normalise clothing labels so they can be compared case-insensitively.

    Each entry loses at most one trailing period and is lower-cased. The input
    list is not modified; order and length are preserved.

    Args:
        clothing_list: Iterable of clothing label strings.

    Returns:
        list: The normalised labels.
    """
    return [
        (label[:-1] if label.endswith('.') else label).lower()  # Remove the period.
        for label in clothing_list
    ]


In [6]:
def has_clothing(clothing_str, user, location):
    """Tell whether `user` reported wearing `clothing_str` at `location`.

    Args:
        clothing_str: Clothing label. It is compared case-insensitively and a
            single trailing period is ignored.
        user: Object exposing a `<place>_clothing` list of labels for the
            requested place.
        location: Survey label of the location. A single trailing period is
            ignored.

    Returns:
        bool: True if the normalised label is among the user's normalised
        clothing labels for that location.

    Raises:
        Exception: If the clothing label is not a known item, or if the
            location is not a known survey label. Clothing is validated first.
    """
    wanted = clothing_str.lower()
    if wanted.endswith('.'):
        wanted = wanted[:-1]  # Remove the period.

    known_items = (
        'athletic clothes', 'casual clothes', 'trendy clothes', 'formal clothes',
        'designer clothes', 'hat', 'eyeglasses', 'sunglasses', 'necklace',
        'luxury watch', 'rings', 'earrings', 'smart watch',
    )
    if wanted not in known_items:
        raise Exception("Invalid clothing.")

    label = location[:-1] if location.endswith('.') else location  # Remove the period.

    # Survey label -> attribute prefix on the user object. Both university
    # spellings resolve to the same attribute.
    prefix_by_label = {
        "Bar / Nightclub": "bar",
        "Sit-down restaurant": "restaurant",
        "University": "university",
        "School / University": "university",
        "Workplace": "workplace",
        "Community event (block-party, social club, hangout, potluck, etc.)": "community",
        "Cafe / Coffee shop": "cafe",
        "Attraction (museum, concert, movie theater, shopping mall, amusement park, etc.)": "attraction",
        "Outdoor activity (walking/hiking trail, biking, park, neighborhood, dog park, etc.)": "outdoor",
        "Gym": "gym",
    }
    if label not in prefix_by_label:
        raise Exception("Invalid location (when checking clothing).")

    worn = getattr(user, prefix_by_label[label] + "_clothing")
    return wanted in clean_clothing_list(worn)


In [7]:
class SurveyOne:
    """One participant's answers to the first survey.

    The constructor takes one value per answer, in the order of the parameters
    below, and stores each under an attribute of the same name. Multiple-choice
    answers (`hobbies`, `interests`, `frequent_locations`, every `*_clothing`
    and `social_media`) are split on ';' into lists; all other answers are
    stored unchanged.
    """

    def __init__(self, prolific_id, age, gender, height, hair_type, hair_color, has_tattoos,
                 education, is_student, is_in_workforce, industry,
                 hobbies, favorite_hobby, interests, favorite_interest, music_genre, frequent_locations,
                 bar_day_of_week, bar_time_of_day, bar_group_size, bar_clothing,
                 restaurant_day_of_week, restaurant_time_of_day, restaurant_group_size, restaurant_clothing,
                 university_day_of_week, university_time_of_day, university_group_size, university_clothing,
                 workplace_day_of_week, workplace_time_of_day, workplace_group_size, workplace_clothing,
                 community_day_of_week, community_time_of_day, community_group_size, community_clothing,
                 cafe_day_of_week, cafe_time_of_day, cafe_group_size, cafe_clothing,
                 attraction_day_of_week, attraction_time_of_day, attraction_group_size, attraction_clothing,
                 outdoor_day_of_week, outdoor_time_of_day, outdoor_group_size, outdoor_clothing,
                 gym_day_of_week, gym_time_of_day, gym_group_size, gym_clothing,
                 personality, listen_or_speak, social_media, favorite_social_media, music_listen_time):
        # Snapshot of the arguments keyed by parameter name. It is taken before
        # any other local exists, so it holds only `self` and the answers.
        answers = locals()

        def multi(answer):
            # Multiple-choice answers arrive as a single ';'-separated string.
            return answer.split(';')

        # Profile answers that are stored exactly as given.
        for name in ('prolific_id', 'age', 'gender', 'height', 'hair_type', 'hair_color',
                     'has_tattoos', 'education', 'is_student', 'is_in_workforce', 'industry'):
            setattr(self, name, answers[name])

        self.hobbies = multi(hobbies)
        self.favorite_hobby = favorite_hobby
        self.interests = multi(interests)
        self.favorite_interest = favorite_interest
        self.music_genre = music_genre
        self.frequent_locations = multi(frequent_locations)

        # Locations #1-#9, in questionnaire order. Each contributes the same
        # four answers: day of week, time of day, group size and clothing.
        for place in ('bar', 'restaurant', 'university', 'workplace', 'community',
                      'cafe', 'attraction', 'outdoor', 'gym'):
            for field in ('day_of_week', 'time_of_day', 'group_size'):
                name = f'{place}_{field}'
                setattr(self, name, answers[name])
            setattr(self, f'{place}_clothing', multi(answers[f'{place}_clothing']))

        self.personality = personality
        self.listen_or_speak = listen_or_speak
        self.social_media = multi(social_media)
        self.favorite_social_media = favorite_social_media
        self.music_listen_time = music_listen_time


In [8]:
# First-survey responses, kept both in file order (list) and indexed by
# Prolific ID (dict). If an ID appears twice, the later row wins in the dict.
survey_one_results = []
survey_one_results_dict = {}

# newline='' is what the csv module asks for when given a file object, so that
# line breaks inside quoted answers stay part of the field.
with open('survey1_twenty_initial_participants.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row; an empty file yields no results.
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for a blank line; nothing to parse.
        result = SurveyOne(*row)
        survey_one_results.append(result)
        survey_one_results_dict[result.prolific_id] = result


In [9]:
class SurveyTwo:
    """One participant's submission to the second survey.

    Holds the four leading metadata answers plus two lists of per-scenario
    answers. All values are stored exactly as passed in (the lists are not
    copied).
    """

    def __init__(self, timestamp, prolific_id, instructions, consent, decisions, explanations):
        # Submission metadata.
        self.timestamp, self.prolific_id = timestamp, prolific_id
        self.instructions, self.consent = instructions, consent
        # List of scenario decisions and list of scenario explanations.
        self.decisions, self.explanations = decisions, explanations

# Function to read the CSV and create SurveyTwo objects
def read_survey_two_data(csv_file):
    """Parse a second-survey CSV export into SurveyTwo objects.

    The first row is treated as a header and skipped. In every other row the
    first four columns are timestamp, Prolific ID, instructions and consent.
    From column index 4 onward, even-indexed columns are collected as decisions
    and odd-indexed columns as explanations. The last even-indexed column is
    dropped from the decisions (presumably a trailing non-scenario column —
    TODO confirm against the survey export).

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        list: One SurveyTwo per non-blank data row, in file order.

    Raises:
        ValueError: If a non-blank row has fewer than four columns.
    """
    survey_data = []
    # newline='' is what the csv module asks for when given a file object, so
    # that line breaks inside quoted explanations stay part of the field.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; an empty file yields no rows.

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for a blank line.

            timestamp, prolific_id, instructions, consent = row[:4]

            # Slicing never raises on short rows, so no index guard is needed.
            decisions = row[4::2][:-1]
            explanations = row[5::2]

            survey_data.append(SurveyTwo(timestamp, prolific_id, instructions, consent, decisions, explanations))
    return survey_data


In [10]:
# Parse the second-survey CSV export into a list of SurveyTwo objects.
survey_two_results = read_survey_two_data('survey2_four_initial_bad_participants.csv')


In [11]:
def get_empty_data_dict():
    """Create an empty column-oriented dataset.

    Returns:
        dict: Maps every dataset column name to its own new empty list. Keys
        are inserted in the order of the groups listed below.
    """
    # Self (not dependent on location).
    self_profile = (
        'self_age', 'self_gender', 'self_height', 'self_hair_type', 'self_hair_color',
        'self_tattoos', 'self_education', 'self_student', 'self_workforce', 'self_industry',
        'self_hobby',  # Favorite hobby.
        'self_interest',  # Favorite interest.
        'self_music_genre', 'self_personality', 'self_conversational_intensity',
        'self_social_media',  # Favorite social media.
        'self_music_listen_time',
    )

    # Candidate (not dependent on location).
    candidate_profile = (
        'candidate_age', 'candidate_gender', 'candidate_height', 'candidate_hair_type',
        'candidate_hair_color', 'candidate_tattoos', 'candidate_education', 'candidate_student',
        'candidate_workforce', 'candidate_industry',
        'candidate_hobby',  # Favorite hobby.
        'candidate_interest',  # Favorite interest.
        'candidate_music_genre', 'candidate_personality', 'candidate_conversational_intensity',
        'candidate_social_media',  # Favorite social media.
        'candidate_music_listen_time',
    )

    # Environment (non-deterministic).
    environment = (
        'location', 'weather', 'human_congestion_level', 'human_noise_level',
        'non_human_noise_level', 'candidate_occluded', 'gaze_self_to_candidate',
        'gaze_candidate_to_self', 'proximity',
    )

    # Environment (deterministic, dependent on location). Both based on candidate.
    schedule = ('day_of_week', 'time_of_day')

    # Self (dependent on location). The clothing columns are deterministic.
    self_at_location = (
        'self_group_size',
        'self_clothing_athletic', 'self_clothing_casual', 'self_clothing_trendy',
        'self_clothing_formal', 'self_clothing_designer', 'self_clothing_hat',
        'self_clothing_eyeglasses', 'self_clothing_sunglasses', 'self_clothing_necklace',
        'self_clothing_luxury_watch', 'self_clothing_rings', 'self_clothing_earrings',
        'self_clothing_smart_watch',
    )

    # Candidate (dependent on location). The clothing columns are non-deterministic.
    candidate_at_location = (
        'candidate_group_size',
        'candidate_clothing_athletic', 'candidate_clothing_casual', 'candidate_clothing_trendy',
        'candidate_clothing_formal', 'candidate_clothing_designer', 'candidate_clothing_hat',
        'candidate_clothing_eyeglasses', 'candidate_clothing_sunglasses',
        'candidate_clothing_necklace', 'candidate_clothing_luxury_watch',
        'candidate_clothing_rings', 'candidate_clothing_earrings',
        'candidate_clothing_smart_watch',
    )

    # Output label.
    label = ('self_decision',)

    ordered_columns = (
        self_profile + candidate_profile + environment + schedule
        + self_at_location + candidate_at_location + label
    )
    # A comprehension (not dict.fromkeys) so that every column gets its own list.
    return {column: [] for column in ordered_columns}


In [12]:
class ScenarioInfo:
    """Description of one scenario shown to second-survey participants.

    Stores the candidate's Prolific ID, the location, the environment
    conditions and the candidate's clothing flags. Every argument is kept
    unchanged under an attribute of the same name.
    """

    def __init__(self,
                 candidate_prolific_id,
                 location,
                 weather,
                 human_congestion_level,
                 human_noise_level,
                 non_human_noise_level,
                 candidate_occluded,
                 gaze_self_to_candidate,
                 gaze_candidate_to_self,
                 proximity,
                 candidate_clothing_athletic,
                 candidate_clothing_casual,
                 candidate_clothing_trendy,
                 candidate_clothing_formal,
                 candidate_clothing_designer,
                 candidate_clothing_hat,
                 candidate_clothing_eyeglasses,
                 candidate_clothing_sunglasses,
                 candidate_clothing_necklace,
                 candidate_clothing_luxury_watch,
                 candidate_clothing_rings,
                 candidate_clothing_earrings,
                 candidate_clothing_smart_watch):
        # Populate the instance dictionary in one step. Keyword order is kept,
        # so attributes are created in the same order as the parameters.
        vars(self).update(
            # Who the candidate is and where the scenario takes place.
            candidate_prolific_id=candidate_prolific_id,
            location=location,
            # Environment conditions.
            weather=weather,
            human_congestion_level=human_congestion_level,
            human_noise_level=human_noise_level,
            non_human_noise_level=non_human_noise_level,
            candidate_occluded=candidate_occluded,
            gaze_self_to_candidate=gaze_self_to_candidate,
            gaze_candidate_to_self=gaze_candidate_to_self,
            proximity=proximity,
            # What the candidate is wearing in this scenario.
            candidate_clothing_athletic=candidate_clothing_athletic,
            candidate_clothing_casual=candidate_clothing_casual,
            candidate_clothing_trendy=candidate_clothing_trendy,
            candidate_clothing_formal=candidate_clothing_formal,
            candidate_clothing_designer=candidate_clothing_designer,
            candidate_clothing_hat=candidate_clothing_hat,
            candidate_clothing_eyeglasses=candidate_clothing_eyeglasses,
            candidate_clothing_sunglasses=candidate_clothing_sunglasses,
            candidate_clothing_necklace=candidate_clothing_necklace,
            candidate_clothing_luxury_watch=candidate_clothing_luxury_watch,
            candidate_clothing_rings=candidate_clothing_rings,
            candidate_clothing_earrings=candidate_clothing_earrings,
            candidate_clothing_smart_watch=candidate_clothing_smart_watch,
        )


In [13]:
def append_scenario_info(data, scenario_info):
    """Append one scenario's environment and candidate-clothing values to `data`.

    For each field listed below, the value of the attribute with that name on
    `scenario_info` is appended to the list stored under the same key in `data`.
    `data` is modified in place; nothing is returned.

    Args:
        data: Dict mapping column names to lists.
        scenario_info: Object exposing an attribute for each listed field.
    """
    copied_fields = (
        # Environment.
        'location',
        'weather',
        'human_congestion_level',
        'human_noise_level',
        'non_human_noise_level',
        'candidate_occluded',
        'gaze_self_to_candidate',
        'gaze_candidate_to_self',
        'proximity',
        # Candidate clothing.
        'candidate_clothing_athletic',
        'candidate_clothing_casual',
        'candidate_clothing_trendy',
        'candidate_clothing_formal',
        'candidate_clothing_designer',
        'candidate_clothing_hat',
        'candidate_clothing_eyeglasses',
        'candidate_clothing_sunglasses',
        'candidate_clothing_necklace',
        'candidate_clothing_luxury_watch',
        'candidate_clothing_rings',
        'candidate_clothing_earrings',
        'candidate_clothing_smart_watch',
    )
    for field in copied_fields:
        data[field].append(getattr(scenario_info, field))


In [14]:
def append_self_clothing(data, self_user, location):
    """Append the respondent's clothing flags for `location` to `data`.

    For each clothing item, the result of `has_clothing(item, self_user,
    location)` is appended to the matching `self_clothing_*` column. `data` is
    modified in place; nothing is returned.

    Args:
        data: Dict mapping column names to lists.
        self_user: The respondent's first-survey answers.
        location: Survey label of the scenario's location.
    """
    # (dataset column, clothing label understood by has_clothing), in column order.
    column_items = (
        ('self_clothing_athletic', 'athletic clothes'),
        ('self_clothing_casual', 'casual clothes'),
        ('self_clothing_trendy', 'trendy clothes'),
        ('self_clothing_formal', 'formal clothes'),
        ('self_clothing_designer', 'designer clothes'),
        ('self_clothing_hat', 'hat'),
        ('self_clothing_eyeglasses', 'eyeglasses'),
        ('self_clothing_sunglasses', 'sunglasses'),
        ('self_clothing_necklace', 'necklace'),
        ('self_clothing_luxury_watch', 'luxury watch'),
        ('self_clothing_rings', 'rings'),
        ('self_clothing_earrings', 'earrings'),
        ('self_clothing_smart_watch', 'smart watch'),
    )
    for column, item in column_items:
        data[column].append(has_clothing(item, self_user, location))


In [15]:
def create_dataset(survey_one_results_dict, survey_two_results, scenario_info_list):
    """Build the column-oriented dataset, one row per (respondent, scenario) pair.

    For every second-survey response and every scenario, the row combines the
    respondent's ("self") and the scenario candidate's first-survey answers,
    the scenario's own fields, and the respondent's decision for that scenario.

    Args:
        survey_one_results_dict: Mapping of Prolific ID to first-survey answers.
            It is looked up for each respondent and each scenario candidate.
        survey_two_results: Iterable of second-survey responses exposing
            `prolific_id` and `decisions`; `decisions[i]` belongs to
            `scenario_info_list[i]`.
        scenario_info_list: Sequence of scenario descriptions.

    Returns:
        dict: Column name -> list of values, as created by
        `get_empty_data_dict` and filled row by row.

    Raises:
        Exception: If a response's number of decisions differs from the number
            of scenarios.
        KeyError: If a respondent or candidate ID is missing from
            `survey_one_results_dict`.
        ValueError: If an age is not an integer literal.
    """
    # (column suffix, first-survey attribute) for the location-independent
    # profile columns, in output order. Used for both 'self' and 'candidate'.
    profile_fields = (
        ('age', 'age'),
        ('gender', 'gender'),
        ('height', 'height'),
        ('hair_type', 'hair_type'),
        ('hair_color', 'hair_color'),
        ('tattoos', 'has_tattoos'),
        ('education', 'education'),
        ('student', 'is_student'),
        ('workforce', 'is_in_workforce'),
        ('industry', 'industry'),
        ('hobby', 'favorite_hobby'),
        ('interest', 'favorite_interest'),
        ('music_genre', 'music_genre'),
        ('personality', 'personality'),
        ('conversational_intensity', 'listen_or_speak'),
        ('social_media', 'favorite_social_media'),
        ('music_listen_time', 'music_listen_time'),
    )

    data = get_empty_data_dict()

    def append_profile(role, person):
        # `role` is 'self' or 'candidate' and selects the column prefix.
        for suffix, attribute in profile_fields:
            value = getattr(person, attribute)
            if suffix == 'age':
                value = int(value)
            elif suffix == 'height':
                value = feet_inches_to_inches(value)
            data[role + '_' + suffix].append(value)

    for response in survey_two_results:
        respondent_id = response.prolific_id
        choices = response.decisions
        if len(choices) != len(scenario_info_list):
            raise Exception("Length of self decisions does not match length of scenario info list.")

        for scenario, choice in zip(scenario_info_list, choices):
            candidate_id = scenario.candidate_prolific_id

            # Self (not dependent on location).
            respondent = survey_one_results_dict[respondent_id]
            append_profile('self', respondent)

            # Candidate (not dependent on location).
            candidate = survey_one_results_dict[candidate_id]
            append_profile('candidate', candidate)

            # Environment (non-deterministic) & Candidate clothing (non-deterministic).
            append_scenario_info(data, scenario)

            # Environment (deterministic, dependent on location). -- Based on candidate.
            day_of_week, time_of_day, candidate_group_size = get_day_time_and_group_size(candidate, scenario.location)
            data['day_of_week'].append(day_of_week)
            data['time_of_day'].append(time_of_day)

            # Self (dependent on location). Only the group size is used here.
            self_group_size = get_day_time_and_group_size(respondent, scenario.location)[2]
            data['self_group_size'].append(self_group_size)
            append_self_clothing(data, respondent, scenario.location)

            # Candidate (dependent on location).
            data['candidate_group_size'].append(candidate_group_size)

            # Output label.
            data['self_decision'].append(choice)

    return data


In [16]:
# The six scenarios below differ only in the candidate's Prolific ID and the
# location. Every environment and candidate-clothing field is set to a string
# equal to its own parameter name (these look like placeholders — TODO confirm
# and replace with the real scenario values).
_SCENARIO_PLACEHOLDER_VALUES = (
    'weather',
    'human_congestion_level',
    'human_noise_level',
    'non_human_noise_level',
    'candidate_occluded',
    'gaze_self_to_candidate',
    'gaze_candidate_to_self',
    'proximity',
    'candidate_clothing_athletic',
    'candidate_clothing_casual',
    'candidate_clothing_trendy',
    'candidate_clothing_formal',
    'candidate_clothing_designer',
    'candidate_clothing_hat',
    'candidate_clothing_eyeglasses',
    'candidate_clothing_sunglasses',
    'candidate_clothing_necklace',
    'candidate_clothing_luxury_watch',
    'candidate_clothing_rings',
    'candidate_clothing_earrings',
    'candidate_clothing_smart_watch',
)

scenario_info_1 = ScenarioInfo('5e78e53a0b2d8247350c1c86', 'Sit-down restaurant', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_2 = ScenarioInfo('60dd16a6d16ec5f253b2e29e', 'Cafe / Coffee shop', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_3 = ScenarioInfo('5d68c88914867f000139f627', 'Sit-down restaurant', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_4 = ScenarioInfo('6620e04a8b73f1ffd8692f03', 'Gym', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_5 = ScenarioInfo('5f0a5a99dbbf721316f118e2', 'Cafe / Coffee shop', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_6 = ScenarioInfo('660752ba689e8457ca3487cd', 'Bar / Nightclub', *_SCENARIO_PLACEHOLDER_VALUES)
scenario_info_7 = ScenarioInfo('637ea165e071484955b325f7',
                               'Attraction (museum, concert, movie theater, shopping mall, amusement park, etc.)',
                               'weather',
                               'human_congestion_level',
                               'human_noise_level',
                               'non_human_noise_level',
                               'candidate_occluded',
                               'gaze_self_to_candidate',
                               'gaze_candidate_to_self',
                               'proximity',
                               'candidate_clothing_athletic',
                               'candidate_clothing_casual',
                               'candidate_clothing_trendy',
                               'candidate_clothing_formal',
                               'candidate_clothing_designer',
                               'candidate_clothing_hat',
                               'candidate_clothing_eyeglasses',
                               'candidate_clothing_sunglasses',
                               'candidate_clothing_necklace',
                               'candidate_clothing_luxury_watch',
                               'candidate_clothing_rings',
                               'candidate_clothing_earrings',
                               'candidate_clothing_smart_watch')
scenario_info_8 = ScenarioInfo('659711fa417392dfbe43d439',
                               'Sit-down restaurant',
                               'weather',
                               'human_congestion_level',
                               'human_noise_level',
                               'non_human_noise_level',
                               'candidate_occluded',
                               'gaze_self_to_candidate',
                               'gaze_candidate_to_self',
                               'proximity',
                               'candidate_clothing_athletic',
                               'candidate_clothing_casual',
                               'candidate_clothing_trendy',
                               'candidate_clothing_formal',
                               'candidate_clothing_designer',
                               'candidate_clothing_hat',
                               'candidate_clothing_eyeglasses',
                               'candidate_clothing_sunglasses',
                               'candidate_clothing_necklace',
                               'candidate_clothing_luxury_watch',
                               'candidate_clothing_rings',
                               'candidate_clothing_earrings',
                               'candidate_clothing_smart_watch')
scenario_info_9 = ScenarioInfo('5eb3935fd0e02317909c5f32',
                               'Sit-down restaurant',
                               'weather',
                               'human_congestion_level',
                               'human_noise_level',
                               'non_human_noise_level',
                               'candidate_occluded',
                               'gaze_self_to_candidate',
                               'gaze_candidate_to_self',
                               'proximity',
                               'candidate_clothing_athletic',
                               'candidate_clothing_casual',
                               'candidate_clothing_trendy',
                               'candidate_clothing_formal',
                               'candidate_clothing_designer',
                               'candidate_clothing_hat',
                               'candidate_clothing_eyeglasses',
                               'candidate_clothing_sunglasses',
                               'candidate_clothing_necklace',
                               'candidate_clothing_luxury_watch',
                               'candidate_clothing_rings',
                               'candidate_clothing_earrings',
                               'candidate_clothing_smart_watch')
scenario_info_10 = ScenarioInfo('6604ae923fe580e8b2e745f2',
                               'Cafe / Coffee shop',
                               'weather',
                               'human_congestion_level',
                               'human_noise_level',
                               'non_human_noise_level',
                               'candidate_occluded',
                               'gaze_self_to_candidate',
                               'gaze_candidate_to_self',
                               'proximity',
                               'candidate_clothing_athletic',
                               'candidate_clothing_casual',
                               'candidate_clothing_trendy',
                               'candidate_clothing_formal',
                               'candidate_clothing_designer',
                               'candidate_clothing_hat',
                               'candidate_clothing_eyeglasses',
                               'candidate_clothing_sunglasses',
                               'candidate_clothing_necklace',
                               'candidate_clothing_luxury_watch',
                               'candidate_clothing_rings',
                               'candidate_clothing_earrings',
                               'candidate_clothing_smart_watch')

scenario_info_list = [scenario_info_1, scenario_info_2, scenario_info_3, scenario_info_4, scenario_info_5,
                      scenario_info_6, scenario_info_7, scenario_info_8, scenario_info_9, scenario_info_10]

dataset = create_dataset(survey_one_results_dict, survey_two_results, scenario_info_list)


In [17]:
def update_best_metrics_from_tuple(numbers_tuple, numbers_list, update_markers, update_string):
    """Merge new scores into the running best scores, position by position.

    For each index, the value from ``numbers_tuple`` replaces the one in
    ``numbers_list`` only when it is strictly greater; in that case the
    marker at that index becomes ``update_string``. Otherwise the existing
    value and its existing marker are kept. The inputs are not modified.

    Args:
        numbers_tuple: Sequence of new candidate values.
        numbers_list: Sequence of current best values, same length.
        update_markers: Sequence of markers paired with ``numbers_list``,
            same length.
        update_string: Marker recorded wherever a candidate value wins.

    Returns:
        A ``(values, markers)`` pair of new lists.

    Raises:
        ValueError: If the three sequences do not all have the same length.
    """
    expected_len = len(numbers_tuple)
    if not (len(numbers_list) == expected_len and len(update_markers) == expected_len):
        raise ValueError("All lists/tuple must have the same length")

    # Decide per position which (value, marker) pair survives; ties keep the old pair.
    winners = [
        (candidate, update_string) if candidate > current else (current, current_marker)
        for candidate, current, current_marker in zip(numbers_tuple, numbers_list, update_markers)
    ]

    best_values = [value for value, _ in winners]
    best_markers = [label for _, label in winners]
    return best_values, best_markers


In [18]:
def rf_cross_val():
    df = pd.DataFrame(dataset)
    
    # Preprocessing for model using all features (includes MR, right-time, and user features).
    X_mr = df[['self_age', 'self_gender', 'self_height', 'self_hair_type', 'self_hair_color',
            'self_tattoos', 'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality', 'self_conversational_intensity',
            'self_social_media', 'self_music_listen_time', 'candidate_age', 'candidate_gender', 'candidate_height',
            'candidate_hair_type', 'candidate_hair_color', 'candidate_tattoos', 'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_conversational_intensity', 'candidate_social_media', 'candidate_music_listen_time', 'location',
            'weather', 'human_congestion_level', 'human_noise_level', 'non_human_noise_level', 'candidate_occluded',
            'gaze_self_to_candidate', 'gaze_candidate_to_self', 'proximity', 'day_of_week', 'time_of_day',
            'self_group_size', 'self_clothing_athletic', 'self_clothing_casual', 'self_clothing_trendy', 'self_clothing_formal',
            'self_clothing_designer', 'self_clothing_hat', 'self_clothing_eyeglasses', 'self_clothing_sunglasses', 'self_clothing_necklace',
            'self_clothing_luxury_watch', 'self_clothing_rings', 'self_clothing_earrings', 'self_clothing_smart_watch', 'candidate_group_size',
            'candidate_clothing_athletic', 'candidate_clothing_casual', 'candidate_clothing_trendy', 'candidate_clothing_formal', 'candidate_clothing_designer',
            'candidate_clothing_hat', 'candidate_clothing_eyeglasses', 'candidate_clothing_sunglasses', 'candidate_clothing_necklace', 'candidate_clothing_luxury_watch',
            'candidate_clothing_rings', 'candidate_clothing_earrings', 'candidate_clothing_smart_watch']]
    y_mr = df['self_decision']
    y_mr_accept_reject = y_mr.copy()
    y_mr_accept_reject = y_mr_accept_reject.replace(['Meet (in-person)', 'Chat (via instant messaging)'], 'Accept')
    transformer_mr = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), ['self_gender', 'self_hair_type', 'self_hair_color',
            'self_tattoos', 'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality', 'self_conversational_intensity',
            'self_social_media', 'self_music_listen_time', 'candidate_gender',
            'candidate_hair_type', 'candidate_hair_color', 'candidate_tattoos', 'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_conversational_intensity', 'candidate_social_media', 'candidate_music_listen_time', 'location',
            'weather', 'human_congestion_level', 'human_noise_level', 'non_human_noise_level', 'candidate_occluded',
            'gaze_self_to_candidate', 'gaze_candidate_to_self', 'proximity', 'day_of_week', 'time_of_day',
            'self_group_size', 'self_clothing_athletic', 'self_clothing_casual', 'self_clothing_trendy', 'self_clothing_formal',
            'self_clothing_designer', 'self_clothing_hat', 'self_clothing_eyeglasses', 'self_clothing_sunglasses', 'self_clothing_necklace',
            'self_clothing_luxury_watch', 'self_clothing_rings', 'self_clothing_earrings', 'self_clothing_smart_watch', 'candidate_group_size',
            'candidate_clothing_athletic', 'candidate_clothing_casual', 'candidate_clothing_trendy', 'candidate_clothing_formal', 'candidate_clothing_designer',
            'candidate_clothing_hat', 'candidate_clothing_eyeglasses', 'candidate_clothing_sunglasses', 'candidate_clothing_necklace', 'candidate_clothing_luxury_watch',
            'candidate_clothing_rings', 'candidate_clothing_earrings', 'candidate_clothing_smart_watch'])],
        remainder='passthrough'
    )
    X_mr_transformed = transformer_mr.fit_transform(X_mr)

    # Preprocessing for model which excludes MR features but includes right-time-features and user features.
    X_non_mr = df[['self_age', 'self_gender',
            'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality',
            'self_social_media', 'self_music_listen_time', 'candidate_age', 'candidate_gender',
            'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_social_media', 'candidate_music_listen_time', 'location',
            'weather', 'human_noise_level', 'non_human_noise_level',
            'day_of_week', 'time_of_day']]
    y_non_mr = df['self_decision']
    y_non_mr_accept_reject = y_non_mr.copy()
    y_non_mr_accept_reject = y_non_mr_accept_reject.replace(['Meet (in-person)', 'Chat (via instant messaging)'], 'Accept')
    transformer_non_mr = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), ['self_gender',
            'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality',
            'self_social_media', 'self_music_listen_time', 'candidate_gender',
            'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_social_media', 'candidate_music_listen_time', 'location',
            'weather', 'human_noise_level', 'non_human_noise_level',
            'day_of_week', 'time_of_day'])],
        remainder='passthrough'
    )
    X_non_mr_transformed = transformer_non_mr.fit_transform(X_non_mr)

    # Preprocessing for model which excludes right-time features but includes MR and user features.
    X_mr_user = df[['self_age', 'self_gender', 'self_height', 'self_hair_type', 'self_hair_color',
            'self_tattoos', 'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality', 'self_conversational_intensity',
            'self_social_media', 'self_music_listen_time', 'candidate_age', 'candidate_gender', 'candidate_height',
            'candidate_hair_type', 'candidate_hair_color', 'candidate_tattoos', 'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_conversational_intensity', 'candidate_social_media', 'candidate_music_listen_time',
            'human_congestion_level', 'candidate_occluded',
            'gaze_self_to_candidate', 'gaze_candidate_to_self', 'proximity',
            'self_group_size', 'self_clothing_athletic', 'self_clothing_casual', 'self_clothing_trendy', 'self_clothing_formal',
            'self_clothing_designer', 'self_c lothing_hat', 'self_clothing_eyeglasses', 'self_clothing_sunglasses', 'self_clothing_necklace',
            'self_clothing_luxury_watch', 'self_clothing_rings', 'self_clothing_earrings', 'self_clothing_smart_watch', 'candidate_group_size',
            'candidate_clothing_athletic', 'candidate_clothing_casual', 'candidate_clothing_trendy', 'candidate_clothing_formal', 'candidate_clothing_designer',
            'candidate_clothing_hat', 'candidate_clothing_eyeglasses', 'candidate_clothing_sunglasses', 'candidate_clothing_necklace', 'candidate_clothing_luxury_watch',
            'candidate_clothing_rings', 'candidate_clothing_earrings', 'candidate_clothing_smart_watch']]
    y_mr_user = df['self_decision']
    y_mr_user_accept_reject = y_mr_user.copy()
    y_mr_user_accept_reject = y_mr_user_accept_reject.replace(['Meet (in-person)', 'Chat (via instant messaging)'], 'Accept')
    transformer_mr_user = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), ['self_gender', 'self_hair_type', 'self_hair_color',
            'self_tattoos', 'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality', 'self_conversational_intensity',
            'self_social_media', 'self_music_listen_time', 'candidate_gender',
            'candidate_hair_type', 'candidate_hair_color', 'candidate_tattoos', 'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_conversational_intensity', 'candidate_social_media', 'candidate_music_listen_time',
            'human_congestion_level', 'candidate_occluded',
            'gaze_self_to_candidate', 'gaze_candidate_to_self', 'proximity',
            'self_group_size', 'self_clothing_athletic', 'self_clothing_casual', 'self_clothing_trendy', 'self_clothing_formal',
            'self_clothing_designer', 'self_clothing_hat', 'self_clothing_eyeglasses', 'self_clothing_sunglasses', 'self_clothing_necklace',
            'self_clothing_luxury_watch', 'self_clothing_rings', 'self_clothing_earrings', 'self_clothing_smart_watch', 'candidate_group_size',
            'candidate_clothing_athletic', 'candidate_clothing_casual', 'candidate_clothing_trendy', 'candidate_clothing_formal', 'candidate_clothing_designer',
            'candidate_clothing_hat', 'candidate_clothing_eyeglasses', 'candidate_clothing_sunglasses', 'candidate_clothing_necklace', 'candidate_clothing_luxury_watch',
            'candidate_clothing_rings', 'candidate_clothing_earrings', 'candidate_clothing_smart_watch'])],
        remainder='passthrough'
    )
    X_mr_user_transformed = transformer_mr_user.fit_transform(X_mr_user)

    # Preprocessing for model which only includes user features.
    X_user = df[['self_age', 'self_gender',
            'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality',
            'self_social_media', 'self_music_listen_time', 'candidate_age', 'candidate_gender',
            'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_social_media', 'candidate_music_listen_time']]
    y_user = df['self_decision']
    y_user_accept_reject = y_user.copy()
    y_user_accept_reject = y_user_accept_reject.replace(['Meet (in-person)', 'Chat (via instant messaging)'], 'Accept')
    transformer_user = ColumnTransformer(
        transformers=[('onehot', OneHotEncoder(), ['self_gender',
            'self_education', 'self_student', 'self_workforce', 'self_industry',
            'self_hobby', 'self_interest', 'self_music_genre', 'self_personality',
            'self_social_media', 'self_music_listen_time', 'candidate_gender',
            'candidate_education', 'candidate_student',
            'candidate_workforce', 'candidate_industry', 'candidate_hobby', 'candidate_interest', 'candidate_music_genre',
            'candidate_personality', 'candidate_social_media', 'candidate_music_listen_time'])],
        remainder='passthrough'
    )
    X_user_transformed = transformer_user.fit_transform(X_user)

    param_grid = {
        'n_estimators': [100, 500, 1000, 2000],
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    best_meet_chat_reject_mr_metrics = [0] * 10
    best_meet_chat_reject_mr_files = [''] * 10

    best_accept_reject_mr_metrics = [0] * 10
    best_accept_reject_mr_files = [''] * 10

    best_meet_chat_reject_non_mr_metrics = [0] * 10
    best_meet_chat_reject_non_mr_files = [''] * 10

    best_accept_reject_non_mr_metrics = [0] * 10
    best_accept_reject_non_mr_files = [''] * 10

##################################################

    best_meet_chat_reject_mr_user_metrics = [0] * 10
    best_meet_chat_reject_mr_user_files = [''] * 10

    best_accept_reject_mr_user_metrics = [0] * 10
    best_accept_reject_mr_user_files = [''] * 10

    best_meet_chat_reject_user_metrics = [0] * 10
    best_meet_chat_reject_user_files = [''] * 10

    best_accept_reject_user_metrics = [0] * 10
    best_accept_reject_user_files = [''] * 10

    for params in ParameterGrid(param_grid):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        filename = f"model_results/rf_results_{timestamp}_{params}.txt"
    
        with open(filename, 'w') as f:
            print(f'Writing file: {filename}')
            f.write(f"Hyperparameters: {params}\n")

##################################################

            # Cross-validation loop for model using all features (includes MR, right-time, and user features).
            f.write(f"~~~~~~~~~~\n\nModel using all features (includes MR, right-time, and user features):\n")
            fold_metrics = []
            fold_metrics_accept_reject = []
            for train_index, test_index in kf.split(X_mr_transformed):
                X_train, X_test = X_mr_transformed[train_index], X_mr_transformed[test_index]
                y_train, y_test = y_mr[train_index], y_mr[test_index]
                y_train_accept_reject, y_test_accept_reject = y_mr_accept_reject[train_index], y_mr_accept_reject[test_index]

                rf = RandomForestClassifier(**params)
                rf.fit(X_train, y_train)
                y_pred = rf.predict(X_test)

                rf_accept_reject = RandomForestClassifier(**params)
                rf_accept_reject.fit(X_train, y_train_accept_reject)
                y_pred_accept_reject = rf_accept_reject.predict(X_test)

                # Calculate metrics (meet, chat, reject).
                accuracy = accuracy_score(y_test, y_pred)
                precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
                precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)
                recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test, y_pred, average='weighted')
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')

                fold_metrics.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (meet, chat, reject): {fold_metrics[-1]}\n")

                # Calculate metrics (accept, reject).
                accuracy = accuracy_score(y_test_accept_reject, y_pred_accept_reject)
                precision_weighted = precision_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                precision_macro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                recall_macro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test_accept_reject, y_pred_accept_reject, average='weighted')
                f1_micro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='micro')
                f1_macro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='macro')

                fold_metrics_accept_reject.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (accept, reject): {fold_metrics_accept_reject[-1]}\n")

            # Average MR metrics (meet, chat, reject).
            avg_metrics = tuple(np.mean(fold_metrics, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [MEET, CHAT, REJECT] METRICS (MR + right-time + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/meet_chat_reject_mr_{timestamp}_{params}.joblib'
            joblib.dump(rf, model_filename)
            best_meet_chat_reject_mr_metrics, best_meet_chat_reject_mr_files = update_best_metrics_from_tuple(avg_metrics,
                    best_meet_chat_reject_mr_metrics, best_meet_chat_reject_mr_files, model_filename)
            f.write(f"\nbest_meet_chat_reject_mr_files: {best_meet_chat_reject_mr_files}\n")

            # Average MR metrics (accept, reject).
            avg_metrics = tuple(np.mean(fold_metrics_accept_reject, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [ACCEPT, REJECT] METRICS (MR + right-time + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/accept_reject_mr_{timestamp}_{params}.joblib'
            joblib.dump(rf_accept_reject, model_filename)
            best_accept_reject_mr_metrics, best_accept_reject_mr_files = update_best_metrics_from_tuple(avg_metrics,
                    best_accept_reject_mr_metrics, best_accept_reject_mr_files, model_filename)
            f.write(f"\nbest_accept_reject_mr_files: {best_accept_reject_mr_files}\n")

##################################################

            # Cross-validation loop for model which excludes MR features but includes right-time-features and user features.
            f.write(f"~~~~~~~~~~\n\nModel which excludes MR features but includes right-time-features and user features:\n")
            fold_metrics = []
            fold_metrics_accept_reject = []
            for train_index, test_index in kf.split(X_non_mr_transformed):
                X_train, X_test = X_non_mr_transformed[train_index], X_non_mr_transformed[test_index]
                y_train, y_test = y_non_mr[train_index], y_non_mr[test_index]
                y_train_accept_reject, y_test_accept_reject = y_non_mr_accept_reject[train_index], y_non_mr_accept_reject[test_index]

                rf = RandomForestClassifier(**params)
                rf.fit(X_train, y_train)
                y_pred = rf.predict(X_test)

                rf_accept_reject = RandomForestClassifier(**params)
                rf_accept_reject.fit(X_train, y_train_accept_reject)
                y_pred_accept_reject = rf_accept_reject.predict(X_test)

                # Calculate metrics (meet, chat, reject).
                accuracy = accuracy_score(y_test, y_pred)
                precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
                precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)
                recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test, y_pred, average='weighted')
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')

                fold_metrics.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (meet, chat, reject): {fold_metrics[-1]}\n")

                # Calculate metrics (accept, reject).
                accuracy = accuracy_score(y_test_accept_reject, y_pred_accept_reject)
                precision_weighted = precision_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                precision_macro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                recall_macro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test_accept_reject, y_pred_accept_reject, average='weighted')
                f1_micro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='micro')
                f1_macro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='macro')

                fold_metrics_accept_reject.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (accept, reject): {fold_metrics_accept_reject[-1]}\n")

            # Average non-MR metrics (meet, chat, reject).
            avg_metrics = tuple(np.mean(fold_metrics, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [MEET, CHAT, REJECT] METRICS (non-MR, w/ right-time + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/meet_chat_reject_non_mr_{timestamp}_{params}.joblib'
            joblib.dump(rf, model_filename)
            best_meet_chat_reject_non_mr_metrics, best_meet_chat_reject_non_mr_files = update_best_metrics_from_tuple(avg_metrics,
                    best_meet_chat_reject_non_mr_metrics, best_meet_chat_reject_non_mr_files, model_filename)
            f.write(f"\nbest_meet_chat_reject_non_mr_files: {best_meet_chat_reject_non_mr_files}\n")

            # Average non-MR metrics (accept, reject).
            avg_metrics = tuple(np.mean(fold_metrics_accept_reject, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [ACCEPT, REJECT] METRICS (non-MR, w/ right-time + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/accept_reject_non_mr_{timestamp}_{params}.joblib'
            joblib.dump(rf_accept_reject, model_filename)
            best_accept_reject_non_mr_metrics, best_accept_reject_non_mr_files = update_best_metrics_from_tuple(avg_metrics,
                    best_accept_reject_non_mr_metrics, best_accept_reject_non_mr_files, model_filename)
            f.write(f"\nbest_accept_reject_non_mr_files: {best_accept_reject_non_mr_files}\n")

##################################################

            # Cross-validation loop for model which excludes right-time features but includes MR and user features.
            f.write(f"~~~~~~~~~~\n\nModel which excludes right-time features but includes MR and user features:\n")
            fold_metrics = []
            fold_metrics_accept_reject = []
            for train_index, test_index in kf.split(X_mr_user_transformed):
                X_train, X_test = X_mr_user_transformed[train_index], X_mr_user_transformed[test_index]
                y_train, y_test = y_mr_user[train_index], y_mr_user[test_index]
                y_train_accept_reject, y_test_accept_reject = y_mr_user_accept_reject[train_index], y_mr_user_accept_reject[test_index]

                rf = RandomForestClassifier(**params)
                rf.fit(X_train, y_train)
                y_pred = rf.predict(X_test)

                rf_accept_reject = RandomForestClassifier(**params)
                rf_accept_reject.fit(X_train, y_train_accept_reject)
                y_pred_accept_reject = rf_accept_reject.predict(X_test)

                # Calculate metrics (meet, chat, reject).
                accuracy = accuracy_score(y_test, y_pred)
                precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
                precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)
                recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test, y_pred, average='weighted')
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')

                fold_metrics.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (meet, chat, reject): {fold_metrics[-1]}\n")

                # Calculate metrics (accept, reject).
                accuracy = accuracy_score(y_test_accept_reject, y_pred_accept_reject)
                precision_weighted = precision_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                precision_macro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                recall_macro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test_accept_reject, y_pred_accept_reject, average='weighted')
                f1_micro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='micro')
                f1_macro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='macro')

                fold_metrics_accept_reject.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (accept, reject): {fold_metrics_accept_reject[-1]}\n")

            # Average MR metrics (meet, chat, reject).
            avg_metrics = tuple(np.mean(fold_metrics, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [MEET, CHAT, REJECT] METRICS (MR + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/meet_chat_reject_mr_user_{timestamp}_{params}.joblib'
            joblib.dump(rf, model_filename)
            best_meet_chat_reject_mr_user_metrics, best_meet_chat_reject_mr_user_files = update_best_metrics_from_tuple(avg_metrics,
                    best_meet_chat_reject_mr_user_metrics, best_meet_chat_reject_mr_user_files, model_filename)
            f.write(f"\nbest_meet_chat_reject_mr_user_files: {best_meet_chat_reject_mr_user_files}\n")

            # Average MR metrics (accept, reject).
            avg_metrics = tuple(np.mean(fold_metrics_accept_reject, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [ACCEPT, REJECT] METRICS (MR + user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/accept_reject_mr_user_{timestamp}_{params}.joblib'
            joblib.dump(rf_accept_reject, model_filename)
            best_accept_reject_mr_user_metrics, best_accept_reject_mr_user_files = update_best_metrics_from_tuple(avg_metrics,
                    best_accept_reject_mr_user_metrics, best_accept_reject_mr_user_files, model_filename)
            f.write(f"\nbest_accept_reject_mr_user_files: {best_accept_reject_mr_user_files}\n")

##################################################

            # Cross-validation loop for model which only includes user features.
            f.write(f"~~~~~~~~~~\n\nModel which only includes user features:\n")
            fold_metrics = []
            fold_metrics_accept_reject = []
            for train_index, test_index in kf.split(X_user_transformed):
                X_train, X_test = X_user_transformed[train_index], X_user_transformed[test_index]
                y_train, y_test = y_user[train_index], y_user[test_index]
                y_train_accept_reject, y_test_accept_reject = y_user_accept_reject[train_index], y_user_accept_reject[test_index]

                rf = RandomForestClassifier(**params)
                rf.fit(X_train, y_train)
                y_pred = rf.predict(X_test)

                rf_accept_reject = RandomForestClassifier(**params)
                rf_accept_reject.fit(X_train, y_train_accept_reject)
                y_pred_accept_reject = rf_accept_reject.predict(X_test)

                # Calculate metrics (meet, chat, reject).
                accuracy = accuracy_score(y_test, y_pred)
                precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
                precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)
                recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test, y_pred, average='weighted')
                f1_micro = f1_score(y_test, y_pred, average='micro')
                f1_macro = f1_score(y_test, y_pred, average='macro')

                fold_metrics.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (meet, chat, reject): {fold_metrics[-1]}\n")

                # Calculate metrics (accept, reject).
                accuracy = accuracy_score(y_test_accept_reject, y_pred_accept_reject)
                precision_weighted = precision_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                precision_micro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                precision_macro = precision_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                recall_weighted = recall_score(y_test_accept_reject, y_pred_accept_reject, average='weighted', zero_division=0)
                recall_micro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='micro', zero_division=0)
                recall_macro = recall_score(y_test_accept_reject, y_pred_accept_reject, average='macro', zero_division=0)
                f1_weighted = f1_score(y_test_accept_reject, y_pred_accept_reject, average='weighted')
                f1_micro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='micro')
                f1_macro = f1_score(y_test_accept_reject, y_pred_accept_reject, average='macro')

                fold_metrics_accept_reject.append((accuracy,
                                     precision_weighted, precision_micro, precision_macro,
                                     recall_weighted, recall_micro, recall_macro,
                                     f1_weighted, f1_micro, f1_macro))
                f.write(f"Fold metrics (accept, reject): {fold_metrics_accept_reject[-1]}\n")

            # Average user-only metrics (meet, chat, reject).
            avg_metrics = tuple(np.mean(fold_metrics, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [MEET, CHAT, REJECT] METRICS (user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/meet_chat_reject_user_{timestamp}_{params}.joblib'
            joblib.dump(rf, model_filename)
            best_meet_chat_reject_user_metrics, best_meet_chat_reject_user_files = update_best_metrics_from_tuple(avg_metrics,
                    best_meet_chat_reject_user_metrics, best_meet_chat_reject_user_files, model_filename)
            f.write(f"\nbest_meet_chat_reject_user_files: {best_meet_chat_reject_user_files}\n")

            # Average user-only metrics (accept, reject).
            avg_metrics = tuple(np.mean(fold_metrics_accept_reject, axis=0))
            f.write("\n~~~~~~~~~~\nAVERAGE [ACCEPT, REJECT] METRICS (user)\n~~~~~~~~~~")
            f.write(f"\nAvg metrics: {avg_metrics}\n")
            f.write(f"Avg accuracy: {avg_metrics[0]}\n")
            f.write(f"Avg precision (weighted): {avg_metrics[1]}\n")
            f.write(f"Avg precision (micro): {avg_metrics[2]}\n")
            f.write(f"Avg precision (macro): {avg_metrics[3]}\n")
            f.write(f"Avg recall (weighted): {avg_metrics[4]}\n")
            f.write(f"Avg recall (micro): {avg_metrics[5]}\n")
            f.write(f"Avg recall (macro): {avg_metrics[6]}\n")
            f.write(f"Avg F1 (weighted): {avg_metrics[7]}\n")
            f.write(f"Avg F1 (micro): {avg_metrics[8]}\n")
            f.write(f"Avg F1 (macro): {avg_metrics[9]}\n")
            model_filename = f'saved_models/accept_reject_user_{timestamp}_{params}.joblib'
            joblib.dump(rf_accept_reject, model_filename)
            best_accept_reject_user_metrics, best_accept_reject_user_files = update_best_metrics_from_tuple(avg_metrics,
                    best_accept_reject_user_metrics, best_accept_reject_user_files, model_filename)
            f.write(f"\nbest_accept_reject_user_files: {best_accept_reject_user_files}\n")


In [19]:
# Run the random-forest cross-validation sweep. For each hyperparameter
# combination it logs per-fold and averaged metrics to a results file and
# dumps the fitted models under saved_models/.
# NOTE: this is long-running; the recorded output of this cell ends in a
# KeyboardInterrupt, so the saved results are partial.
rf_cross_val()


Writing file: model_results/rf_results_20240424-002413_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}.txt
Writing file: model_results/rf_results_20240424-002415_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}.txt
Writing file: model_results/rf_results_20240424-002424_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}.txt
Writing file: model_results/rf_results_20240424-002442_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 2000}.txt
Writing file: model_results/rf_results_20240424-002518_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}.txt
Writing file: model_results/rf_results_20240424-002520_{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators':

KeyboardInterrupt: 

In [None]:
# Build the modelling frame from the in-memory `dataset` records.
df = pd.DataFrame(dataset)

# Attributes recorded once per role; each becomes '<role>_<attribute>'.
roles = ('self', 'candidate')
profile_attributes = [
    'age', 'gender', 'height', 'hair_type', 'hair_color', 'tattoos',
    'education', 'student', 'workforce', 'industry', 'hobby', 'interest',
    'music_genre', 'personality', 'conversational_intensity',
    'social_media', 'music_listen_time',
]
appearance_attributes = [
    'group_size', 'clothing_athletic', 'clothing_casual', 'clothing_trendy',
    'clothing_formal', 'clothing_designer', 'clothing_hat',
    'clothing_eyeglasses', 'clothing_sunglasses', 'clothing_necklace',
    'clothing_luxury_watch', 'clothing_rings', 'clothing_earrings',
    'clothing_smart_watch',
]
# Columns that are listed once rather than duplicated per role.
context_columns = [
    'location', 'weather', 'human_congestion_level', 'human_noise_level',
    'non_human_noise_level', 'candidate_occluded', 'gaze_self_to_candidate',
    'gaze_candidate_to_self', 'proximity', 'day_of_week', 'time_of_day',
]

# Column order matters: it fixes the layout of the transformed matrix.
# Order is: profiles (self, candidate), context, appearance (self, candidate).
feature_columns = (
    [f'{role}_{attribute}' for role in roles for attribute in profile_attributes]
    + context_columns
    + [f'{role}_{attribute}' for role in roles for attribute in appearance_attributes]
)

# These four columns are the only ones not one-hot encoded; they reach the
# model unchanged through remainder='passthrough'.
passthrough_columns = {'self_age', 'self_height', 'candidate_age', 'candidate_height'}
categorical_columns = [
    column for column in feature_columns if column not in passthrough_columns
]

X = df[feature_columns]
y = df['self_decision']

# One-hot encode every categorical column; the encoder is fitted on all rows
# before the split below.
transformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)
X_transformed = transformer.fit_transform(X)

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42
)


In [None]:
# Fit a 100-tree random forest on the training split (seeded so reruns match).
# Hyperparameters here are a starting point and can be tuned.
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the fitted forest on the held-out split.
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Persist the classifier only; the fitted `transformer` is not saved with it,
# so inputs must already be transformed when the model is reloaded.
joblib.dump(value=rf_clf, filename='saved_models/my_model.joblib')


In [None]:
# Restore the classifier written by the training cell.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load(filename='saved_models/my_model.joblib')

# Re-score on the same held-out split to confirm the round-trip; X_test is
# already transformed, which is what the reloaded classifier expects.
y_pred = loaded_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
