In [1]:
import pandas as pd
import os
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import random
import shutil
import itertools
import numpy as np

This script creates all permutations of the selected search categories and randomly puts them in a folder structure. This makes it possible to keep track of all the results and makes it possible to later combine the results of the search and the search parameters.

In [2]:
today = datetime.today()

# create random birthdays for different age groups
date_of_birth = [
    pd.date_range(start=today - relativedelta(years=30), end=today - relativedelta(years=18, days=30), periods=80).to_list(),
    pd.date_range(start=today - relativedelta(years=40), end=today - relativedelta(years=30), periods=80).to_list(),
    pd.date_range(start=today - relativedelta(years=50), end=today - relativedelta(years=40), periods=80).to_list(),
    pd.date_range(start=today - relativedelta(years=60), end=today - relativedelta(years=50), periods=80).to_list(),
    pd.date_range(start=today - relativedelta(years=91), end=today - relativedelta(years=60), periods=80).to_list()]

# set the search parameters
# date_of_birth and date_of_drivers_license are representatives to late assess the dates
search_params = {
    'date_of_birth':list(range(0, 5)), 
    'gender': ['m', 'f'], 
    'nationality':['Schweiz', 'Italien', 'Portugal', 'Kosovo', 'Frankreich'],
    'date_of_drivers_license':list(range(0, 3))}

# 3 methods which create date_of_drivers_license, so that they are representative of the three categories (less then 1 year ago, 1 to 3 years ago and longer then 3 years ago)
# the methods are also assuring, that the date of birth and the date of drivers license are at least 18 year appart, which is the legal driving age in Switzerland

def create_dod(dob):
    dod_list = pd.date_range(start=dob + relativedelta(years=18), end= datetime.today()-relativedelta(years=3), periods=80).to_list()
    return(random.choice(dod_list))

def create_dod2(dob):
    if dob + timedelta(days=18*365.25) < datetime.today() - timedelta(days=1*365.25):
        dod_list = pd.date_range(start=dob + relativedelta(years=18), end= datetime.today(), periods=80).to_list()
        return(random.choice(dod_list))        
    else:
        dod_list = pd.date_range(start=datetime.today() - relativedelta(years=1), end= datetime.today(), periods=80).to_list()
        return(random.choice(dod_list))

def create_dod3(dob):
    if dob + timedelta(days=18*365.25) < datetime.today() - timedelta(days=3*365.25):
        dod_list = pd.date_range(start=dob + relativedelta(years=18), end= datetime.today()-relativedelta(years=1), periods=80).to_list()
        return(random.choice(dod_list))  
    else:
        dod_list = pd.date_range(start=datetime.today() - relativedelta(years=3), end= datetime.today()-relativedelta(years=1), periods=80).to_list()
        return(random.choice(dod_list))

# method which creates the dataframe with all the permutations of the categories of the search parameters

def get_random_sample(search_params):
    df = pd.DataFrame()
    gen_list = list(
        itertools.product(
            search_params['date_of_birth'],
            search_params['date_of_drivers_license'], 
            search_params['gender'], 
            search_params['nationality']))

    df = pd.DataFrame(gen_list, columns =['date_of_birth', 'date_of_drivers_license', 'gender', 'nationality'])

    # assign random date of birth for every age group
    df['date_of_birth'] = [random.choice(date_of_birth[x]) for x in df['date_of_birth']]

    # assign corresponding date of drivers license to age group:
    for x, dob in enumerate(df['date_of_birth']):
        if datetime.today() - dob < timedelta(days=19*365.25):
            if df['date_of_drivers_license'][x] == 0:
                df['date_of_drivers_license'][x] = create_dod2(dob)
            else:
                df.drop(index=x, inplace=True)
        elif datetime.today() - dob < timedelta(days=21*365.25):
            if df['date_of_drivers_license'][x] == 0:
                df['date_of_drivers_license'][x] = create_dod2(dob)
            elif df['date_of_drivers_license'][x] == 1:
                df['date_of_drivers_license'][x] = create_dod3(dob)
            else:
                df.drop(index=x, inplace=True)
        else:
            if df['date_of_drivers_license'][x] == 0:
                df['date_of_drivers_license'][x] = create_dod2(dob)
            elif df['date_of_drivers_license'][x] == 1:
                df['date_of_drivers_license'][x] = create_dod3(dob)
            elif df['date_of_drivers_license'][x] == 2:
                df['date_of_drivers_license'][x] = create_dod(dob)
    

    df['date_of_birth'] = pd.to_datetime(df['date_of_birth']).dt.date
    df['date_of_drivers_license'] = pd.to_datetime(df['date_of_drivers_license']).dt.date
    print(len(df))
    return df

df = get_random_sample(search_params)

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_of_drivers_license'][x] = create_dod2(dob)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_of_drivers_license'][x] = create_dod(dob)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_of_drivers_license'][x] = create_dod(dob)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_of_dr

147


Unnamed: 0,date_of_birth,date_of_drivers_license,gender,nationality
0,1997-11-07,2020-06-24,m,Schweiz
1,2001-03-03,2021-08-10,m,Italien
2,2003-08-02,2021-11-03,m,Portugal
3,2002-05-18,2021-12-26,m,Kosovo
4,2000-09-19,2022-07-14,m,Frankreich
...,...,...,...,...
145,1944-03-09,1971-09-14,f,Schweiz
146,1956-05-08,2010-03-12,f,Italien
147,1953-08-09,2005-04-24,f,Portugal
148,1937-11-28,1985-12-08,f,Kosovo


In [73]:
# check if all dates of birth and drivers license are at least 18 years appart
for x in range(0, len(df)):
    if df['date_of_drivers_license'].iloc[x] - df['date_of_birth'].iloc[x] < timedelta(days=18*365.25):
        print(df['date_of_birth'][x])
        print(df['date_of_drivers_license'][x])

In [77]:
# method which creates folder structure and randomly distributes the search parameters on the course participants.

def build_folders(df):
    try:
        shutil.rmtree('search_data')
    except:
        pass

    os.mkdir('search_data')

    names = ['Stephan', 'Nicolas', 'Vale', 'Samuel']

    for name in names:
        os.mkdir(f'search_data/{name}')

    df = df.sample(frac=1)

    df_split = np.array_split(df, 4)

    for i, df in enumerate(df_split):
        df = df.sort_values(by=['gender', 'nationality', 'date_of_birth', 'date_of_drivers_license'])
        for x, row in enumerate(df.iterrows()):
            folder = f'search_data/{names[i]}/search_{x}'
            os.mkdir(folder)
            df_new = df[x:x+1]
            df_new.to_csv(''.join([folder, f'/search_params{x}.csv']), index=False)

build_folders(df)