Constantes: tamanho do Mock e instância do Faker


In [None]:
from faker import Faker
# Number of mock users generated by the notebook.
USER_COUNT = 105_231
# Single Faker instance shared by the cells that need fake values.
fake = Faker()

# Funções usadas ao longo do Notebook

Gerar dia válido dentro de um determinado mês. Prioriza primeiros dias.

In [None]:
import numpy as np
import calendar
from scipy.stats import expon

def generate_days(
        month: tuple[int, int],
        size: int,
        start_day: int=1
    ) -> np.ndarray | None:
    _, last_day = calendar.monthrange(*month)
    if start_day == last_day:
        return np.full((size,), start_day)
    elif start_day > last_day:
        return None
    available_days = np.arange(start=start_day, stop=last_day + 1)
    weights = np.copy(available_days)
    weights = weights / np.sum(weights)
    weights = expon.pdf(weights)
    weights = weights / np.sum(weights)
    return np.random.choice(available_days, size=size, p=weights)

# Dados de Usuário


Iremos gerar um DataFrame Pandas para conter informações de usuário, assim como o endereço do mesmo.


In [None]:
import pandas as pd

# One UUID per mock user; every other column starts out as None.
users = pd.DataFrame(
    dict(
        id=[fake.uuid4() for _ in range(USER_COUNT)],
        **dict.fromkeys(
            ('created_at', 'birthday', 'city', 'state', 'country')),
    )
)
display(users)

Gerando _created_at_ (omitindo _updated_at_ pois ele não aparece em nenhum lugar do relatório)


Utilizando distribuição ponderada: o mês 4 tem o maior peso, seguido pelos meses 2, 3 e 6 (os pesos recebem um pequeno ruído aleatório antes de serem normalizados).


In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Months of 2023 in which users are created, with their base sampling weights.
available_months = np.array([1, 2, 3, 4, 5, 6])
weights_months = np.array([1/6, 1/5, 1/5, 1/4, 1/7, 1/5])
# Jitter the weights with a little Gaussian noise, then renormalise them
# so they sum to 1 again.
weights_months = weights_months + \
    np.random.normal(0, 0.0125, size=weights_months.shape)
weights_months = weights_months / np.sum(weights_months)
months = np.random.choice(available_months, p=weights_months, size=USER_COUNT)
# minlength guarantees one count per available month even when the highest
# months were never drawn; [1:] drops the unused slot for month 0.
month_counts = np.bincount(months, minlength=available_months.max() + 1)[1:]
accumulated_month_counts = np.cumsum(month_counts)
plt.hist(months)
plt.show()

Utilizando distribuição ponderada, priorizando dias iniciais do mês


Horas não são aleatórias: priorizar horários de pico


In [None]:
import numpy as np
from scipy.stats import skewnorm


def gen_random_minutes(size):
    """Draw ``size`` minute-of-day values in ``[0, 1440)``.

    Values come from a skew-normal distribution (``a=-0.5``, ``loc=960``,
    ``scale=240``) rounded to whole minutes. Any value that falls outside
    the day is redrawn until every entry is valid.

    Args:
        size: Number of values to draw.

    Returns:
        Integer array of length ``size`` with values from 0 to 1439.
    """
    def draw(count):
        samples = skewnorm.rvs(a=-0.5, loc=960, scale=240, size=count)
        return np.round(samples, 0).astype(int)

    minutes = draw(size)
    while True:
        outside = np.flatnonzero((minutes < 0) | (minutes >= 1440))
        if outside.size == 0:
            break
        # Replacements are rounded before being stored: writing raw floats
        # into the integer array would truncate them toward zero, letting
        # values in (-1, 0) slip through as minute 0.
        minutes[outside] = draw(outside.size)
    return minutes


# Split each minute-of-day value into an hour and a minute-of-hour.
hours, minutes = np.divmod(gen_random_minutes(USER_COUNT), 60)

Adicionando created_at


In [None]:
from datetime import datetime
import numpy as np

# Fill created_at block by block: the first month_counts[0] rows get the
# first month, the next block the second month, and so on.
current_idx = 0
for month, user_count_month in zip(available_months, month_counts):
    block_end = current_idx + user_count_month
    random_days = generate_days((2023, month), size=user_count_month)
    c_at_array = np.empty((user_count_month,), dtype=object)
    # Day comes from generate_days, hour/minute from the precomputed
    # arrays, and the second is drawn on the spot.
    c_at_array[:] = [
        datetime(
            2023, month, day, hour, minute, np.random.randint(0, 60)
        ).isoformat()
        for day, hour, minute in zip(
            random_days,
            hours[current_idx:block_end],
            minutes[current_idx:block_end])
    ]
    pd_idx = users.index[current_idx:block_end]
    users.loc[pd_idx, 'created_at'] = c_at_array
    current_idx = block_end

Gerando datas de nascimento


Ano será escolhido através de uma distribuição que prioriza fim dos anos 90 e anos 2000


In [None]:
from matplotlib import pyplot as plt
from scipy.stats import skewnorm

alpha = -1.7
loc = 2000
scale = 8
# Birth years follow a skew-normal distribution, rounded to whole years.
random_years = np.round(
    skewnorm.rvs(a=alpha, loc=loc, scale=scale, size=USER_COUNT),
    0).astype(int)
# Redraw every year after 2005 until none remain.
while True:
    outside_interval = np.argwhere(random_years > 2005).flatten()
    if (outside_interval.size == 0):
        break
    # Round the redrawn values before storing them: assigning raw floats
    # to the integer array would truncate (e.g. 2005.7 -> 2005) and so
    # accept values that rounding would have rejected.
    random_years[outside_interval] = np.round(
        skewnorm.rvs(
            a=alpha, loc=loc, scale=scale, size=len(outside_interval)),
        0).astype(int)
# Histogram of birth years: year on the x axis, user count on the y axis
minimum_year = np.min(random_years)
maximum_year = np.max(random_years)
print(f'Ano mínimo: {minimum_year}')
print(f'Ano máximo: {maximum_year}')
# One bin per year: the edges must run up to maximum_year + 1, otherwise
# the latest year falls outside the histogram.
plt.hist(random_years, bins=range(minimum_year, maximum_year + 2))
plt.show()

Mês e dia são completamente randômicos, utiliza distribuição uniforme


In [None]:
# Pick one random date inside each user's birth year (Jan 1 - Dec 31),
# formatted as YYYY-MM-DD.
random_dates_with_year = np.array(
    [
        fake.date_between(
            start_date=datetime(year, 1, 1),
            end_date=datetime(year, 12, 31),
        ).strftime('%Y-%m-%d')
        for year in random_years
    ],
    dtype=object,
)
users['birthday'] = random_dates_with_year

Ordenando DataFrame de forma que created_at seja crescente


In [None]:
# Order users by created_at (ascending) and renumber the rows from 0.
users = users.sort_values(by='created_at').reset_index(drop=True)
display(users)

# Endereços


Queremos mudar a distribuição de endereços com base na data do onboarding. Portanto, é interessante antes criar uma função que recebe alguns parâmetros, como:

- Importância da população para decidir a cidade
- Presença de cidades fora do Brasil?
- etc


## Cidades do Brasil

Utilizando o .csv de municípios disponível [aqui](http://blog.mds.gov.br/redesuas/wp-content/uploads/2018/06/Lista_Munic%C3%ADpios_com_IBGE_Brasil_Versao_CSV.csv)


In [None]:
# Renomeie o CSV
municipios = pd.read_csv('municipios.csv', sep=';')
# Ignorando duas colunas
municipios = municipios.drop(columns=[
    'ConcatUF+Mun',
    'Unnamed: 9']).sort_values(by=['População 2010'], ascending=False)
# Ignorando munícipios sem informação de população
municipios = municipios.dropna(subset=['População 2010'])
municipios = municipios.reset_index(drop=True)
display(municipios.head(5))
# Transformando Presidente Prudente em uma cidade com 2M de habitantes
# Só pelo lulz
municipios.loc[
    municipios['Município'] == 'Presidente Prudente',
    'População 2010'
  ] = 2000000
display(municipios[municipios['Município'] == 'Presidente Prudente'])

## Cidades fora do Brasil

Utilizando csv que pode ser encontrado [aqui](https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.76.zip)


In [None]:
world_cities = (
    pd.read_csv('worldcities.csv')
    # Skip cities without population data
    .dropna(subset=['population'])
    # Skip Brazilian cities
    .loc[lambda frame: frame['country'] != 'Brazil']
    .reset_index(drop=True)
)
display(world_cities.head(5))

Países da América Latina


In [None]:
# Country names treated as Latin America when weighting foreign cities.
latin_america_countries = [
    'Argentina', 'Bolivia', 'Chile', 'Colombia',
    'Ecuador', 'Paraguay', 'Peru', 'Uruguay',
]

Finalmente, preparando a função


In [None]:
def generate_random_cities(size, population_importance=1.6, non_brazilian_percent=0.0, latin_america_importance=10.0):
    """Sample ``size`` (city, state, country) rows weighted by population.

    Brazilian rows are drawn from ``municipios`` and the rest from
    ``world_cities``. Each city is weighted by its population raised to
    ``population_importance``. For countries in
    ``latin_america_countries`` that weight is raised again to
    ``latin_america_importance``.

    Args:
        size: Total number of rows to return.
        population_importance: Exponent applied to city populations.
        non_brazilian_percent: Fraction of rows taken from outside Brazil.
        latin_america_importance: Extra exponent for Latin American cities.

    Returns:
        DataFrame with columns ``city``, ``state`` and ``country``.
        Brazilian rows come first, followed by the non-Brazilian rows,
        whose ``state`` is an empty string.
    """
    def as_frame(city, state, country):
        frame = pd.DataFrame(columns=['city', 'state', 'country'])
        frame['city'] = city
        frame['state'] = state
        frame['country'] = country
        return frame

    n_brazil = int(size * (1.0 - non_brazilian_percent))
    n_abroad = size - n_brazil

    # Brazilian cities
    br_power = np.power(municipios['População 2010'], population_importance)
    br_weights = br_power / np.sum(br_power)
    br_picks = np.random.choice(
        municipios.index, p=br_weights, size=n_brazil)
    brazil_rows = as_frame(
        municipios.loc[br_picks, 'Município'].values,
        municipios.loc[br_picks, 'UF'].values,
        'Brazil')

    # Non-Brazilian cities
    abroad_power = np.power(world_cities['population'], population_importance)
    is_latin = world_cities['country'].isin(latin_america_countries)
    abroad_power[is_latin] = np.power(
        abroad_power[is_latin], latin_america_importance)
    abroad_weights = abroad_power / np.sum(abroad_power)
    abroad_picks = np.random.choice(
        world_cities.index, p=abroad_weights, size=n_abroad)
    abroad_rows = as_frame(
        world_cities.loc[abroad_picks, 'city'].values,
        '',
        world_cities.loc[abroad_picks, 'country'].values)

    # Brazilian rows first, then the foreign ones, renumbered from 0
    return pd.concat([brazil_rows, abroad_rows]).reset_index(drop=True)

Primeiros 2 meses: focar nas cidades mais populosas do Brasil (expoente de população alto)


In [None]:
# Row range covering the first two months of month_counts
# (assumes the users rows are ordered by created_at).
start_idx = 0
end_idx = accumulated_month_counts[1]
count = end_idx - start_idx
# A high population exponent concentrates the draw on the largest cities.
cities = generate_random_cities(count, population_importance=2.5)
target_rows = users.index[start_idx: end_idx]
users.loc[target_rows, ['city', 'state', 'country']] = cities.values

2 próximos meses: cidades brasileiras com menor população


In [None]:
# Row range covering the third and fourth months of month_counts
# (assumes the users rows are ordered by created_at).
start_idx = accumulated_month_counts[1]
end_idx = accumulated_month_counts[3]
count = end_idx - start_idx
# A lower population exponent spreads the draw over smaller cities.
cities = generate_random_cities(count, population_importance=1.5)
target_rows = users.index[start_idx: end_idx]
users.loc[target_rows, ['city', 'state', 'country']] = cities.values

2 últimos meses: cidades no exterior


In [None]:
# Row range covering the remaining months of month_counts
# (assumes the users rows are ordered by created_at).
start_idx = accumulated_month_counts[3]
end_idx = accumulated_month_counts[-1]
count = end_idx - start_idx
# 15% of these rows come from outside Brazil.
cities = generate_random_cities(
    count,
    population_importance=1.0,
    non_brazilian_percent=0.15,
    latin_america_importance=4.0)
target_rows = users.index[start_idx: end_idx]
users.loc[target_rows, ['city', 'state', 'country']] = cities.values

Resultado final


In [None]:
# Frequency tables for the generated addresses, coarsest level first.
for column in ('country', 'state', 'city'):
    display(users[column].value_counts())

# Salvando dados de usuário em CSV


In [None]:
# Write the users table to users.csv without the row index.
users.to_csv(path_or_buf='users.csv', index=False)

# Transferências


## Função que gera transferências com determinada distribuição de valores e horas


In [None]:
import numpy as np
from datetime import datetime, timedelta
import pandas as pd
import calendar


def generate_transfers(
        user_ids,
        transfer_quantity,
        loc=50,
        scale=25,
        num_outliers=0
    ):
    """Build ``transfer_quantity`` random transfers between the given users.

    Args:
        user_ids: Pair ``(ids_from, ids_to)`` of candidate sender and
            receiver ids; each side is sampled with ``np.random.choice``.
        transfer_quantity: Number of transfers to generate.
        loc: Mean of the normal distribution for regular values.
        scale: Standard deviation of that distribution.
        num_outliers: How many of the transfers get a value drawn from
            a normal distribution with mean 3000 and deviation 1000.

    Returns:
        DataFrame with columns ``id_from``, ``id_to``, ``hour``,
        ``minute``, ``scheduled_date`` (left unfilled here), ``value``
        and ``status``.
    """
    senders_pool, receivers_pool = user_ids

    # Regular values first, outliers appended at the end; absolute value
    # keeps everything non-negative and values are rounded to cents.
    regular_values = np.random.normal(
        loc=loc, scale=scale, size=transfer_quantity - num_outliers)
    outlier_values = np.random.normal(
        loc=3000, scale=1000, size=num_outliers)
    values = np.round(
        np.abs(np.concatenate((regular_values, outlier_values))), 2)

    status = np.random.choice(
        ['FAILED', 'DONE', 'SCHEDULED'],
        p=[0.001, 0.6, 0.399], size=transfer_quantity)

    # Senders and receivers are chosen independently of each other.
    chosen_senders = np.random.choice(senders_pool, size=transfer_quantity)
    chosen_receivers = np.random.choice(receivers_pool, size=transfer_quantity)

    # Time of day of each transfer, split into hour and minute.
    hours, minutes = np.divmod(gen_random_minutes(transfer_quantity), 60)

    transfers = pd.DataFrame(
        columns=[
            'id_from',
            'id_to',
            'hour',
            'minute',
            'scheduled_date',
            'value',
            'status'
        ]
    )
    transfers['id_from'] = chosen_senders
    transfers['id_to'] = chosen_receivers
    transfers['value'] = values
    transfers['status'] = status
    transfers['hour'] = hours
    transfers['minute'] = minutes
    return transfers


def decide_transfers_dates(transfers: pd.DataFrame):
    """Give every transfer a 2023 timestamp later than both users' sign-ups.

    The ``created_at`` values of the sender and the receiver are looked up
    in the module-level ``users`` frame, and the later of the two is the
    lower bound. A month between the lower bound's month and July is drawn,
    then a day via ``generate_days``, and the transfer's own ``hour`` and
    ``minute`` are combined with a random second.

    Args:
        transfers: Frame with ``id_from``, ``id_to``, ``hour`` and
            ``minute`` columns. It is not modified.

    Returns:
        A copy of ``transfers`` without ``hour``/``minute`` and with a new
        ``time`` column of ISO-8601 strings, in the same row order.

    Raises:
        ValueError: If no date after the lower bound could be produced.
    """
    new_transfers = transfers.drop(columns=['hour', 'minute'])
    if len(transfers) == 0:
        # Nothing to date: return the empty frame with the output columns.
        new_transfers['time'] = np.array([], dtype=object)
        return new_transfers

    # Position-preserving lookup of both users' created_at values.
    created_at = users.set_index('id')['created_at']
    sender_created = transfers['id_from'].map(created_at)
    receiver_created = transfers['id_to'].map(created_at)

    def gen_month_day(hour, minute, lower_bound):
        """Pick (day, month) so the 2023 date at hour:minute is after lower_bound."""
        # Month is drawn between the lower bound's month and July, inclusive.
        month = np.random.randint(lower_bound.month, 8)
        if month == 7:
            # Transfers drawn into July are always placed on the 1st.
            return 1, month
        start_day = lower_bound.day if month == lower_bound.month else 1
        if datetime(2023, month, start_day, hour, minute) <= lower_bound:
            # Same day but not later than the lower bound: start from the
            # next day, rolling over to the next month when needed.
            start_day += 1
            if start_day > calendar.monthrange(2023, month)[1]:
                month += 1
                start_day = 1
        day = int(generate_days((2023, month), 1, start_day=start_day)[0])
        return day, month

    times = []
    for hour, minute, sent, received in zip(
            transfers['hour'], transfers['minute'],
            sender_created, receiver_created):
        hour, minute = int(hour), int(minute)
        # ISO-8601 strings with the same layout compare chronologically,
        # so max() picks the later sign-up.
        lower_bound = datetime.fromisoformat(max(sent, received))
        day, month = gen_month_day(hour, minute, lower_bound)
        if datetime(2023, month, day, hour, minute) <= lower_bound:
            raise ValueError(
                f'no transfer date available after {lower_bound.isoformat()}')
        times.append(
            datetime(
                2023, month, day, hour, minute, np.random.randint(0, 60)
            ).isoformat())
    new_transfers['time'] = times
    return new_transfers

In [None]:
def age_from_birthday(birthday_str, today=None):
    """Return the age in completed years for an ISO-format birthday.

    Args:
        birthday_str: Birthday string accepted by
            ``datetime.fromisoformat``.
        today: Reference date or datetime; defaults to ``datetime.now()``.

    Returns:
        Whole years elapsed between the birthday and ``today``.
    """
    birthday = datetime.fromisoformat(birthday_str)
    if today is None:
        today = datetime.now()
    # Compare (month, day) instead of dividing a day count by 365: the
    # division ignores leap days and would report the next age a few days
    # before the actual birthday.
    had_birthday = (today.month, today.day) >= (birthday.month, birthday.day)
    return today.year - birthday.year - (0 if had_birthday else 1)


# Derive each user's age from the birthday string.
users['age'] = users['birthday'].map(age_from_birthday)
display(users)

Usuários na faixa de 18-25: valores mais baixos


In [None]:
# Boolean mask of the users aged 18 to 25 (inclusive).
user_subpopulation = users['age'].between(18, 25)
# .sum() counts the True entries; .count() would count every non-null row,
# i.e. the whole user base instead of this age band.
subpopulation_size = int(user_subpopulation.sum())
# 60% of the band is treated as active, with 5 transfers per active user.
active_sample_size = int(0.6 * subpopulation_size)
active_users_ids = np.random.choice(users[user_subpopulation]['id'], size=active_sample_size)
transfers_amount = 5 * active_sample_size
# Senders come from the active sample, receivers from all users;
# 1% of the transfers are high-value outliers.
transfers_sample = generate_transfers(
    (active_users_ids, users['id']), transfers_amount, loc=80, scale=50, num_outliers=int(transfers_amount * 0.01))

In [None]:
# Swap the hour/minute columns for a full timestamp per transfer.
transfers_sample = decide_transfers_dates(transfers=transfers_sample)
display(transfers_sample)

In [None]:
from datetime import datetime

def generate_scheduled_date(status, time):
    """Return a scheduled date for ``SCHEDULED`` transfers, else ``None``.

    Args:
        status: Transfer status string.
        time: ISO-format timestamp of the transfer.

    Returns:
        For ``'SCHEDULED'``, the result of ``fake.date_between`` called
        with the transfer time and 2023-07-31; ``None`` for any other
        status.
    """
    if status != 'SCHEDULED':
        return None
    earliest = datetime.fromisoformat(time)
    latest = datetime(2023, 7, 31)
    return fake.date_between(earliest, latest)

generate_scheduled_date_vectorized = np.vectorize(generate_scheduled_date)
# Stack status and time as two rows so each row feeds one argument.
input_matrix = np.array([
    transfers_sample['status'],
    transfers_sample['time']
])
status_row, time_row = input_matrix
scheduled = generate_scheduled_date_vectorized(status_row, time_row)
transfers_sample['scheduled_date'] = scheduled
display(transfers_sample)

In [None]:
# Start the combined table from this cohort. ignore_index gives a fresh
# 0..n-1 index without the extra 'index' column that reset_index() adds.
transfers = pd.concat([transfers_sample], ignore_index=True)

Usuários na faixa de 26-60: valores mais altos


In [None]:
# Boolean mask of the users aged 26 to 60 (inclusive).
user_subpopulation = users['age'].between(26, 60)
# .sum() counts the True entries; .count() would count every non-null row,
# i.e. the whole user base instead of this age band.
subpopulation_size = int(user_subpopulation.sum())
# 60% of the band is treated as active, with 5 transfers per active user.
active_sample_size = int(0.6 * subpopulation_size)
active_users_ids = np.random.choice(users[user_subpopulation]['id'], size=active_sample_size)
transfers_amount = 5 * active_sample_size
# Senders come from the active sample, receivers from all users;
# 1% of the transfers are high-value outliers.
transfers_sample = generate_transfers(
    (active_users_ids, users['id']), transfers_amount, loc=500, scale=200, num_outliers=int(transfers_amount * 0.01))
transfers_sample = decide_transfers_dates(transfers_sample)
# Fill scheduled_date element-wise from each transfer's status and time.
generate_scheduled_date_vectorized = np.vectorize(generate_scheduled_date)
input_matrix = np.array([
    transfers_sample['status'],
    transfers_sample['time']
])
scheduled = generate_scheduled_date_vectorized(*input_matrix)
transfers_sample['scheduled_date'] = scheduled

In [None]:
# Stack the new cohort below the existing rows (axis=0); axis=1 would
# place the cohorts side by side as duplicate columns. ignore_index
# renumbers the rows without adding an 'index' column.
transfers = pd.concat([transfers, transfers_sample], axis=0, ignore_index=True)
display(transfers)

Usuários na faixa de 61+: valores mais baixos


In [None]:
# Boolean mask of the users aged 61 or more.
user_subpopulation = users['age'].between(61, 999)
# .sum() counts the True entries; .count() would count every non-null row,
# i.e. the whole user base instead of this age band.
subpopulation_size = int(user_subpopulation.sum())
# 25% of the band is treated as active, with 3 transfers per active user.
active_sample_size = int(0.25 * subpopulation_size)
active_users_ids = np.random.choice(users[user_subpopulation]['id'], size=active_sample_size)
transfers_amount = 3 * active_sample_size
# Senders come from the active sample, receivers from all users;
# 0.1% of the transfers are high-value outliers.
transfers_sample = generate_transfers(
    (active_users_ids, users['id']), transfers_amount, loc=200, scale=80, num_outliers=int(transfers_amount * 0.001))
if transfers_amount > 0:
    transfers_sample = decide_transfers_dates(transfers_sample)
    # Fill scheduled_date element-wise from each transfer's status and time.
    generate_scheduled_date_vectorized = np.vectorize(generate_scheduled_date)
    input_matrix = np.array([
        transfers_sample['status'],
        transfers_sample['time']
    ])
    scheduled = generate_scheduled_date_vectorized(*input_matrix)
    transfers_sample['scheduled_date'] = scheduled
else:
    # This age band can be empty, and np.vectorize rejects size-0 input,
    # so only give the empty frame the same columns as the dated cohorts.
    transfers_sample = transfers_sample.drop(columns=['hour', 'minute'])
    transfers_sample['time'] = None

In [None]:
# Stack the new cohort below the existing rows (axis=0); axis=1 would
# place the cohorts side by side as duplicate columns. ignore_index
# renumbers the rows without adding an 'index' column, which a repeated
# reset_index() eventually refuses to insert.
transfers = pd.concat([transfers, transfers_sample], axis=0, ignore_index=True)
display(transfers)