In [1]:
import pandas as pd
import numpy as np
import string
from pathlib import Path
import csv

seed = 42

n_users = 1_000_000

output_dir = Path("generated_data")
output_dir.mkdir(parents=True, exist_ok=True)

def save_file(data, file_name):
    with open(output_dir / file_name, "w") as f:
        csv.writer(f).writerows(data)
    
def generate_ids(array):
    return np.arange(1, array.shape[0] + 1)

def read_column_data(file_name):
    return pd.read_csv(file_name, names = ["line"])["line"].values

printable = set(string.printable)
def filter_not_printable(str_arr):
    return np.array([string for string in str_arr if not any(char for char in string if char not in printable)])

## Users

In [2]:
password = "$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6"

user_passwords = np.repeat(password, n_users)
user_passwords[:5]

array(['$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
       '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
       '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
       '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
       '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6'],
      dtype='<U60')

In [3]:
roles = "USER"

user_roles = np.repeat(roles, n_users)
user_roles[:5]

array(['USER', 'USER', 'USER', 'USER', 'USER'], dtype='<U4')

In [4]:
np.random.seed(seed)

chars = np.array(list(string.ascii_uppercase + string.digits))
login_min_length = 5
login_max_length = 15

def generate_login(*args):
    return "".join(
        np.random.choice(chars, size = np.random.randint(login_min_length, login_max_length))
    ) + str(int(args[0]))

user_logins = np.fromfunction(np.vectorize(generate_login), shape = (n_users,))
user_logins[:5]

array(['VBX3BU60', 'VY01POCGUIRDYN1', 'IZBT1G2', '8NQ9DBFD2RZ73',
       '9N4OHNWUPRXZY24'], dtype='<U20')

In [5]:
user_ids = generate_ids(user_logins)
user_ids[:5]

array([1, 2, 3, 4, 5])

In [6]:
user_header = np.array(["ID", "LOGIN", "PASSWORD", "ROLES"])
users = np.stack([
    user_ids,
    user_logins,
    user_passwords,
    user_roles
]).T
users = np.vstack([user_header, users])
users[:5]

array([['ID', 'LOGIN', 'PASSWORD', 'ROLES'],
       ['1', 'VBX3BU60',
        '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
        'USER'],
       ['2', 'VY01POCGUIRDYN1',
        '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
        'USER'],
       ['3', 'IZBT1G2',
        '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
        'USER'],
       ['4', '8NQ9DBFD2RZ73',
        '$2a$10$IXXF/ArqWDCv6Jnms79V.OTke3P.qCf1fVnqfOs/q2iWtyyeuzaz6',
        'USER']], dtype='<U60')

In [7]:
save_file(users, "user.csv")

## Cities

In [8]:
city_names = filter_not_printable(pd.unique(pd.read_csv("data/world-cities.csv")["name"]))
city_names[:5]

array(['les Escaldes', 'Andorra la Vella', 'Umm al Qaywayn',
       'Ras al-Khaimah', 'Dubai'], dtype='<U49')

In [9]:
city_ids = generate_ids(city_names)
city_ids[:5]

array([1, 2, 3, 4, 5])

In [10]:
city_header = np.array(["ID", "NAME"])
cities = np.stack([city_ids, city_names]).T
cities = np.vstack([city_header, cities])
cities[:5]

array([['ID', 'NAME'],
       ['1', 'les Escaldes'],
       ['2', 'Andorra la Vella'],
       ['3', 'Umm al Qaywayn'],
       ['4', 'Ras al-Khaimah']], dtype='<U49')

In [11]:
save_file(cities, "city.csv")

## Interests

In [12]:
interest_names = filter_not_printable(np.unique(np.char.lower(read_column_data("data/hobbies.csv").astype(str))))
interest_names[:5]

array([' abandoned animals', ' field', 'acting', 'action figures',
       'adventure park'], dtype='<U38')

In [13]:
interest_ids = generate_ids(interest_names)
city_ids[:5]

array([1, 2, 3, 4, 5])

In [14]:
interest_header = np.array(["ID", "NAME"])
interests = np.stack([interest_ids, interest_names]).T
interests = np.vstack([interest_header, interests])
interests[:5]

array([['ID', 'NAME'],
       ['1', ' abandoned animals'],
       ['2', ' field'],
       ['3', 'acting'],
       ['4', 'action figures']], dtype='<U38')

In [15]:
save_file(interests, "interest.csv")

## Genders

In [16]:
gender_names = np.array(["Male", "Female"])
gender_names[:5]

array(['Male', 'Female'], dtype='<U6')

In [17]:
gender_ids = generate_ids(gender_names)
gender_ids[:5]

array([1, 2])

In [18]:
gender_header = np.array(["ID", "NAME"])
genders = np.stack([gender_ids, gender_names]).T
genders = np.vstack([gender_header, genders])
genders[:5]

array([['ID', 'NAME'],
       ['1', 'Male'],
       ['2', 'Female']], dtype='<U21')

In [19]:
save_file(genders, "gender.csv")

## User Personal Details

In [20]:
get_second = np.vectorize(lambda r: r.strip().split(" ")[1].capitalize())
last_names = get_second(read_column_data("data/facebook-lastnames-withcount.txt"))
last_names[:5]

array(['Smith', 'Johnson', 'Jones', 'Williams', 'Brown'], dtype='<U30')

In [21]:
male_first_names = read_column_data("data/names/male.txt")
male_first_names[:5]

array(['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot'], dtype=object)

In [22]:
female_first_names = read_column_data("data/names/female.txt")
female_first_names[:5]

array(['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi'], dtype=object)

In [23]:
np.random.seed(seed)

n_male = np.random.randint(n_users/4, 3*n_users/4)
n_female = n_users - n_male

print(f"n = {n_users}, n_male = {n_male}, n_female = {n_female}")

n = 1000000, n_male = 371958, n_female = 628042


In [24]:
picked_genders = np.concatenate([np.full(n_male, 1), np.full(n_female, 2)])
picked_genders[:5]

array([1, 1, 1, 1, 1])

In [25]:
np.random.seed(seed)
picked_male_names = np.random.choice(male_first_names, n_male)

np.random.seed(seed)
picked_female_names = np.random.choice(female_first_names, n_female)

picked_first_names = np.concatenate([picked_male_names, picked_female_names])
picked_first_names[:5]

array(['Forbes', 'Jean-Francois', 'Hazel', 'Harald', 'Marmaduke'],
      dtype=object)

In [26]:
np.random.seed(seed)

picked_last_names = np.random.choice(last_names, n_users)
picked_last_names[:5]

array(['Wardyto', 'Hertam', 'Cattirisetti', 'Brockin', 'Folus'],
      dtype='<U30')

In [27]:
np.random.seed(seed)

picked_city_ids = np.random.choice(city_ids, n_users)
picked_city_ids[:5]

array([15796,   861,  5391, 11965, 11285])

In [28]:
np.random.seed(seed)

day_range = np.arange(35_000)
start_date = np.datetime64('1900-01-01')
def random_date(*args):
    random_date = start_date + np.random.choice(day_range)
    return random_date

user_birth_dates = np.fromfunction(np.vectorize(random_date), shape = (n_users,))
user_birth_dates[:5]

array(['1902-05-11', '1930-11-24', '1917-02-26', '1946-02-19',
       '1960-02-18'], dtype='datetime64[D]')

In [29]:
user_personal_details_header = np.array([
    "ID",
    "USER_ID",
    "FIRST_NAME",
    "LAST_NAME", 
    "BIRTH_DATE",
    "GENDER_ID",
    "CITY_ID"
])
user_personal_details = np.stack([
    user_ids,
    user_ids,
    picked_first_names,
    picked_last_names,
    user_birth_dates,
    picked_genders,
    picked_city_ids
]).T
user_personal_details = np.vstack([user_personal_details_header, user_personal_details])
user_personal_details[:5]

array([['ID', 'USER_ID', 'FIRST_NAME', 'LAST_NAME', 'BIRTH_DATE',
        'GENDER_ID', 'CITY_ID'],
       [1, 1, 'Forbes', 'Wardyto', datetime.date(1902, 5, 11), 1, 15796],
       [2, 2, 'Jean-Francois', 'Hertam', datetime.date(1930, 11, 24), 1,
        861],
       [3, 3, 'Hazel', 'Cattirisetti', datetime.date(1917, 2, 26), 1,
        5391],
       [4, 4, 'Harald', 'Brockin', datetime.date(1946, 2, 19), 1, 11965]],
      dtype=object)

In [30]:
save_file(user_personal_details, "user_personal_details.csv")

## Users Interests

In [31]:
np.random.seed(seed)

interests_cnts = np.arange(1, 15)

def user_to_interests(user_id):
    user_interests = np.unique(np.random.choice(interest_ids, np.random.choice(interests_cnts)))
    return np.stack([np.repeat(user_id, user_interests.shape[0]), user_interests]).T

user_interests_header = np.array(["USER_ID", "INTEREST_ID"])
user_interests = np.concatenate(list(map(user_to_interests, user_ids)))
user_interests = np.vstack([user_interests_header, user_interests])
user_interests[:5]

array([['USER_ID', 'INTEREST_ID'],
       ['1', '21'],
       ['1', '72'],
       ['1', '107'],
       ['1', '122']], dtype='<U21')

In [32]:
save_file(user_interests, "user_to_interest.csv")

## Followers

In [33]:
np.random.seed(seed)

followed_cnt = np.arange(0, 10)
def follwed_by_user(user_id):
    follwed_user_ids = np.unique(np.random.choice(user_ids, np.random.choice(followed_cnt)))
    return np.array(np.stack([np.repeat(user_id, follwed_user_ids.shape[0]), follwed_user_ids]).T)

followings_header = np.array(["FOLLOWER_ID", "FOLLOWED_ID"])
followings = np.concatenate(list(map(follwed_by_user, user_ids)))
followings = np.vstack([followings_header, followings])
followings[:5]

array([['FOLLOWER_ID', 'FOLLOWED_ID'],
       ['1', '110269'],
       ['1', '131933'],
       ['1', '259179'],
       ['1', '365839']], dtype='<U21')

In [34]:
save_file(followings, "follower.csv")

## Copy generated data to application

In [35]:
! cp generated_data/* ../social-network/src/main/resources/db/changelog/data/