#### Imports & Set-Up

In [34]:
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider
import numpy as np
from datetime import date
import requests
import json
import chart_studio.plotly as py
import cufflinks as cf
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

# Make Plotly work in your Jupyter Notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Use Plotly locally
cf.go_offline()

In [None]:
template_style = "seaborn"
pd.set_option("display.max_columns", None)

#### Creating DF [feat. Faker & Numpy]

Well, we are going to create a DF containing **name**, **age**, **birthdate**, **gender**, **nationality**, **height**, **weight**, **eye color**, **hair color**, and **blood group** for 1.000 people.
We're gonna use numpy and Faker to generate the data.

In [None]:
locales = ["az_AZ", "cs_CZ", "da_DK", "de_AT", "de_CH", "de_DE", "en", "en_AU", "en_CA", "en_GB", "en_IE", "en_IN", "en_NZ", "en_PH", "en_TH", "en_US", "et_EE", "fi_FI", "fr", "fr_CA", "fr_CH", "fr_FR", "ga_IE", "hr_HR", "hu_HU", "id_ID", "it_IT", "lt_LT", "lv_LV", "nl_NL", "no_NO", "pl_PL", "pt_BR", "pt_PT", "ro_RO", "sl_SI", "sv_SE", "tl_PH", "tr_TR"]
blood_groups = ['AB-', 'AB+', 'B-', 'O+', 'A+', 'O-', 'A-', 'B+']
columns = ["Name", "Birthdate", "Age", "Gender", "Blood_Group", "Height_cm", "Weight_lbs", "Eye_Color", "Hair_Color", "Nationality"]
eye_colors = ["Blue", "Green", "Brown", "Hazel", "Gray", "Amber", "Black"]
hair_colors = ["Blond", "Brown", "Auburn", "White", "Red", "Black"]

let's predefine some functions to make our data more realistic

- heigt and weight

In [None]:
# make height and weight more realistic based on age and gender
def realistic_height_weight(age: int, gender: str) -> dict:
    # for men (up to 200 cm) [weight in kg]
    gen_m = {"M", "U"}
    gen_f = {"F", "U"}
    if age <= 10 and gender in gen_m:
        h = np.random.randint(low=80, high=110)
        w = np.random.randint(low=10, high=25)
    if 10 < age <= 20 and gender in gen_m:
        h = np.random.randint(low=111, high=175)
        w = np.random.randint(low=26, high=60)
    if age > 20 and gender in gen_m:
        h = np.random.randint(low=161, high=200)
        w = np.random.randint(low=51, high=100)
    # for women (up to 180 cm)
    if age <= 10 and gender in gen_f:
        h = np.random.randint(low=80, high=110)
        w = np.random.randint(low=10, high=24)
    if 10 < age <= 20 and gender in gen_f:
        h = np.random.randint(low=111, high=160)
        w = np.random.randint(low=25, high=52)
    if age > 20 and gender in gen_f:
        h = np.random.randint(low=141, high=180)
        w = np.random.randint(low=41, high=80)
    return {
        "height": h,
        "weight": round(w * 2.20462262185, 2)}

- gender [with API]

In [None]:
# using Genderize.io API
endpoint = "https://api.genderize.io/?name="
def get_gender(name):
    response = requests.get(f"{endpoint}{name}").text
    return json.loads(response)["gender"][0].upper() if "male" in response else "U"

- name, birthdate, age, eye & hair color

In [None]:
def get_name(fake: dict) -> str:
    return f"{fake.first_name()} {fake.last_name()}"

def get_birthdate(fake: dict) -> date:
    return fake.date_of_birth(None, 12, 85)

def get_age(birthdate: date) -> int:
    return ((date.today() - birthdate) / 365.25).days

def get_attr(attr: list) -> str:
    return np.random.choice(attr)

In [None]:
exc_prefixes = ['PLT OFF', 'Mrs', 'doktor', 'CPL', 'RNDr.', 'PHRA', 'prof', 'CPO 1', 'ADM', 'rva', 'CAPT', 'CPO 3', 'PO 1', 'PFC', 'Dott.', 'MR', 'PHRAKHU SAMU', 'da', 'Prof.', 'PHRAMAHA', 'Sr.', 'LAC', 'FLG OFF', 'Xanım', 'Ms', 'S M 1', 'Rouva', 'tri', 'Mrs.', 'GP CAPT', 'S M 3', 'PHRAPALAD', 'prof.', 'Sutan', 'Hj.', 'PHRAKHU PALAD', 'Univ.Prof.', 'MAJ', 'MISS', 'POL SGT MAJ', 'S M 2', 'POL GEN', 'POL SUB LT', 'pan', 'SEA-MAN', 'Tohtori', 'do', 'Ms.', 'Sra.', 'Dt.', 'PHRA ATHIKAN', 'Hr', 'Le', 'Tgk.', 'Mr.', 'PO 3', 'Ing.', 'POL SGT', 'POL MAJ', 'PHRABAIDIKA', 'PHRAKHU BAIDIKA', 'PVT', 'pani', 'dr.', 'SAMANERA', 'PHRASAMU', 'LT GEN', 'Cut', 'POL CAPT', 'Mgr.', 'R ADM', 'PHRAKHU DHAMMADHORN', 'POL LT GEN', 'FS 1', 'AM', 'du', 'Drs.', 'R.A.', 'FS 3', 'Bc.', 'POL CPL', 'CPO 2', 'WG CDR', 'GEN', 'Puti', 'POL L/C', 'Cənab', 'AVM', 'POL LT COL', 'drg.', 'POL COL', 'POL LT', 'proua', 'MAJ GEN', 'Sig.', 'Bay', 'Müəllim', 'KH.', 'Bəy', 'LT JG', 'AMN', 'Herr', 'Fru', 'Mag.', 'arkkit.', 'Dipl.-Ing.', 'de', 'POL MAJ GEN', 'POL CONST', 'hr', 'R.', 'hra', 'LT COL', 'FS 2', 'slečna', 'Misc.', 'LT', 'L CDR', 'Mx.', 'Frau', 'Dr', 'REV', 'H.', 'JUDr.', 'SGT', 'COL', 'T.', 'Ir.', 'Dr.', 'Ind.', 'Herra', 'M R', 'Bayan', 'CHAO ATHIKAN', 'paní', 'Sig.ra', 'POL SEN SGT MAJ', 'PHRAKHU VINAIDHORN', 'Srta.', 'CDR', 'de la', 'MUDr.', 'SQN LDR', 'PO 2', 'FLT LT', 'R.M.', 'V ADM', 'Dra.', 'ACM', 'Mr', 'M L', 'Miss', 'SUB LT', 'dr', 'härra', 'Prof. Dr.', 'pr']
exc_suffixes = ['S.H.', 'DVM', 'PhD', 'S.E.', 'MD', 'B.Eng.', 'DI', 'M.Farm', 'M.Kom.', 'Ph.D.', 'IV', 'M.Pd', 'S.Sos', 'MSc', 'II', 'Th.D.', 'M.TI.', 'S.Pd', 'BSc', 'S.I.Kom', 'S.Farm', 'S.Kom', 'CSc.', 'S.T.', 'III', 'S.E.I', 'Jr.', 'DiS.', 'S.Gz', 'S.IP', 'S.Psi', 'V', 'M.Ak', 'MBA.', 'S.Ked', 'M.M.', 'B.A.', 'S.Pt', 'DDS', 'B.Sc.']

finally... the actual DF

In [None]:
# define [HUGE] function to generate the DF
df = pd.DataFrame(columns=columns)
def data(size: int) -> pd.DataFrame:
    for i in range(size):
        fake = Faker(locales)
        name = get_name(fake)
        birthdate = get_birthdate(fake)
        age = get_age(birthdate)
        gender = get_attr(["M", "F"])
        h_w = realistic_height_weight(age, gender)
        df.loc[i] = [name,
                     birthdate,
                     age,
                     gender,
                     get_attr(blood_groups),
                     h_w["height"],
                     h_w["weight"],
                     get_attr(eye_colors),
                     get_attr(hair_colors),
                     "NA"]
        print(i+1, end=" ")
    return df

In [None]:
# function call
rows = 100
df_ert = data(size=rows)

In [None]:
df_ert.info()

- let's accomodate the data types

In [None]:
df_ert = df_ert.astype({
    "Birthdate": np.datetime64,
    "Age": np.uint8,
    "Height_cm": np.uint8,
    "Weight_lbs": np.float16})

In [None]:
df_ert.info()

In [33]:
df_ert.head(10)

Unnamed: 0,Name,Birthdate,Age,Gender,Blood_Group,Height_cm,Weight_lbs,Eye_Color,Hair_Color,Nationality
0,Egbert Täsche,2003-12-16,18,F,AB+,149,70.5625,Green,White,
1,Irina James,1966-10-24,55,F,O-,172,145.5,Black,Blond,
2,Kazimierz Archer,1983-09-16,38,F,AB-,162,141.125,Green,Brown,
3,Linde Huber,1956-03-22,66,M,AB+,179,147.75,Blue,Auburn,
4,Sandis Zielke,1983-06-28,38,M,O+,194,180.75,Blue,Red,
5,Marina Kohler,1938-02-04,84,M,B-,198,180.75,Brown,Auburn,
6,H.-Dieter Korutürk,1995-04-28,27,M,AB-,191,202.875,Gray,Brown,
7,Helene Deladoëy,1946-09-14,75,F,O+,175,121.25,Black,Red,
8,Romana Matinawin,1962-10-01,59,F,O+,169,163.125,Amber,Brown,
9,Petr Lūsis,1966-10-18,55,F,A+,172,116.8125,Gray,Blond,


> #### Now that we have our DF (and a nice model for future application), let's go ahead and analyse it !