# Use Faker to Generate Common Data <a class="tocSkip">

https://faker.readthedocs.io/en/master/index.html

In [39]:
!pip install faker



In [40]:
!faker --version

faker 8.12.0


## Dynamic Way to Call Function with Parameters

In [1]:
from faker import Faker

fake = Faker()

Load function parameters from JSON object

In [57]:
import json
param = {"start_date":'-30y', "end_date":'today'}
param_str = json.dumps(param)
param_str
param_dict = json.loads(param_str)
param_dict

{'start_date': '-30y', 'end_date': 'today'}

Use function name to call a function.

In [80]:
# Name of the function
func_name = 'date_between'

# Make sure object has the function 
if not hasattr(fake, func_name):
    raise Exception(f'Invalid attribute name {func_name} for object {fake}')

method_to_call = getattr(fake, func_name)
# Make sure the attribute is callable
if not callable(method_to_call):
    raise Exception(f'Invalid function {method_to_call} for object {fake}')

result = method_to_call(**param_dict)
result.strftime('%Y-%m-%d')

'1993-02-15'

### List all Methods of an Object

In [81]:
from inspect import signature

In [98]:
sig = signature(fake.date_between)

In [104]:
sig.parameters

mappingproxy({'start_date': <Parameter "start_date='-30y'">,
              'end_date': <Parameter "end_date='today'">})

In [115]:
[(v.name, v.default) for k,v in sig.parameters.items()]

[('start_date', '-30y'), ('end_date', 'today')]

In [95]:
sig.from_function()

## Basic Faker Functions

In [33]:
from faker import Faker

In [20]:
# initialize a generator
fake = Faker()
#create some fake data
print(fake.name())
print(fake.date_between(start_date='-30y', end_date='today'))
print(fake.color_name())

Spencer Strickland
1997-07-26
Moccasin


In [21]:
fake.profile()

{'job': 'Educational psychologist',
 'company': 'Hart Ltd',
 'ssn': '127-40-2824',
 'residence': '58029 Alejandro Crescent Apt. 145\nCarolside, NM 56860',
 'current_location': (Decimal('86.164442'), Decimal('-63.944189')),
 'blood_group': 'O-',
 'website': ['https://craig.com/'],
 'username': 'kallen',
 'name': 'William Anderson',
 'sex': 'M',
 'address': '36533 Bell Island\nReginaland, AK 44147',
 'mail': 'reeveskelly@gmail.com',
 'birthdate': datetime.date(1963, 11, 1)}

In [22]:
[fake.color_name() for x in range(4)]

['DimGray', 'LightSkyBlue', 'Bisque', 'DarkGray']

Use Faker to create objects with different attributes

In [25]:
fake_workers = [
    {'Worker Name':fake.name(), 
     'Hire Date':fake.date_between(start_date='-30y', end_date='today')
    } for x in range(5)]
                 
fake_workers

[{'Worker Name': 'Krystal James', 'Hire Date': datetime.date(2001, 6, 17)},
 {'Worker Name': 'James Parrish', 'Hire Date': datetime.date(2001, 9, 12)},
 {'Worker Name': 'Amanda Stein', 'Hire Date': datetime.date(2021, 7, 28)},
 {'Worker Name': 'John Riggs', 'Hire Date': datetime.date(1994, 4, 8)},
 {'Worker Name': 'Julia Lewis', 'Hire Date': datetime.date(1994, 3, 18)}]

### Random Generation of Data with Various Probability

Use `numpy.random.choice()` for sampling of data with different frequencies.


In [27]:
import numpy as np

In [29]:
colors = [fake.color_name() for x in range(4)]

Randomly select items from `colors` with different probability for each of the 4 items.

In [30]:
[np.random.choice(colors, p=[0.1, 0.6, 0.1, 0.2]) for x in range(10)]

['Lavender',
 'Lavender',
 'LightBlue',
 'LightBlue',
 'Lavender',
 'DarkCyan',
 'DarkCyan',
 'LightBlue',
 'Lavender',
 'Lavender']

## Community Provider - SG NRIC


https://en.wikipedia.org/wiki/National_Registration_Identity_Card

### Provider

In [97]:
from faker.providers import BaseProvider
from faker import Faker
from datetime import datetime
from dateutil.relativedelta import relativedelta


class SgNricProvider(BaseProvider):
    """
    A Faker provider for Singapore NRIC (National Registration Identity Card) Number
    """

    def __init__(self, generator):
        super().__init__(generator)
        self.fake = Faker()
    
    
    @staticmethod
    def first_characters():
        """
        Return list of possible first character.        
        """
        return {
            "S": "Singapore citizens and permanent residents born before 1 January 2000.",
            "T": "Singapore citizens and permanent residents born on or after 1 January 2000",
            "F": "Foreigners issued with long-term passes before 1 January 2000", 
            "G": "Foreigners issued with long-term passes from 1 January 2000 to 31 December 2021",
            "M": "Foreigners issued with long-term passes on or after 1 January 2022"
        }
            
    @classmethod
    def checksum(cls, nric: str)->str:
        """
        Generate checksum from 7 digits
        """
        LETTERS = {
         1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E',
         6: 'F', 7: 'G', 8: 'H', 9: 'I', 10: 'Z', 11: 'J'
        }
        if len(nric) != 7 or (not nric.isdigit()):
            raise Exception('NRIC must be 7 digits')

        s = list(nric)
        weights = [2, 7, 6, 5, 4, 3, 2]
        sum = 0
        for i in range(len(weights)):
            sum = sum + int(s[i]) * weights[i]
            r = sum % 11
            c = 11 - r
        return LETTERS[c]

    
    def sg_nric_type_s(self, min_age=None, max_age=115):
        """
        "S": "Singapore citizens and permanent residents born before 1 January 2000.",
        Singapore citizens and permanent residents born on or after 1 January 1968 are issued NRIC numbers starting with their year of birth.
        For Singapore citizens and permanent residents born on or before 31 December 1967, the NRIC numbers commonly begin with 0 or 1.
        Non-native residents born before 1968 are assigned the heading numbers 2 or 3 upon attaining permanent residency or citizenship.
        """
        # Get birthday between 
        end_date = datetime(1999, 12, 31)
        current = datetime.today()
        difference = relativedelta(current, end_date)
        
        if not min_age:
            min_age = difference.years        
        elif min_age and (min_age < difference.years):
            print(f'Minimum age for S type NRIC is {difference.years}.')
            min_age = difference.years
        
        if min_age > max_age:
            return None

        birthday = self.fake.date_of_birth(tzinfo=None, minimum_age=min_age, maximum_age=max_age)

        nric = f'{datetime.strftime(birthday,"%y")}#####'
        nric = self.fake.numerify(nric)
        
        if birthday < datetime(1967, 12, 31).date():
            nric = self.fake.random_element(elements=('0', '1', '2', '3')) + nric[1:]
        checksum = self.checksum(nric)
        nric_full = 'S'+nric+checksum
        return {'birthday': datetime.strftime(birthday, '%Y-%m-%d'), "nric": nric_full}
        
    
    def sg_nric_type_t(self, min_age=None, max_age=None):
        """
        "T": "Singapore citizens and permanent residents born on or after 1 January 2000",
        """
        start_date = datetime(2000,1,1)
        current = datetime.today()
        difference = relativedelta(current, start_date)
        
        if not min_age or (min_age < 0):
            min_age = 0
        
        if not max_age:
            max_age = difference.years
        elif max_age and (max_age > difference.years):
            print(f'Maximum age for T type NRIC is {difference.years}.')
            max_age = difference.years

        if min_age > max_age:
            return None
        
        birthday = self.fake.date_of_birth(tzinfo=None, minimum_age=min_age, maximum_age=max_age)

        nric = f'{datetime.strftime(birthday,"%y")}#####'
        nric = self.fake.numerify(nric)        
        checksum = self.checksum(nric)
        nric_full = 'T'+nric+checksum
        return {'birthday': datetime.strftime(birthday, '%Y-%m-%d'), "nric": nric_full}

    
    def sg_nric_type_f(self, min_age=None, max_age=115):
        """
        "F": "Foreigners issued with long-term passes before 1 January 2000", 
        """
        # Get birthday between 
        end_date = datetime(1999, 12, 31)
        current = datetime.today()
        difference = relativedelta(current, end_date)
        
        if not min_age:
            min_age = difference.years        
        elif min_age and (min_age < difference.years):
            print(f'Minimum age for F type NRIC is {difference.years}.')
            min_age = difference.years

        if min_age > max_age:
            return None

        birthday = self.fake.date_of_birth(tzinfo=None, minimum_age=min_age, maximum_age=max_age)

        nric = self.fake.numerify('#######')
        checksum = self.checksum(nric)
        nric_full = 'F'+nric+checksum
        return {'birthday': datetime.strftime(birthday, '%Y-%m-%d'), "nric": nric_full}
        
    
    def sg_nric_type_g(self, min_age=None, max_age=None):
        """
        "G": "Foreigners issued with long-term passes from 1 January 2000 to 31 December 2021",
        """
        start_date = datetime(2000,1,1)
        end_date = datetime(2021,12,31)
        current = datetime.today()
        diff_max = relativedelta(current, start_date)
        diff_min = relativedelta(current, end_date)
        
        if not min_age or (min_age < 0):
            min_age = diff_min.years
        elif min_age and (min_age < diff_min.years):
            print(f'Minimum age for G type NRIC is {diff_min.years}.')
            min_age = diff_min.years

        if min_age > max_age:
            return None

        if not max_age:
            max_age = diff_max.years
        elif max_age and (max_age > diff_max.years):
            print(f'Maximum age for G type NRIC is {diff_max.years}.')
            max_age = diff_max.years

        birthday = self.fake.date_of_birth(tzinfo=None, minimum_age=min_age, maximum_age=max_age)

        nric = self.fake.numerify('#######')
        checksum = self.checksum(nric)
        nric_full = 'G'+nric+checksum
        return {'birthday': datetime.strftime(birthday, '%Y-%m-%d'), "nric": nric_full}

    
    def sg_nric_type_m(self, min_age=None, max_age=115):
        """
        "M": "Foreigners issued with long-term passes on or after 1 January 2022"
        """
        start_date = datetime(2022,1,1)
        current = datetime.today()
        difference = relativedelta(current, start_date)
        
        if not min_age or (min_age < 0):
            min_age = 0
        
        if not max_age:
            max_age = difference.years
        elif max_age and (max_age > difference.years):
            print(f'Maximum age for T type NRIC is {difference.years}.')
            max_age = difference.years

        if min_age > max_age:
            return None

        birthday = self.fake.date_of_birth(tzinfo=None, minimum_age=min_age, maximum_age=max_age)

        nric = f'#######'
        nric = self.fake.numerify(nric)        
        checksum = self.checksum(nric)
        nric_full = 'M'+nric+checksum
        return {'birthday': datetime.strftime(birthday, '%Y-%m-%d'), "nric": nric_full}

    
    def sg_nric(self, categories=['S', 'T', 'F', 'G', 'M'], count=5, min_age=0, max_age=115):
        """
        Return fake SG NRIC number.
        """
        result = []
        while len(result) < count:
            category = self.fake.random_element(elements=categories)
            method_name = f'sg_nric_type_{category.lower()}'
            # Make sure object has the function 
            if not hasattr(self, method_name):
                print(f'Invalid attribute name {method_name} for object {self}')
                continue

            method_to_call = getattr(self, method_name)
            # Make sure the attribute is callable
            if not callable(method_to_call):
                print(f'Invalid function {method_to_call} for object {self}')
                continue

            nric = method_to_call(min_age=min_age, max_age=max_age)
            if nric:
                result.append(nric)
        
        return result

### Test

In [98]:
fake = Faker()
fake.add_provider(SgNricProvider)
fake.sg_nric()

Maximum age for T type NRIC is 0.
Maximum age for T type NRIC is 21.
Maximum age for T type NRIC is 21.


[{'birthday': '2021-03-28', 'nric': 'M7577745J'},
 {'birthday': '2012-10-10', 'nric': 'T1264033A'},
 {'birthday': '2005-03-19', 'nric': 'T0571172Z'},
 {'birthday': '1939-11-25', 'nric': 'S2992344B'},
 {'birthday': '1953-03-04', 'nric': 'S1354649E'}]

## Community Provider - SG Address

Use data from kaggle https://www.kaggle.com/mylee2009/singapore-postal-code-mapper

In [22]:
random.sample(range(1,row_total), count)

[4346, 1378, 13460, 881, 14626, 21699, 21256, 20922, 15831, 22978]

In [40]:
from pathlib import Path
import csv
import random

filename = 'sg_zipcode_mapper.csv'
cur_path = Path.cwd().joinpath(filename)
cur_path.resolve()

count = 10
lines = []

with open(cur_path) as f:
    # Find total number of rows
    row_total = sum(1 for line in f)
    # Generate random line number
    chosen = sorted(random.sample(range(1,row_total), count))
    
    f.seek(0)
    header = f.readline().strip().split(',')
        
    for offset in chosen:
        f.seek(offset)
        f.readline()
        line = f.readline().strip()
        lines.append(line.split(','))

result = []
for line in lines:
    j = zip(header,line)
    obj = {k: v for k, v in j}
    print(obj)


{'postal': '648165', 'latitude': '1.354618893', 'longtitude': '103.7093813', 'blk_no': '10', 'road_name': 'BULIM AVENUE'}
{'postal': '629523', 'latitude': '1.310227558', 'longtitude': '103.6660628', 'blk_no': '10', 'road_name': 'GUL CRESCENT'}
{'postal': '638377', 'latitude': '1.33555153', 'longtitude': '103.6445896', 'blk_no': '10', 'road_name': 'TUAS WEST ROAD'}
{'postal': '823105', 'latitude': '1.397672035', 'longtitude': '103.9055988', 'blk_no': '105C', 'road_name': 'EDGEFIELD PLAINS'}
{'postal': '98144', 'latitude': '1.238961766', 'longtitude': '103.833709', 'blk_no': '106', 'road_name': 'COVE DRIVE'}
{'postal': '327953', 'latitude': '1.325827815', 'longtitude': '103.8642351', 'blk_no': '10A', 'road_name': "SAINT MICHAEL'S ROAD"}
{'postal': '487511', 'latitude': '1.337277524', 'longtitude': '103.9532552', 'blk_no': '11', 'road_name': 'SEA BREEZE GROVE'}
{'postal': '266661', 'latitude': '1.328234457', 'longtitude': '103.7995336', 'blk_no': '11', 'road_name': 'SECOND AVENUE'}
{'post

### Provider

In [1]:
from pathlib import Path
import csv
import random
from faker.providers import BaseProvider
from faker import Faker

class SgAddressProvider(BaseProvider):
    
    def __init__(self, generator):
        super().__init__(generator)
        self.data_file = 'sg_addresses.csv'
    
    
    def sg_address(self, count: int = 5):
        cur_path = Path.cwd().joinpath(self.data_file)
        cur_path.resolve()
        lines = []

        with open(cur_path) as f:
            # Find total number of rows
            row_total = sum(1 for line in f)
            # Generate random line number
            chosen = sorted(random.sample(range(1,row_total), count))

            f.seek(0)
            header = f.readline().strip().split(',')

            for offset in chosen:
                f.seek(offset)
                f.readline()
                line = f.readline().strip()
                lines.append(line.split(','))

        result = []
        for line in lines:
            obj = {k: v for k, v in zip(header,line)}
            result.append(obj)

        return result

### Test

In [3]:
fake = Faker()
fake.add_provider(SgAddressProvider)

fake.sg_address(5)

[{'postal': '787916',
  'latitude': '1.397338704',
  'longtitude': '103.8204766',
  'blk_no': '10',
  'road_name': 'SPRINGLEAF VIEW'},
 {'postal': '821107',
  'latitude': '1.397198794',
  'longtitude': '103.9068258',
  'blk_no': '107A',
  'road_name': 'EDGEFIELD PLAINS'},
 {'postal': '548042',
  'latitude': '1.360564746',
  'longtitude': '103.8825666',
  'blk_no': '10F',
  'road_name': 'KOVAN ROAD'},
 {'postal': '339155',
  'latitude': '1.315343054',
  'longtitude': '103.8683213',
  'blk_no': '11',
  'road_name': 'KALLANG PLACE'},
 {'postal': '467109',
  'latitude': '1.319583452',
  'longtitude': '103.9406904',
  'blk_no': '110',
  'road_name': 'SENNETT AVENUE'}]

## Community Provider - SG Train Stations

## Community Provider - SG Phone