# How To Make a Fake Data Set in Python and Pandas

## 3. Generate data with Faker

In [1]:
from faker import Faker
# Generator bound to the Italian locale; print five sample names, one per line.
fake = Faker("it_IT")
print(*(fake.name() for _ in range(5)), sep="\n")

Lina Ossola-Acerbi
Leopoldo Lanfranchi
Gianpaolo Versace
Sandro Galuppi
Augusto Mercati-Barcella


In [2]:
from faker import Faker
# Passing a list of locales yields a single generator that mixes them
# (the output below contains Italian, US and Japanese names).
fake = Faker(["it_IT", "en_US", "ja_JP"])
print(*(fake.name() for _ in range(10)), sep="\n")

John Harris
Isa Benigni-Dellucci
James Nguyen
Marcello Bonanno
Timothy Palmer
Alfredo Battelli
石川 里佳
Gelsomina Sbarbaro
Sig.ra Sophia Camuccini
Dino Schiaparelli-Morosini


### 3.1. Locales

In [3]:
from faker import Faker

# One generator per locale so that each city name matches its country.
en_us_faker = Faker("en_US")
it_it_fake = Faker("it_IT")

print(en_us_faker.city() + ", USA")
print(it_it_fake.city() + ", Italy")

Ryanport, USA
Sesto Augusto, Italy


### 3.2. Providers

In [4]:
from faker.providers import DynamicProvider

# Custom provider: once registered on a Faker instance it is reachable as
# `fake.skills()` and returns one of the listed elements.
skill_provider = DynamicProvider(
    elements=["Python", "Pandas", "Linux", "SQL", "Data Mining"],
    provider_name="skills",
)

fake = Faker("en_US")
fake.add_provider(skill_provider)

# Last expression of the cell, so the notebook displays the sampled skill.
fake.skills()

'Data Mining'

## 4. Create DataFrame with Fake Data

In [5]:
import csv
import pandas as pd
from faker import Faker
import datetime
import random
from faker.providers import DynamicProvider

# Provider exposing a `skills()` method that picks one of these elements;
# fake_data_generation registers it on its Faker instance.
skill_provider = DynamicProvider(
    elements=[
        "Python",
        "Pandas",
        "Linux",
        "SQL",
        "Data Mining",
    ],
    provider_name="skills",
)


def fake_data_generation(records):
    """Return ``records`` fake employee rows as a list of dictionaries.

    Each row carries the keys "First Name", "Last Name", "Birth Date",
    "Email", "Hobby", "Experience", "Start Year", "Sallary", "City",
    "Nationality" and "Skill". All fields are drawn independently of each
    other, except the e-mail address, which is derived from the generated
    first and last name.

    Args:
        records: Number of rows to produce.

    Returns:
        A list of dicts, one per employee, suitable for ``pd.DataFrame``.
    """
    generator = Faker('en_US')
    # The module-level provider adds the `skills()` method used below.
    generator.add_provider(skill_provider)

    def build_row():
        # The name parts are generated first because the e-mail reuses them.
        given = generator.first_name()
        family = generator.last_name()
        return {
            "First Name": given,
            "Last Name": family,
            # end_datetime caps the generated date at 1995-01-01.
            "Birth Date": generator.date(
                pattern="%Y-%m-%d",
                end_datetime=datetime.date(1995, 1, 1),
            ),
            "Email": f"{given}.{family}@fake_domain-2.com".lower(),
            "Hobby": generator.word(),
            # randint includes both endpoints: 0..15.
            "Experience": random.randint(0, 15),
            "Start Year": generator.year(),
            # Multiples of 5000 from 75000 up to 145000 (stop is exclusive).
            "Sallary": random.randrange(75000, 150000, 5000),
            "City": generator.city(),
            "Nationality": generator.country(),
            "Skill": generator.skills(),
        }

    return [build_row() for _ in range(records)]


# Generate 50 fake employees, load them into a DataFrame and preview it.
employee_records = fake_data_generation(50)
df = pd.DataFrame(employee_records)
df.head()

Unnamed: 0,First Name,Last Name,Birth Date,Email,Hobby,Experience,Start Year,Sallary,City,Nationality,Skill
0,Craig,Hale,1988-01-09,craig.hale@fake_domain-2.com,fact,4,1977,100000,Rebeccastad,Gambia,Python
1,Frank,Davis,1976-09-18,frank.davis@fake_domain-2.com,event,13,2007,100000,Port Emilyborough,Serbia,Linux
2,Elijah,Murphy,1973-05-28,elijah.murphy@fake_domain-2.com,as,5,1979,90000,North Brandiberg,Kazakhstan,Pandas
3,Jesse,Fox,1984-10-14,jesse.fox@fake_domain-2.com,upon,5,1987,130000,Amyshire,Botswana,Pandas
4,Yesenia,Bradford,1985-02-06,yesenia.bradford@fake_domain-2.com,would,9,2003,110000,East Brett,Pitcairn Islands,SQL


## 5. Create a big CSV file with Fake Data

In [6]:
import csv
from faker import Faker
import datetime
import random
from faker.providers import DynamicProvider

# Provider exposing a `skills()` method that picks one of these elements;
# the CSV writer below registers it on its Faker instance.
skill_provider = DynamicProvider(
    elements=[
        "Python",
        "Pandas",
        "Linux",
        "SQL",
        "Data Mining",
    ],
    provider_name="skills",
)


def fake_data_generation(records, headers, path="employee.csv"):
    """Write ``records`` rows of fake employee data to a CSV file.

    Rows are written one at a time, so no list of records is built up in
    memory regardless of how many are requested.

    Args:
        records: Number of data rows to write (a header row is written
            first in addition to these).
        headers: Column names handed to ``csv.DictWriter`` as ``fieldnames``;
            they define the column order. ``DictWriter`` raises
            ``ValueError`` if a generated key is not listed here, so the
            list must contain every key produced below.
        path: Destination file. Defaults to ``"employee.csv"`` in the
            current working directory; an existing file is overwritten.

    Returns:
        None. The result is the file written at ``path``.
    """
    fake = Faker('en_US')
    # The module-level provider adds the `skills()` method used below.
    fake.add_provider(skill_provider)

    # newline='' is required by the csv module: without it the writer's own
    # line terminator is translated again and rows end up separated by blank
    # lines on Windows. An explicit encoding keeps the output identical
    # across platforms instead of depending on the locale default.
    with open(path, 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=headers)
        writer.writeheader()
        for _ in range(records):
            # The name parts are generated first because the e-mail reuses them.
            first_name = fake.first_name()
            last_name = fake.last_name()

            writer.writerow({
                "First Name": first_name,
                "Last Name": last_name,
                # end_datetime caps the generated date at 1995-01-01.
                "Birth Date": fake.date(pattern="%Y-%m-%d", end_datetime=datetime.date(1995, 1, 1)),
                "Email": str.lower(f"{first_name}.{last_name}@fake_domain-2.com"),
                "Hobby": fake.word(),
                # randint includes both endpoints: 0..15.
                "Experience": random.randint(0, 15),
                # NOTE: drawn independently of "Birth Date" and "Experience",
                # so the three values are not guaranteed to be consistent.
                "Start Year": fake.year(),
                # Multiples of 5000 from 75000 up to 145000 (stop is exclusive).
                "Sallary": random.randrange(75000, 150000, 5000),
                "City": fake.city(),
                "Nationality": fake.country(),
                "Skill": fake.skills(),
            })
    

# Number of data rows to write (the header row is extra).
number_records = 100

# Column order of the CSV; must list every key that the generator emits.
fields = [
    "First Name",
    "Last Name",
    "Birth Date",
    "Email",
    "Hobby",
    "Experience",
    "Start Year",
    "Sallary",
    "City",
    "Nationality",
    "Skill",
]

fake_data_generation(number_records, fields)
