# How To Make a Fake Data Set in Python and Pandas

## 3. Generate data with Faker

In [1]:
from faker import Faker
# Italian-locale generator; every name() call produces a fresh random name.
fake = Faker(locale='it_IT')

# Emit five sample names, one per line.
print('\n'.join(fake.name() for _ in range(5)))

Vincenza Grisoni
Sig.ra Barbara Filippelli
Ninetta Trupiano
Ruggero Parri
Marina Bottaro-Fattori


In [2]:
from faker import Faker
# A single generator backed by three locales; the ten names printed below
# come out as a mix of Italian, English and Japanese ones.
fake = Faker(locale=['it_IT', 'en_US', 'ja_JP'])

print('\n'.join(fake.name() for _ in range(10)))

Matthew Greene
Santino Sinisi
Arnaldo Necci
Barbara Rose
加藤 知実
山本 修平
Ms. Maria Taylor DVM
Christopher Taylor
Beverly Martinez
佐藤 春香


### 3.1. Locales

In [3]:
from faker import Faker

# One dedicated generator per locale.
en_us_faker, it_it_fake = Faker('en_US'), Faker('it_IT')

# Show one sample city from each generator, labelled with its country.
print(
    f'{en_us_faker.city()}, USA',
    f'{it_it_fake.city()}, Italy',
    sep='\n',
)

North James, USA
Settimo Sylvia terme, Italy


### 3.2. Providers

In [4]:
from faker.providers import DynamicProvider

# Custom provider: once added to a generator it exposes a `skills()` method
# that returns one of the listed elements.
skill_provider = DynamicProvider(
    provider_name="skills",
    elements=[
        "Python",
        "Pandas",
        "Linux",
        "SQL",
        "Data Mining",
    ],
)

# Attach the custom provider to an en_US generator.
fake = Faker(locale='en_US')
fake.add_provider(skill_provider)

# Trailing expression, so the notebook displays the sampled skill.
fake.skills()

'SQL'

## 4. Create DataFrame with Fake Data

In [5]:
import csv
import pandas as pd
from faker import Faker
import datetime
import random
from faker.providers import DynamicProvider

# Pool of values served by the custom `skills()` provider method.
skill_provider = DynamicProvider(
    provider_name="skills",
    elements=[
        "Python",
        "Pandas",
        "Linux",
        "SQL",
        "Data Mining",
    ],
)


def fake_data_generation(records, *, seed=None):
    """Build a list of fake employee records.

    Args:
        records: Number of employee dicts to generate.
        seed: Optional seed for reproducible output. When given, Faker's
            shared random generator (``Faker.seed``) and the ``random``
            module are both seeded, so the same seed yields the same rows.
            Note that both are process-wide generators. With the default
            ``None`` nothing is seeded and every call produces new data.

    Returns:
        A list of ``records`` dicts, each with the keys "First Name",
        "Last Name", "Birth Date", "Email", "Hobby", "Experience",
        "Start Year", "Salary", "City", "Nationality" and "Skill".
    """
    fake = Faker('en_US')
    # Registers the custom `skills()` method used for the "Skill" field.
    fake.add_provider(skill_provider)

    if seed is not None:
        Faker.seed(seed)
        random.seed(seed)

    employee = []
    for _ in range(records):
        # Drawn once so the e-mail address matches the name columns.
        first_name = fake.first_name()
        last_name = fake.last_name()

        employee.append({
            "First Name": first_name,
            "Last Name": last_name,
            # Birth dates are capped at 1995-01-01.
            "Birth Date": fake.date(
                pattern="%Y-%m-%d",
                end_datetime=datetime.date(1995, 1, 1),
            ),
            "Email": f"{first_name}.{last_name}@fake_domain-2.com".lower(),
            "Hobby": fake.word(),
            # Integer in 0..15, both ends included.
            "Experience": random.randint(0, 15),
            # Drawn independently of "Birth Date", so it is not guaranteed
            # to fall after it.
            "Start Year": fake.year(),
            # Multiple of 5000 from 75000 up to 145000.
            "Salary": random.randrange(75000, 150000, 5000),
            "City": fake.city(),
            "Nationality": fake.country(),
            "Skill": fake.skills(),
        })

    return employee


# Turn 50 generated employee dicts into a DataFrame (one row per dict).
df = pd.DataFrame(data=fake_data_generation(records=50))

# Trailing expression: the notebook renders the first rows of the frame.
df.head()

Unnamed: 0,First Name,Last Name,Birth Date,Email,Hobby,Experience,Start Year,Salary,City,Nationality,Skill
0,Austin,Ibarra,1976-09-26,austin.ibarra@fake_domain-2.com,challenge,13,1978,140000,Kerrfurt,Iceland,Data Mining
1,Gerald,Gilmore,1971-01-18,gerald.gilmore@fake_domain-2.com,difficult,4,2007,115000,Douglasstad,Japan,SQL
2,Aaron,Pineda,1980-03-12,aaron.pineda@fake_domain-2.com,ability,14,2001,85000,Lukeberg,Christmas Island,Pandas
3,Anthony,Ross,1974-05-05,anthony.ross@fake_domain-2.com,nation,8,1976,105000,Bakerview,Thailand,SQL
4,Mark,Hanson,1975-04-03,mark.hanson@fake_domain-2.com,war,1,2015,95000,Calebview,Pakistan,SQL


## 5. Create big CSV file with Fake Data

In [6]:
import csv
from faker import Faker
import datetime
import random
from faker.providers import DynamicProvider

# Values the custom `skills()` provider method chooses from.
skill_provider = DynamicProvider(
    provider_name="skills",
    elements=[
        "Python",
        "Pandas",
        "Linux",
        "SQL",
        "Data Mining",
    ],
)


def fake_data_generation(records, headers, *, filename="employee.csv", seed=None):
    """Write fake employee rows to a CSV file.

    Args:
        records: Number of data rows to write (the header row is extra).
        headers: Column names, in output order. Every row written below has
            the keys "First Name", "Last Name", "Birth Date", "Email",
            "Hobby", "Experience", "Start Year", "Salary", "City",
            "Nationality" and "Skill", so ``headers`` must contain all of
            them.
        filename: Path of the CSV file to create or overwrite. Defaults to
            "employee.csv" in the current working directory.
        seed: Optional seed for reproducible output. When given, Faker's
            shared random generator (``Faker.seed``) and the ``random``
            module are both seeded; both are process-wide generators. With
            the default ``None`` nothing is seeded.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a row contains a key
            that is missing from ``headers``.
    """
    fake = Faker('en_US')
    # Registers the custom `skills()` method used for the "Skill" column.
    fake.add_provider(skill_provider)

    if seed is not None:
        Faker.seed(seed)
        random.seed(seed)

    # newline='' is required by the csv module: without it the writer's own
    # "\r\n" line endings are translated again on Windows, which shows up as
    # blank rows. The explicit encoding keeps the file independent of the
    # platform's default encoding.
    with open(filename, 'wt', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()

        for _ in range(records):
            # Drawn once so the e-mail address matches the name columns.
            first_name = fake.first_name()
            last_name = fake.last_name()

            writer.writerow({
                "First Name": first_name,
                "Last Name": last_name,
                # Birth dates are capped at 1995-01-01.
                "Birth Date": fake.date(
                    pattern="%Y-%m-%d",
                    end_datetime=datetime.date(1995, 1, 1),
                ),
                "Email": f"{first_name}.{last_name}@fake_domain-2.com".lower(),
                "Hobby": fake.word(),
                # Integer in 0..15, both ends included.
                "Experience": random.randint(0, 15),
                "Start Year": fake.year(),
                # Multiple of 5000 from 75000 up to 145000.
                "Salary": random.randrange(75000, 150000, 5000),
                "City": fake.city(),
                "Nationality": fake.country(),
                "Skill": fake.skills(),
            })
    

# Number of data rows to write.
number_records = 100

# CSV columns, in the order they appear in the file.
fields = [
    "First Name",
    "Last Name",
    "Birth Date",
    "Email",
    "Hobby",
    "Experience",
    "Start Year",
    "Salary",
    "City",
    "Nationality",
    "Skill",
]

fake_data_generation(records=number_records, headers=fields)
