# Create a Fake Dataset

This notebook creates a fake 10,000-row dataset for use in examples that show how to clean and standardize data.

## Requirements

* Python >= 3.5.1
* pandas >= 0.17.1
* numpy >= 1.10.4
* faker >= 0.5.3

## The Code

In [None]:
# Import the libraries we'll need
from faker import Factory    # pip install fake-factory
import pandas as pd
import numpy as np
import random

In [None]:
# Create the faker generator; `fake` is the object that every
# fake value in the cells below is drawn from
fake = Factory.create()

In [None]:
# Create a randomly formatted name
def random_name():
    """Return a fake "first last" name, randomly prefixed with a space.

    One of the two equally likely outcomes of the random draw yields the
    plain name; the other yields the same format with a single leading
    space, giving the dataset some untidy values.
    """
    # Draw the formatting choice before generating the name parts
    add_leading_space = random.randint(1, 2) != 1
    first, last = fake.first_name(), fake.last_name()
    prefix = " " if add_leading_space else ""
    return prefix + first + " " + last

In [None]:
# Create a randomly formatted date
def random_date():
    """Return a fake date in one of three randomly chosen representations.

    Depending on a random draw from 1 to 3, the result is whatever
    ``fake.date()`` returns, whatever ``fake.date_time()`` returns, or a
    "month/day/year" string assembled from ``fake.month()``,
    ``fake.day_of_month()`` and ``fake.year()``.
    """
    style = random.randint(1, 3)
    if style == 1:
        return fake.date()
    if style == 2:
        return fake.date_time()
    # Remaining case: build a slash-separated date from its parts
    month, day, year = fake.month(), fake.day_of_month(), fake.year()
    return "{}/{}/{}".format(month, day, year)

In [None]:
# Create a random row of data
def create_random_row():
    """Return one row of fake data as a list of 15 values.

    The values come in three groups: seven personal fields, seven work
    fields, and finally the randomly formatted date.
    """
    # Personal details: name, address, city, state, postcode, phone, email
    personal = [
        random_name(),
        fake.street_address(),
        fake.city(),
        fake.state(),
        fake.postcode(),
        fake.phone_number(),
        fake.free_email(),
    ]
    # Work details: company, address, city, state, postcode, phone, email
    work = [
        fake.company(),
        fake.street_address(),
        fake.city(),
        fake.state(),
        fake.postcode(),
        fake.phone_number(),
        fake.company_email(),
    ]
    return personal + work + [random_date()]

In [None]:
# Column names for our dataframe, grouped as
# personal fields, work fields, then the account date
cols = (
    ['name', 'address', 'city', 'state', 'zip', 'phone', 'email']
    + ['work', 'work address', 'work city', 'work state',
       'work zipcode', 'work phone', 'work email']
    + ['account created on']
)

In [None]:
# Create a dataframe that has the columns listed in `cols` but no rows yet
df = pd.DataFrame(columns=cols)

In [None]:
# Build the dataframe of fake data; 10k rows is good
# All rows are generated first and handed to pandas in a single call:
# growing a frame one row at a time with df.loc[i] = ... makes pandas
# re-grow the frame on every insert and gets slower as the frame fills up.
# FYI: generating the fake values themselves can still take a little bit
rows = [create_random_row() for _ in range(10000)]
df = pd.DataFrame(rows, columns=cols)
df.head()

In [None]:
# Set a random number of cells (anywhere from 0 to 1746) to NaN
# Positions are drawn with randrange, whose upper bound is exclusive, so
# they always fall inside the frame; an out-of-range row label would make
# pandas append a brand-new, all-NaN row instead of blanking one cell.
n_rows, n_cols = df.shape
for _ in range(random.randint(0, 1746)):
    row = random.randrange(n_rows)
    column = random.randrange(n_cols)
    # .iloc is purely positional, so row/column are integer offsets
    df.iloc[row, column] = np.nan

In [None]:
# Save the data to 'dss_dirty_data_example.csv' (a relative path) for use
# in the examples; index=False leaves the row index out of the file
df.to_csv('dss_dirty_data_example.csv', index=False)