# Create a Fake Dataset

This notebook creates a fake 10,000-row dataset with deliberately inconsistent formatting and randomly missing values, for use in examples showing how to clean and standardize data.

## Requirements

* Python >= 3.5.1
* pandas >= 0.17.1
* numpy >= 1.10.4
* faker >= 0.5.3 (installed with `pip install fake-factory`)

## The Code

In [3]:
# Import the libraries we'll need
from faker import Factory    # pip install fake-factory
import pandas as pd
import numpy as np
import random

In [4]:
# Create and initialize a faker generator.
# `fake` is the single shared generator object: the helper functions in this
# notebook call its methods (fake.first_name(), fake.city(), fake.date(), ...)
# to produce each fake value.
# NOTE(review): no seed is set here, so each run presumably yields different
# data -- confirm whether reproducible output is wanted.
fake = Factory.create()

In [5]:
# Build a person's name with inconsistent leading whitespace
def random_name():
    """Return a "First Last" name, sometimes prefixed with a single space.

    A 1-or-2 coin flip picks the variant: 1 gives the plain name, 2 gives the
    same shape with one leading space, so the generated column contains
    whitespace that needs trimming.
    """
    leading = "" if random.randint(1, 2) == 1 else " "
    return leading + fake.first_name() + " " + fake.last_name()

In [6]:
# Build a date value in one of three inconsistent formats
def random_date():
    """Return a date in one of three shapes, chosen uniformly at random.

    1 -> whatever ``fake.date()`` returns
    2 -> whatever ``fake.date_time()`` returns
    3 -> a "month/day/year" string assembled from separate faker calls

    The return type is therefore not uniform across calls; the mix is what
    makes the resulting column need standardizing.
    """
    style = random.randint(1, 3)
    if style == 1:
        return fake.date()
    if style == 2:
        return fake.date_time()
    # Fetch the parts in month, day, year order before formatting.
    month = fake.month()
    day = fake.day_of_month()
    year = fake.year()
    return "{}/{}/{}".format(month, day, year)

In [7]:
# Assemble one record of fake data
def create_random_row():
    """Return one fake record as a 15-item list.

    The values come in three groups, in this order: seven personal fields
    (name, street address, city, state, postcode, phone, free email), seven
    work fields (company, street address, city, state, postcode, phone,
    company email), and finally the randomly formatted date.
    """
    personal = [
        random_name(),
        fake.street_address(),
        fake.city(),
        fake.state(),
        fake.postcode(),
        fake.phone_number(),
        fake.free_email(),
    ]
    work = [
        fake.company(),
        fake.street_address(),
        fake.city(),
        fake.state(),
        fake.postcode(),
        fake.phone_number(),
        fake.company_email(),
    ]
    return personal + work + [random_date()]

In [8]:
# Column names for the dataframe, in the same order as the values that
# create_random_row() returns
cols = (
    # personal details
    ['name', 'address', 'city', 'state', 'zip', 'phone', 'email']
    # employer details
    + ['work', 'work address', 'work city', 'work state',
       'work zipcode', 'work phone', 'work email']
    # account metadata
    + ['account created on']
)

In [9]:
# Create an empty dataframe from the column list:
# it has the named columns from `cols` and no rows yet.
df = pd.DataFrame(columns=cols)

In [10]:
# Generate the fake rows, then build the dataframe in a single step.
# Assigning df.loc[i] = row one row at a time makes pandas grow the frame on
# every insert, which is slow for 10k rows; collecting the rows in a plain
# list and constructing the frame once avoids that repeated work.
# FYI: generating the fake values themselves can still take a little bit.
NUM_ROWS = 10000    # 10k rows is good
rows = [create_random_row() for _ in range(NUM_ROWS)]
df = pd.DataFrame(rows, columns=cols)
df.head()

Unnamed: 0,name,address,city,state,zip,phone,email,work,work address,work city,work state,work zipcode,work phone,work email,account created on
0,Deonte Stark,278 Mueller Plains,North Euna,Alabama,03404-4384,(180)940-9676x4495,shanna73@hotmail.com,Hahn-Mayer,8177 Weber Throughway Apt. 341,Jaronton,Maine,51589-1424,01240240340,heller.kirstin@glover.com,2001-09-06 06:15:24
1,Faustino Boyer,70244 Skiles Falls Suite 030,North Altohaven,California,01522-1310,(308)699-6239x81011,ferrell81@gmail.com,Buckridge Inc,236 Kessler Center,New Gavynshire,Missouri,52234-6972,(486)896-6855x446,esta.dicki@bechtelar.com,1983-04-25
2,Eddy Bogisich,33431 Dollie Squares Apt. 654,Port Campbell,New York,37768-2026,01142290278,mirna.dickens@yahoo.com,"Green, Bradtke and Fritsch",042 Gottlieb Mountain Apt. 404,Rodriguezton,New York,72843,410.238.3556,wwalter@carter.com,2013-03-15
3,Mervyn Kreiger,376 Dorinda Stream,Shaniquafort,South Carolina,39347-4438,869-985-6299,emmerich.griselda@hotmail.com,Witting PLC,521 Kemmer Manors,Nerytown,Kentucky,68774,(212)169-8190,greyson39@purdy.com,1971-04-27 14:05:06
4,Katlyn Doyle,4650 Beer Crossing Suite 848,North Dellside,Kentucky,25794,1-184-809-6600,junious.oreilly@gmail.com,"Gaylord, Parisian and Walter",586 Frank Lights,Lake Cynthiafurt,Texas,78014-9162,(929)923-2385x053,sankunding@predovic.org,1999-03-21 14:43:15


In [11]:
# Set some random number of values (0 to 1746 draws) to NaN.
# Positions are drawn with randrange so they always fall inside the frame:
# randint's upper bound is inclusive, so randint(0, 10000) could pick row
# 10000, one past the last row of a 10,000-row frame. Cells are written with
# .iat (pure integer position) in place of the deprecated .ix indexer.
# The same cell may be drawn more than once, so the number of NaNs written
# can be lower than the number of draws.
n_rows, n_cols = df.shape
n_blanks = random.randint(0, 1746) if df.size else 0    # nothing to blank in an empty frame
for _ in range(n_blanks):
    row = random.randrange(n_rows)
    column = random.randrange(n_cols)
    df.iat[row, column] = np.nan

In [12]:
# Write the finished frame to a csv file for use in the examples;
# index=False keeps the row index out of the file.
output_path = 'dss_dirty_data_example.csv'
df.to_csv(output_path, index=False)