# A notebook to generate fake input data for use with DESaster

Likely you need to install the Python library `Faker`

In [1]:
from faker import Factory
from faker import Faker
import pandas as pd
import names
import random
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform, norm, beta, weibull_min, rv_discrete
from scipy.spatial.distance import cdist, euclidean

%matplotlib inline

A function of functions for generating the data. *Note that this is meant to generate useable fake data, but not necessarily realistic. Various formulas (e.g., to estimate house value based on income) are the result of googling--nothing should be considered fully justified.* *Also note: The required data of DESaster may not exactly match what is produced by this function; modifications may be required.*

In [2]:
def fake_inputs(tenure, income_values, income_counts, lat0, long0, radius):
    
    num_entities = len(income_values)
    fake = Faker()
    
    def rand_lat_long(lat0, long0, radius, size):                        #Choose your own radius
        radiusInDegrees=radius/111300            
        r = radiusInDegrees
        lat = []
        long = []

        for i in range(1,num_entities):                 #Choose number of Lat Long to be generated

            u = float(random.uniform(0.0,1.0))
            v = float(random.uniform(0.0,1.0))

            w = r * math.sqrt(u)
            t = 2 * math.pi * v
            x = w * math.cos(t) 
            y = w * math.sin(t)

            lat.append(x + lat0)
            long.append(y + long0)

        return lat, long

    def calc_income(income_series, count_series):
        x = income_series
        p_x = count_series / count_series.sum()
        income_dist = rv_discrete(name='Income', values=(x, p_x))
        return income_dist.rvs()

    def calc_savings(income):
        savings_dist = beta(a=2, b=1, loc=5000, scale = 200000)
        max_pdf = savings_dist.pdf(205000)
        inc_pdf = savings_dist.pdf(income)
        savings_rate = 0.25*(inc_pdf / max_pdf)
        return int(savings_rate * income)

    def calc_house_value(income):
        min_house_price_multiplier = 2
        return int(income*uniform.rvs(loc = min_house_price_multiplier, scale = 1))

    def calc_house_area(value):
        dollar_per_sf = 150
        if int(value / dollar_per_sf) < 500:
            return 500
        else:
            return int(value / dollar_per_sf)

    def calc_mortgage_payment(value):
        monthly_rate = 0.05/12
        num_payments = 30*12
        down_payment = 0.1
        loan_value = value - value * down_payment
        return int(-np.pmt(monthly_rate,num_payments, loan_value))

    def set_occupancy(income):
        if income >= 50000:
            return 'Single Family Dwelling'
        elif uniform.rvs(0,1) >= 0.2:
            return 'Single Family Dwelling'
        else:
            return 'Mobile Home'

    def set_listing():
        if uniform.rvs(0,1) >= 0.3333:
            return False
        else:
            return True

    def calc_credit():
        return int(uniform.rvs(550, 300))

    def calc_insurance(income):
        insurance_dist = beta(a=2, b=1, loc=5000, scale = 200000)
        max_pdf = insurance_dist.pdf(205000)
        inc_pdf = insurance_dist.pdf(income)
        if inc_pdf / max_pdf > 0.5:
            return 0.8
        else:
            return 0.0

    def calc_bedrooms(area):
        if area <= 500:
            return 0
        else:
            bedrooms_pct = 0.3
            avg_sf = 200
            return int((bedrooms_pct * area) / avg_sf ) 

    def calc_bathrooms(area):
        if area <= 500:
            return 1
        else:
            bathrooms_pct = 0.1
            avg_sf = 100
            return max(int((bathrooms_pct * area) / avg_sf ), 1)
    
    def calc_damage(origin, target):
        distances = cdist(origin, target)[0]
        bins = list(np.linspace(0,max(distances),6))
        labels = ['Complete', 'Extensive', 'Moderate', 'Slight','None']
        damage = pd.cut(distances, bins=bins, labels=labels).get_values()
        return damage
    
    index = np.linspace(0,num_entities-1,10, dtype=int)
    
    lat, long = rand_lat_long(lat0, long0, radius, num_entities)

    names = []
    addresses = []
    incomes = []
    savings = []
    house_values = []
    house_areas = []
    house_ages = []
    mortgages = []
    occupancies = []
    listings = []
    credit_scores = []
    insurance = []
    bedrooms = []
    bathrooms = []
    landlords = []
    tenant_incomes = []
    tenant_insurance = []
    tenant_savings = []
    tenant_credit = []
    damages = []

    for i in range(1,num_entities):
        names.append(fake.name())
        addresses.append(fake.street_address())
        incomes.append(calc_income(income_values, income_counts))
        savings.append(int(calc_savings(incomes[i-1])))
        house_values.append(calc_house_value(incomes[i-1]))
        house_areas.append(calc_house_area(house_values[i-1]))
        house_ages.append(int(fake.year()))
        mortgages.append(calc_mortgage_payment(house_values[i-1]))
        occupancies.append(set_occupancy(incomes[i-1]))
        listings.append(set_listing())
        credit_scores.append(calc_credit())
        insurance.append(calc_insurance(incomes[i-1]))
        bedrooms.append(calc_bedrooms(house_areas[i-1]))
        bathrooms.append(calc_bathrooms(house_areas[i-1]))
        damages.append(calc_damage([(lat0, long0)], [(lat[i-1], long[i-1])]))
       
        if tenure.lower() == 'rent' or tenure.lower() == 'renter':
            landlords.append(fake.name())
            tenant_incomes.append(calc_income(income_values, income_counts))
            tenant_insurance.append(calc_insurance(tenant_incomes[i-1]))
            tenant_savings.append(int(calc_savings(tenant_incomes[i-1])))
            tenant_credit.append(calc_credit())

    if tenure.lower() == 'own' or tenure.lower() == 'owner':
        df = pd.DataFrame({'Name': names, 'Address': addresses, 'Income': incomes, 'Owner Savings': savings, 
                           'Owner Insurance': insurance, 'Owner Credit': credit_scores,
                           'Occupancy': occupancies, 'Value': house_values, 'Monthly Cost': mortgages, 'Area': house_areas, 
                           'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Year Built': house_ages, 'Listing': listings, 
                           'Damage State': damages, 'Latitude': lat, 'Longitude': long},
                          columns=['Name', 'Income', 'Owner Savings', 'Owner Credit', 'Owner Insurance', 
                                   'Address', 'Occupancy', 'Area', 'Bedrooms', 'Bathrooms', 'Value', 'Monthly Cost', 
                                   'Year Built', 'Listing', 'Damage State', 'Latitude', 'Longitude'])
    
    elif tenure.lower() == 'rent' or tenure.lower() == 'renter':
         df = pd.DataFrame({'Name': names, 'Tenant Income': tenant_incomes, 'Tenant Insurance': tenant_insurance,
                            'Tenant Savings': tenant_savings, 'Tenant Credit': tenant_credit,
                            'Address': addresses, 'Landlord': landlords, 'Income': incomes, 'Owner Savings': savings, 
                           'Owner Insurance': insurance, 'Owner Credit': credit_scores,
                           'Occupancy': occupancies, 'Value': house_values, 'Monthly Cost': mortgages, 'Area': house_areas, 
                           'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Year Built': house_ages, 'Listing': listings,
                            'Damage State': damages, 'Latitude': lat, 'Longitude': long},
                          columns=['Name', 'Tenant Income', 'Tenant Insurance', 'Tenant Savings', 'Tenant Credit',
                                   'Landlord', 'Income', 'Owner Savings', 'Owner Credit', 'Owner Insurance', 
                                   'Address', 'Occupancy', 'Area', 'Bedrooms', 'Bathrooms', 'Value', 'Monthly Cost', 
                                   'Year Built', 'Listing', 'Damage State', 'Latitude', 'Longitude'])
    
    else:
        print('Specified entity class not supported.')
        return
    
    return df

The function requires inputting an excel file of income data. The data structure is two columns labeled `Income` and `Count`, with the first being a list of income bins and the second being the number of households with each income bin.

In [4]:
income_df = pd.read_excel('income.xlsx') # Excel file with income bins and household counts
income_values = income_df['Income']
income_counts = income_df['Count']
lat0 = 40.84 # Latitude value where the circle for locating households will be centered.
long0 = -73.87 # Longitude value where the circle for locating households will be centered.
radius = 1000 # Radius value for the circle for locating households. ...can't remember the units. Meters?
fake_df = fake_inputs('own', income_values, income_counts, lat0, long0, radius)

fake_df.head(10)

Unnamed: 0,Name,Income,Owner Savings,Owner Credit,Owner Insurance,Address,Occupancy,Area,Bedrooms,Bathrooms,Value,Monthly Cost,Year Built,Listing,Damage State,Latitude,Longitude
0,Kimberly Stewart,40000,1749,657,0.0,416 Robert Ramp Suite 753,Mobile Home,669,1,1,100397,485,1992,True,[None],40.839719,-73.863039
1,Jeremy Foley,120000,17249,672,0.8,2893 Banks Roads,Single Family Dwelling,2368,3,2,355336,1716,1998,False,[None],40.83591,-73.876892
2,Kimberly Scott,35000,1312,637,0.0,6847 James Cliffs,Single Family Dwelling,587,0,1,88080,425,2013,False,[None],40.846045,-73.869348
3,Jessica Ferguson,30000,937,847,0.0,72902 Tammy Row,Single Family Dwelling,511,0,1,76724,370,1972,False,[None],40.836827,-73.865343
4,David Chandler,205000,51250,757,0.8,43191 Christopher Burg Suite 860,Single Family Dwelling,3524,5,3,528721,2554,1998,True,[None],40.837222,-73.867616
5,Christopher Ellis,50000,2812,702,0.0,99872 Williams Stravenue Suite 024,Single Family Dwelling,872,1,1,130911,632,1973,True,[None],40.84055,-73.865947
6,Willie Taylor,35000,1312,560,0.0,7450 Blackburn Squares Suite 810,Single Family Dwelling,634,0,1,95166,459,2004,True,[None],40.84178,-73.871845
7,Nicole Bradley,95000,10687,597,0.0,643 Hunter Village,Single Family Dwelling,1571,2,1,235794,1139,1970,False,[None],40.839396,-73.871227
8,Jared Mendez,75000,6562,644,0.0,67759 Hunter Green Apt. 434,Single Family Dwelling,1417,2,1,212699,1027,2013,False,[None],40.833594,-73.874365
9,Molly Porter,175000,37187,761,0.8,3230 Garcia Plaza,Single Family Dwelling,3375,5,3,506339,2446,2013,False,[None],40.841308,-73.866227


Export fake data as Excel file

In [6]:
file_name = 'fake_data.xlsx'
file_path = '/Users/geomando/Downloads/'
fake_df.to_excel(file_path + file_name)

Export fake data as csv

In [7]:
file_name = 'fake_data.csv'
file_path = '/Users/geomando/Downloads/'
fake_df.to_csv(file_path + file_name)