In [3]:
from faker import Factory
from faker import Faker
import pandas as pd
import names
import random
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform, norm, beta, weibull_min, rv_discrete
from scipy.spatial.distance import cdist, euclidean

%matplotlib inline

In [26]:
def fake_inputs(tenure, income_values, income_counts, lat0, long0, radius):
    
    num_entities = len(income_values)
    fake = Faker()
    
    def rand_lat_long(lat0, long0, radius, size):                        #Choose your own radius
        radiusInDegrees=radius/111300            
        r = radiusInDegrees
        lat = []
        long = []

        for i in range(1,num_entities):                 #Choose number of Lat Long to be generated

            u = float(random.uniform(0.0,1.0))
            v = float(random.uniform(0.0,1.0))

            w = r * math.sqrt(u)
            t = 2 * math.pi * v
            x = w * math.cos(t) 
            y = w * math.sin(t)

            lat.append(x + lat0)
            long.append(y + long0)

        return lat, long

    def calc_income(income_series, count_series):
        x = income_series
        p_x = count_series / count_series.sum()
        income_dist = rv_discrete(name='Income', values=(x, p_x))
        return income_dist.rvs()

    def calc_savings(income):
        savings_dist = beta(a=2, b=1, loc=5000, scale = 200000)
        max_pdf = savings_dist.pdf(205000)
        inc_pdf = savings_dist.pdf(income)
        savings_rate = 0.25*(inc_pdf / max_pdf)
        return int(savings_rate * income)

    def calc_house_value(income):
        min_house_price_multiplier = 2
        return int(income*uniform.rvs(loc = min_house_price_multiplier, scale = 1))

    def calc_house_area(value):
        dollar_per_sf = 150
        if int(value / dollar_per_sf) < 500:
            return 500
        else:
            return int(value / dollar_per_sf)

    def calc_mortgage_payment(value):
        monthly_rate = 0.05/12
        num_payments = 30*12
        down_payment = 0.1
        loan_value = value - value * down_payment
        return int(-np.pmt(monthly_rate,num_payments, loan_value))

    def set_occupancy(income):
        if income >= 50000:
            return 'Single Family Dwelling'
        elif uniform.rvs(0,1) >= 0.2:
            return 'Single Family Dwelling'
        else:
            return 'Mobile Home'

    def set_listing():
        if uniform.rvs(0,1) >= 0.3333:
            return False
        else:
            return True

    def calc_credit():
        return int(uniform.rvs(550, 300))

    def calc_insurance(income):
        insurance_dist = beta(a=2, b=1, loc=5000, scale = 200000)
        max_pdf = insurance_dist.pdf(205000)
        inc_pdf = insurance_dist.pdf(income)
        if inc_pdf / max_pdf > 0.5:
            return 0.8
        else:
            return 0.0

    def calc_bedrooms(area):
        if area <= 500:
            return 0
        else:
            bedrooms_pct = 0.3
            avg_sf = 200
            return int((bedrooms_pct * area) / avg_sf ) 

    def calc_bathrooms(area):
        if area <= 500:
            return 1
        else:
            bathrooms_pct = 0.1
            avg_sf = 100
            return max(int((bathrooms_pct * area) / avg_sf ), 1)
    
    def calc_damage(origin, target):
        distances = cdist(origin, target)[0]
        bins = list(np.linspace(0,max(distances),6))
        labels = ['Complete', 'Extensive', 'Moderate', 'Slight','None']
        damage = pd.cut(distances, bins=bins, labels=labels).get_values()
        return damage
    
    index = np.linspace(0,num_entities-1,10, dtype=int)
    
    lat, long = rand_lat_long(lat0, long0, radius, num_entities)

    names = []
    addresses = []
    incomes = []
    savings = []
    house_values = []
    house_areas = []
    house_ages = []
    mortgages = []
    occupancies = []
    listings = []
    credit_scores = []
    insurance = []
    bedrooms = []
    bathrooms = []
    landlords = []
    tenant_incomes = []
    tenant_insurance = []
    tenant_savings = []
    tenant_credit = []
    damages = []

    for i in range(1,num_entities):
        names.append(fake.name())
        addresses.append(fake.street_address())
        incomes.append(calc_income(income_values, income_counts))
        savings.append(int(calc_savings(incomes[i-1])))
        house_values.append(calc_house_value(incomes[i-1]))
        house_areas.append(calc_house_area(house_values[i-1]))
        house_ages.append(int(fake.year()))
        mortgages.append(calc_mortgage_payment(house_values[i-1]))
        occupancies.append(set_occupancy(incomes[i-1]))
        listings.append(set_listing())
        credit_scores.append(calc_credit())
        insurance.append(calc_insurance(incomes[i-1]))
        bedrooms.append(calc_bedrooms(house_areas[i-1]))
        bathrooms.append(calc_bathrooms(house_areas[i-1]))
        damages.append(calc_damage([(lat0, long0)], [(lat[i-1], long[i-1])]))
       
        if tenure.lower() == 'rent' or tenure.lower() == 'renter':
            landlords.append(fake.name())
            tenant_incomes.append(calc_income(income_values, income_counts))
            tenant_insurance.append(calc_insurance(tenant_incomes[i-1]))
            tenant_savings.append(int(calc_savings(tenant_incomes[i-1])))
            tenant_credit.append(calc_credit())

    if tenure.lower() == 'own' or tenure.lower() == 'owner':
        df = pd.DataFrame({'Name': names, 'Address': addresses, 'Income': incomes, 'Owner Savings': savings, 
                           'Owner Insurance': insurance, 'Owner Credit': credit_scores,
                           'Occupancy': occupancies, 'Value': house_values, 'Monthly Cost': mortgages, 'Area': house_areas, 
                           'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Year Built': house_ages, 'Listing': listings, 
                           'Damage State': damages, 'Latitude': lat, 'Longitude': long},
                          columns=['Name', 'Income', 'Owner Savings', 'Owner Credit', 'Owner Insurance', 
                                   'Address', 'Occupancy', 'Area', 'Bedrooms', 'Bathrooms', 'Value', 'Monthly Cost', 
                                   'Year Built', 'Listing', 'Damage State', 'Latitude', 'Longitude'])
    
    elif tenure.lower() == 'rent' or tenure.lower() == 'renter':
         df = pd.DataFrame({'Name': names, 'Tenant Income': tenant_incomes, 'Tenant Insurance': tenant_insurance,
                            'Tenant Savings': tenant_savings, 'Tenant Credit': tenant_credit,
                            'Address': addresses, 'Landlord': landlords, 'Income': incomes, 'Owner Savings': savings, 
                           'Owner Insurance': insurance, 'Owner Credit': credit_scores,
                           'Occupancy': occupancies, 'Value': house_values, 'Monthly Cost': mortgages, 'Area': house_areas, 
                           'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Year Built': house_ages, 'Listing': listings,
                            'Damage State': damages, 'Latitude': lat, 'Longitude': long},
                          columns=['Name', 'Tenant Income', 'Tenant Insurance', 'Tenant Savings', 'Tenant Credit',
                                   'Landlord', 'Income', 'Owner Savings', 'Owner Credit', 'Owner Insurance', 
                                   'Address', 'Occupancy', 'Area', 'Bedrooms', 'Bathrooms', 'Value', 'Monthly Cost', 
                                   'Year Built', 'Listing', 'Damage State', 'Latitude', 'Longitude'])
    
    else:
        print('Specified entity class not supported.')
        return
    
    return df

In [27]:
income_df = pd.read_excel('income.xlsx')
income_values = income_df['Income']
income_counts = income_df['Count']
num_entities = 10
lat0 = 40.84
long0 = -73.87
radius = 1000

fake_df = fake_inputs('own', income_values, income_counts, lat0, long0, radius)

fake_df.head(10)

Unnamed: 0,Name,Income,Owner Savings,Owner Credit,Owner Insurance,Address,Occupancy,Area,Bedrooms,Bathrooms,Value,Monthly Cost,Year Built,Listing,Damage State,Latitude,Longitude
0,Summer Maddox,5000,0,737,0.0,251 Joseph Road,Mobile Home,500,0,1,13349,64,2001,False,[None],40.841793,-73.877905
1,Erica Miller,25000,625,552,0.0,8167 Thompson Grove,Single Family Dwelling,500,0,1,54359,262,1995,False,[None],40.846497,-73.873548
2,Emily Brandt,10000,62,817,0.0,51483 Eric Lakes Suite 083,Single Family Dwelling,500,0,1,29649,143,2016,False,[None],40.833008,-73.871267
3,Sheri Macdonald,15000,187,637,0.0,6707 Wilcox Station,Single Family Dwelling,500,0,1,34040,164,2005,False,[None],40.846701,-73.864639
4,Dorothy Hernandez,140000,23625,665,0.8,11470 Wilson Village Suite 242,Single Family Dwelling,2451,3,2,367673,1776,2009,False,[None],40.839898,-73.864891
5,Linda Garcia,60000,4125,775,0.0,615 Michael Meadow,Single Family Dwelling,902,1,1,135308,653,1978,False,[None],40.846012,-73.866496
6,Benjamin Butler,15000,187,692,0.0,780 Jackson Drives,Single Family Dwelling,500,0,1,30984,149,1988,True,[None],40.84293,-73.872659
7,Benjamin Kelly,100000,11874,668,0.0,62124 David Estates Suite 408,Single Family Dwelling,1516,2,1,227516,1099,1974,True,[None],40.843956,-73.869556
8,Roberta Baker,35000,1312,764,0.0,613 Molina Stream,Single Family Dwelling,697,1,1,104573,505,1983,True,[None],40.841084,-73.877018
9,Miranda Evans,75000,6562,615,0.0,684 Lopez Squares Suite 359,Single Family Dwelling,1372,2,1,205889,994,1972,False,[None],40.843092,-73.866815
