## REVENUE DATA GENERATOR

### Importing essential libraries

In [3]:

from random import randint, seed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

### Importing the relevant datasets

In [4]:
# Load the randomly generated (mock) local book store data and preview its first rows

mock_data = pd.read_csv("mock_data.csv")
mock_data.head()

Unnamed: 0.1,Unnamed: 0,ISBN,store_name,gender_of_owner
0,0,357765302-7,Waters-Wunsch,Female
1,1,998155268-2,VonRueden-Jenkins,Male
2,2,515154998-7,"Stanton, Gislason and Deckow",Female
3,3,997744982-1,Auer-Lind,Female
4,4,484474585-9,Zulauf and Sons,Female


In [5]:
# Dropping the unnecessary 'Unnamed: 0' feature (it mirrors the row index in the preview above).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.

mock_data = mock_data.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Previewing the first five rows of mock_data to check the result of the drop

mock_data.head()

Unnamed: 0,ISBN,store_name,gender_of_owner
0,357765302-7,Waters-Wunsch,Female
1,998155268-2,VonRueden-Jenkins,Male
2,515154998-7,"Stanton, Gislason and Deckow",Female
3,997744982-1,Auer-Lind,Female
4,484474585-9,Zulauf and Sons,Female


In [7]:
# Importing the Bay Area zip code data; its ZIP column is used later to assign store locations

bay_zip = pd.read_csv("bayarea_zipcodes.csv")
bay_zip.head()

Unnamed: 0,PO_NAME,the_geom,ZIP,STATE,Area__,Length__
0,NAPA,MULTIPOLYGON (((-122.10329200180091 38.5132829...,94558,CA,12313260000.0,995176.225313
1,FAIRFIELD,MULTIPOLYGON (((-121.947475002335 38.301511000...,94533,CA,991786100.0,200772.556587
2,DIXON,MULTIPOLYGON (((-121.65335500334429 38.3133870...,95620,CA,7236950000.0,441860.2014
3,SONOMA,MULTIPOLYGON (((-122.406843003057 38.155681999...,95476,CA,3001414000.0,311318.546326
4,NAPA,MULTIPOLYGON (((-122.29368500225117 38.1552379...,94559,CA,1194302000.0,359104.646602


### Systematic random revenue data generation

In [8]:
# Initial random revenue generation for the year 2010 -> considering local businesses with $50,000 - $200,000 revenue in 2010

# Seed Python's `random` module so that Restart & Run All regenerates the same dataset.
# Every later randint() call in this notebook draws from this same seeded stream.
seed(42)

# One draw per store (one per row of mock_data); randint includes both end points.
revenue_2010 = [randint(50000, 200000) for _ in range(mock_data.shape[0])]

In [9]:
# A function to generate a new list of revenue for local businesses based on the previous year's revenue

def gen_data(lst):
    """Return next year's revenue for every store in ``lst``.

    Each store's revenue grows by a random amount whose range depends on the
    store's previous-year revenue, so bigger stores gain comparatively more:

    * above 150,000   -> +10,000 to +20,000
    * above 100,000   -> +10,000 to +15,000
    * otherwise       -> +5,000  to +10,000

    Args:
        lst: Sequence of previous-year revenues, one entry per store.
            Any length is accepted, including an empty sequence.

    Returns:
        A new list of the same length as ``lst`` holding the current-year
        revenues. ``lst`` itself is not modified.
    """
    new_data = []

    # Iterate over the input itself rather than a fixed count, so the function
    # works for any number of stores.
    for previous in lst:
        if previous > 150000:
            growth = randint(10000, 20000)
        elif previous > 100000:
            growth = randint(10000, 15000)
        else:
            growth = randint(5000, 10000)
        new_data.append(previous + growth)

    return new_data

In [10]:
# Generating revenues for 2011: each store's 2010 revenue plus a random yearly increase from gen_data
revenue_2011 = gen_data(revenue_2010)

In [11]:
# Generating revenues for 2012: each store's 2011 revenue plus a random yearly increase from gen_data
revenue_2012 = gen_data(revenue_2011)

In [12]:
# Generating revenues for 2013: each store's 2012 revenue plus a random yearly increase from gen_data
revenue_2013 = gen_data(revenue_2012)

In [13]:
# Generating revenues for 2014: each store's 2013 revenue plus a random yearly increase from gen_data
revenue_2014 = gen_data(revenue_2013)

In [14]:
# Generating revenues for 2015: each store's 2014 revenue plus a random yearly increase from gen_data
revenue_2015 = gen_data(revenue_2014)

In [15]:
# Generating revenues for 2016: each store's 2015 revenue plus a random yearly increase from gen_data
revenue_2016 = gen_data(revenue_2015)

In [16]:
# Generating revenues for 2017: each store's 2016 revenue plus a random yearly increase from gen_data
revenue_2017 = gen_data(revenue_2016)

In [17]:
# Generating revenues for 2018: each store's 2017 revenue plus a random yearly increase from gen_data
revenue_2018 = gen_data(revenue_2017)

In [18]:
# Generating revenues for 2019: each store's 2018 revenue plus a random yearly increase from gen_data
revenue_2019 = gen_data(revenue_2018)

In [19]:
# Generating revenues for 2020

# Unlike the earlier years, 2020 is modelled as a loss: every store's 2019 revenue
# is reduced by a random amount, and the loss range grows with the store's size.

revenue_2020 = []
for prior in revenue_2019:
    # Pick the inclusive loss range for this store's 2019 revenue tier.
    if prior < 150000:
        loss_low, loss_high = 40000, 70000
    elif prior < 200000:
        loss_low, loss_high = 60000, 100000
    else:
        loss_low, loss_high = 100000, 150000

    revenue_2020.append(prior - randint(loss_low, loss_high))


### Data Aggregations

In [20]:
# Extracting the Bay Area zip codes from the zip code dataset and converting them to a list

zip_codes = list(bay_zip['ZIP'])
len(zip_codes)

187

In [21]:
# Assigning local bookstores with random zip codes for prototype
# NOTE: the name `zip` shadows Python's built-in zip(); it is kept unchanged because
# the dataframe-building cell below reads this variable by that name.

zip = list()

# Derive the upper bound from the data instead of hard-coding it, so the cell
# keeps working (and keeps covering every zip code) if the zip code file changes.
last_index = len(zip_codes) - 1

for i in range(len(revenue_2020)):
    k = randint(0, last_index)
    zip.append(zip_codes[k])

In [22]:
# Creating a dataframe of the revenue lists and zip codes

# (column name, values) pairs in the order the columns should appear in the table.
column_pairs = [
    ('2010_revenue_till_june', revenue_2010),
    ('2011_revenue_till_june', revenue_2011),
    ('2012_revenue_till_june', revenue_2012),
    ('2013_revenue_till_june', revenue_2013),
    ('2014_revenue_till_june', revenue_2014),
    ('2015_revenue_till_june', revenue_2015),
    ('2016_revenue_till_june', revenue_2016),
    ('2017_revenue_till_june', revenue_2017),
    ('2018_revenue_till_june', revenue_2018),
    ('2019_revenue_till_june', revenue_2019),
    ('2020_revenue_till_june', revenue_2020),
    ('zip_codes', zip),  # `zip` here is the list of assigned zip codes, not the built-in
]

revenue_df = pd.DataFrame(dict(column_pairs))
revenue_df.head()

Unnamed: 0,2010_revenue_till_june,2011_revenue_till_june,2012_revenue_till_june,2013_revenue_till_june,2014_revenue_till_june,2015_revenue_till_june,2016_revenue_till_june,2017_revenue_till_june,2018_revenue_till_june,2019_revenue_till_june,2020_revenue_till_june,zip_codes
0,100819,112509,123944,137538,149566,164316,179411,198312,218097,230448,129431,94621
1,113084,126738,139531,153422,167274,186978,197318,215656,231218,248714,110034,94546
2,146726,158604,177580,194941,208128,218759,230549,247263,263878,279697,173066,94116
3,115260,127096,138848,149751,161231,176138,187878,198611,214950,231737,102909,94970
4,119924,130434,141699,154061,165971,176935,190425,204749,214778,228993,123748,94558


In [23]:
# Concatenating the mock_data dataset and the newly formed dataset (revenue_df) side by side (axis=1 joins columns)

revenue_data = pd.concat([mock_data, revenue_df], axis=1)

In [24]:
# Taking a look at the first five local book stores in the combined dataset
revenue_data.head()

Unnamed: 0,ISBN,store_name,gender_of_owner,2010_revenue_till_june,2011_revenue_till_june,2012_revenue_till_june,2013_revenue_till_june,2014_revenue_till_june,2015_revenue_till_june,2016_revenue_till_june,2017_revenue_till_june,2018_revenue_till_june,2019_revenue_till_june,2020_revenue_till_june,zip_codes
0,357765302-7,Waters-Wunsch,Female,100819,112509,123944,137538,149566,164316,179411,198312,218097,230448,129431,94621
1,998155268-2,VonRueden-Jenkins,Male,113084,126738,139531,153422,167274,186978,197318,215656,231218,248714,110034,94546
2,515154998-7,"Stanton, Gislason and Deckow",Female,146726,158604,177580,194941,208128,218759,230549,247263,263878,279697,173066,94116
3,997744982-1,Auer-Lind,Female,115260,127096,138848,149751,161231,176138,187878,198611,214950,231737,102909,94970
4,484474585-9,Zulauf and Sons,Female,119924,130434,141699,154061,165971,176935,190425,204749,214778,228993,123748,94558


### Saving the final generated dataset

In [25]:
# Saving the dataframe as a csv file
# NOTE(review): to_csv writes the row index as an extra unnamed column by default;
# consider index=False if downstream readers do not expect that column — confirm before changing.
revenue_data.to_csv('generated_revenue_data.csv')