In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from copy import deepcopy
import json

## Dependent and Independent variables

In [2]:
# Only location rows whose 'year' equals this value are kept when loading.
YEAR_TO_USE = 2020

# Business-related regressors; more can be appended here if needed.
BUSINESS_COLUMNS = ['PubPriv', 'EmpHereC', 'NAICS', 'Name_4']

# Demographic regressors.
DEMOGRAPHIC_COLUMNS = [
    'pop_black',
    'pop_asian',
    'pop_white_all',
    'pop_hispanic',
    'median_age',
    'per_capita_income',
    'edu_highschool_percent',
    'edu_bachelor_percent',
]

# Ordered list of independent variables: business columns first, then
# demographic columns.
COLUMNS_TO_USE = BUSINESS_COLUMNS + DEMOGRAPHIC_COLUMNS

# Dependent variable, kept as a one-element list so it can be concatenated
# with COLUMNS_TO_USE and passed directly to DataFrame.drop.
DEPENDENT_COLUMN = ['SalesGrowth']

## Load survival data and location data

In [3]:
# Business survival records.
# low_memory=False makes pandas infer every column's dtype from the whole
# file instead of chunk by chunk. The previous run of this cell emitted a
# warning pointing at this read_csv call (presumably a mixed-type
# DtypeWarning -- confirm); chunk-wise inference can leave a column holding
# a mix of numbers and strings, which would later be one-hot encoded
# inconsistently.
survival_data = pd.read_csv('Survival_LA_City.csv', low_memory=False)

# Location table: keep only the rows for the configured year and rename
# GEOID to 'ZipCode' so it matches the key used to merge with survival_data.
location_data = (
    pd.read_csv('location_census_CA.csv')
    .loc[lambda frame: frame['year'] == YEAR_TO_USE]
    .rename(columns={'GEOID': 'ZipCode'})
)

  survival_data = pd.read_csv('Survival_LA_City.csv')


## Merge the two datasets

In [4]:
# Build the modelling frame in one pass:
#   1. left-join the location columns onto every survival record by 'ZipCode',
#   2. keep only the regressors plus the dependent column,
#   3. discard rows with any missing value,
#   4. one-hot encode the categorical columns, dropping the first level of each.
data = (
    survival_data
    .merge(location_data, on='ZipCode', how='left')
    [COLUMNS_TO_USE + DEPENDENT_COLUMN]
    .dropna()
    .pipe(pd.get_dummies, drop_first=True)
)

## Run the linear model

In [5]:
# Design matrix: every column of `data` except the dependent one, as float64.
# NOTE(review): no constant column is added before fitting, which is why the
# summary reports an *uncentered* R-squared -- confirm that a regression
# through the origin is intended.
X = data.drop(columns=DEPENDENT_COLUMN).to_numpy(dtype=np.float64)

# Target values as float64, with the single-column axis squeezed away.
Y = data[DEPENDENT_COLUMN].to_numpy(dtype=np.float64).squeeze()

# Ordinary least squares fit; `results` is reused below for prediction.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.863
Model:                            OLS   Adj. R-squared (uncentered):              0.863
Method:                 Least Squares   F-statistic:                          1.474e+04
Date:                Wed, 13 Dec 2023   Prob (F-statistic):                        0.00
Time:                        17:25:25   Log-Likelihood:                     -7.8339e+05
No. Observations:              615555   AIC:                                  1.567e+06
Df Residuals:                  615292   BIC:                                  1.570e+06
Df Model:                         263                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### To use the linear model
- Pick a type of business that one wants to open, this will determine the 'Name_4' column
- Encode this type of business as a dummy variable
- Choose the values of 'PubPriv', 'EmpHereC', 'NAICS' as the business owner wants
- Then, for each zip code in each cluster, e.g., cluster 0, we do:
    + Take the demographic information of that zip code and include it as the demographic-related independent variables
    + Input this to the linear model to obtain the prediction
    + Repeat for each zip code in the cluster
    + Then we can rank the different zip codes inside a cluster. For example, if the data corresponding to zip code 90024 leads to the highest predicted sales growth, then the owner should open the business in this zip code (if he/she chooses this cluster to do business)

### Some helper functions

In [6]:
def get_dummy_df(df, columns):
    """One-hot encode ``df`` and align the result to a reference column layout.

    Parameters
    ----------
    df : pandas.DataFrame
        New data point(s) in raw (un-encoded) form. It is not modified.
    columns : sequence of column labels
        The layout to match, e.g. the columns of the encoded training frame.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are exactly ``columns``, in that order.
        Layout columns that the encoding of ``df`` did not produce are filled
        with 0; encoded columns that are not part of the layout are dropped.
    """
    # Encode every level present in `df` (no drop_first here): which dummy
    # columns survive is decided solely by the reference layout below.
    encoded = pd.get_dummies(df)

    # A single reindex adds the missing columns as zeros, drops the extra
    # ones and fixes the order, without copying the frame once per column.
    return encoded.reindex(columns=columns, fill_value=0)

### Load cluster data

In [7]:
# Read the six per-cluster CSV files (cluster_0.csv through cluster_5.csv)
# into a list whose position is the cluster number.
cluster_data = [
    pd.read_csv('cluster_' + str(cluster_id) + '.csv')
    for cluster_id in range(6)
]

### Enter your business information

In [8]:
# Template describing the business to be scored. The first four entries are
# chosen by the user; the demographic entries start as None and are filled
# in from each zip code when the predictions are computed.
# NOTE(review): 621111 does not look like a restaurant NAICS code, so it may
# be inconsistent with the 'Name_4' value below -- confirm before relying on
# the predictions.
new_business = {
    'PubPriv': 'Y',
    'EmpHereC': 2,
    'NAICS': 621111,
    'Name_4': 'Restaurants and Other Eating Places',
    # Demographic placeholders, all initialised to None.
    **dict.fromkeys([
        'pop_black',
        'pop_asian',
        'pop_white_all',
        'pop_hispanic',
        'median_age',
        'per_capita_income',
        'edu_highschool_percent',
        'edu_bachelor_percent',
    ]),
}

### Loop through each cluster and zip code inside the cluster

In [9]:
# Demographic fields that are taken from each zip code's row of the cluster
# table; every other entry of `new_business` is a fixed business attribute.
zip_demographic_columns = [
    'pop_black', 'pop_asian', 'pop_white_all', 'pop_hispanic',
    'median_age', 'per_capita_income',
    'edu_highschool_percent', 'edu_bachelor_percent',
]
business_constants = {
    key: value
    for key, value in new_business.items()
    if key not in zip_demographic_columns
}

# Maps 'cluster_<i>' -> {GEOID: predicted value}. Keys are created from the
# clusters actually loaded instead of a hard-coded list of six names.
cluster_prediction = {}

for i, cluster in enumerate(cluster_data):
    zip_predictions = {}
    cluster_prediction['cluster_' + str(i)] = zip_predictions
    if cluster.empty:
        # Nothing to score; keep the empty entry for this cluster.
        continue

    # One candidate row per zip code: its demographics plus the fixed
    # business attributes broadcast to every row.
    candidates = cluster[zip_demographic_columns].assign(**business_constants)

    # Encode and align to the training layout, then drop the dependent
    # column so the matrix has the same columns the model was fitted on.
    candidates_with_dummy = get_dummy_df(candidates, data.columns)
    features = np.asarray(
        candidates_with_dummy.drop(DEPENDENT_COLUMN, axis=1)
    ).astype(np.float64)

    # Score the whole cluster with a single call to the fitted model.
    predictions = np.asarray(results.predict(features), dtype=np.float64)

    # Take GEOID from the column rather than from iterrows(): iterrows
    # upcasts each row to a common dtype, which can turn an integer GEOID
    # into a float and produce JSON keys such as "90024.0". tolist() yields
    # native Python values, which json can serialise as keys and values.
    zip_predictions.update(zip(cluster['GEOID'].tolist(), predictions.tolist()))

In [10]:
# Within every cluster, order the zip codes from the highest to the lowest
# predicted value (dicts keep insertion order, so the ranking survives).
for cluster_name, zip_scores in cluster_prediction.items():
    ranked = sorted(zip_scores.items(), key=lambda pair: pair[1], reverse=True)
    cluster_prediction[cluster_name] = dict(ranked)

# Write the ranked predictions as indented, human-readable JSON.
with open('cluster_prediction.json', 'w') as fp:
    json.dump(cluster_prediction, fp, indent=4)