## Create Analysis File to run Regressions

In [79]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display

%matplotlib inline 

In [80]:
# Load the combined analysis file produced upstream; the path is relative to the notebook.
INPUT_CSV = 'final_analysis_file_1.csv'
init_df = pd.read_csv(INPUT_CSV)

In [81]:
# Preview the first rows of the raw frame to check which columns were loaded.
init_df.head()

Unnamed: 0.2,Unnamed: 0,Award Mean,Award Sum,Company Count,Score1,Score2,Score5,Score6,Score7,Unnamed: 0.1,...,city_state,creative_employees,creative_establishments,empowerment_zone,performance_amount,performance_count,recipient_amount,recipient_count,regular_employees,regular_establishments
0,0,128725.714286,901080.0,7.0,2.35135,2.67576,2.328623,2.718282,5.046905,0,...,santaclara_ca,2222.0,188.0,0,8192436.0,2.0,174193400.0,873.0,157590.0,6512.0
1,1,0.0,0.0,0.0,1.713367,2.368286,1.425096,2.691711,4.116807,1,...,armonk_ny,,,1,0.0,0.0,0.0,0.0,,
2,2,166188.029412,5650393.0,34.0,2.571461,2.718282,2.348135,2.672531,5.020666,2,...,houston_tx,45344.0,11568.0,1,401235700.0,134.0,2363753000.0,3752.0,2895473.0,219720.0
3,3,238464.416667,5723146.0,24.0,2.718282,2.702366,2.718282,2.546716,5.264998,3,...,sanjose_ca,10299.0,1751.0,0,207137.3,10.0,337722500.0,849.0,427571.0,34442.0
4,4,129946.0,129946.0,1.0,2.240925,2.344083,2.410876,2.464089,4.874966,4,...,boise_id,,,0,0.0,0.0,200692000.0,690.0,,


In [82]:
# Collapse the raw score columns into the two scores kept for the analysis:
# the "invented" score is Score1 + Score2, the "assigned" score is Score7.
init_df['Score_invented'] = init_df['Score1'].add(init_df['Score2'])
init_df['Score_assigned'] = init_df['Score7']

# Discard the raw score columns and the leftover 'Unnamed' columns from the CSV.
obsolete_columns = ['Score1', 'Score2', 'Score5', 'Score6', 'Score7', 'Unnamed: 0', 'Unnamed: 0.1']
init_df = init_df.drop(columns=obsolete_columns)

In [83]:
# (rows, columns) after consolidating the score columns, before missing rows are removed.
init_df.shape

(13870, 16)

In [84]:
# Keep only complete rows: every row with a missing value in any column is
# removed from init_df in place, then the resulting (rows, columns) is shown.
init_df.dropna(axis=0, how='any', inplace=True)
init_df.shape

(8864, 16)

In [85]:
# Print the shape of each year's slice for 2001 through 2014 to see how many
# rows survive per year (a first element of 0 means no rows for that year).
for year in range(2001, 2015):
    rows_for_year = init_df[init_df['Year'] == year]
    print(rows_for_year.shape)

(622, 16)
(723, 16)
(736, 16)
(757, 16)
(758, 16)
(725, 16)
(749, 16)
(750, 16)
(743, 16)
(753, 16)
(753, 16)
(795, 16)
(0, 16)
(0, 16)


In [86]:
# List the columns that remain available for feature construction.
init_df.columns

Index(['Award Mean', 'Award Sum', 'Company Count', 'Year', 'city_state',
       'creative_employees', 'creative_establishments', 'empowerment_zone',
       'performance_amount', 'performance_count', 'recipient_amount',
       'recipient_count', 'regular_employees', 'regular_establishments',
       'Score_invented', 'Score_assigned'],
      dtype='object')

In [87]:
# Derived ratio features. The two denominators are computed once and reused.
total_employees = init_df['creative_employees'] + init_df['regular_employees']
total_establishments = init_df['regular_establishments'] + init_df['creative_establishments']

# Share of employees that are "creative", plus a variant weighted by
# log(total employees) / 10.
init_df['percent_creative_class'] = init_df['creative_employees'] / total_employees
init_df['scaled_perc_creative_class'] = (init_df['percent_creative_class'] * np.log(total_employees)) / 10

# Share of establishments that are "creative", and company count relative to all establishments.
init_df['creative_establishment_ratio'] = init_df['creative_establishments'] / total_establishments
init_df['company_count_perc'] = init_df['Company Count'] / total_establishments

# Average amount per recipient / performance record.
init_df['recipient_mean'] = init_df['recipient_amount'] / init_df['recipient_count']
init_df['performance_mean'] = init_df['performance_amount'] / init_df['performance_count']

# 0/0 gives NaN, but a non-zero numerator over a zero denominator gives +/-inf,
# which fillna(0) alone leaves untouched. Map +/-inf to NaN first so every
# undefined ratio ends up as 0 and no infinity reaches the scaling step.
init_df = init_df.replace([np.inf, -np.inf], np.nan).fillna(0)



In [88]:
# Sanity check: count remaining NaNs per column; the preceding fillna(0) should leave none.

init_df.isna().sum()

Award Mean                      0
Award Sum                       0
Company Count                   0
Year                            0
city_state                      0
creative_employees              0
creative_establishments         0
empowerment_zone                0
performance_amount              0
performance_count               0
recipient_amount                0
recipient_count                 0
regular_employees               0
regular_establishments          0
Score_invented                  0
Score_assigned                  0
percent_creative_class          0
scaled_perc_creative_class      0
creative_establishment_ratio    0
company_count_perc              0
recipient_mean                  0
performance_mean                0
dtype: int64

In [93]:
# Standardise the numeric features separately within each year, then stack the
# per-year results into scaled_df.

# Columns that are carried through unscaled and re-attached after scaling.
passthrough_columns = ['city_state', 'empowerment_zone']
# Columns that get the per-year power transform before scaling.
power_columns = ['Award Mean', 'recipient_mean', 'performance_mean']

scaled_by_year = []
years = list(init_df['Year'].unique())
for year in years:
    # Fresh 0..n-1 index so the scaled frame and the passthrough columns align by position.
    year_df = init_df.loc[init_df['Year'] == year].reset_index(drop=True)

    # x ** (1 / ln(max)): the year's largest value maps to e, and 1 maps to 1.
    for col in power_columns:
        year_df[col] = year_df[col] ** (1 / np.log(year_df[col].max()))
    year_df = year_df.fillna(0)  # NaNs occur at performance and recipient means => just 0 values.

    # 'Year' stays in the feature matrix only to keep its column position;
    # its scaled value is overwritten with the real year below.
    features = year_df.drop(columns=passthrough_columns)

    # Fit once per year. The original fitted and transformed, then refitted on
    # the already-standardised array, which changes nothing beyond rounding.
    scaled = pd.DataFrame(StandardScaler().fit_transform(features), columns=features.columns)

    scaled['city_state'] = year_df['city_state']
    scaled['empowerment_zone'] = year_df['empowerment_zone']
    scaled['Year'] = year

    scaled_by_year.append(scaled)

scaled_df = pd.concat(scaled_by_year, ignore_index=True)

StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)


In [96]:
# Give the three space-separated column names snake_case names like the rest of the columns.
snake_case_names = {
    'Award Mean': 'award_mean',
    'Award Sum': 'award_sum',
    'Company Count': 'company_count',
}
scaled_df = scaled_df.rename(columns=snake_case_names)

In [97]:
# Preview the scaled frame before it is written to disk.
scaled_df.head()

Unnamed: 0,award_mean,award_sum,company_count,Year,creative_employees,creative_establishments,performance_amount,performance_count,recipient_amount,recipient_count,...,Score_invented,Score_assigned,percent_creative_class,scaled_perc_creative_class,creative_establishment_ratio,company_count_perc,recipient_mean,performance_mean,city_state,empowerment_zone
0,0.894098,-0.113118,0.023536,2001,-0.012626,-0.15668,-0.012333,-0.11559,0.007619,0.292107,...,2.446393,3.777137,-0.040364,0.058301,-0.670775,-0.102112,0.947511,1.977185,santaclara_ca,0
1,0.931571,1.148251,1.543761,2001,3.079767,7.763752,6.779082,1.943065,3.864189,2.339072,...,2.864905,3.722613,0.039257,0.374503,0.476911,-0.352623,1.09534,1.933588,houston_tx,1
2,0.985766,1.167573,0.980715,2001,0.566597,0.931162,-0.150312,0.009177,0.29565,0.275043,...,3.073506,4.230321,0.465051,0.754969,0.391391,-0.20861,1.034955,1.331896,sanjose_ca,0
3,0.994786,1.663402,1.318543,2001,-0.070427,-0.165032,0.897621,-0.006419,2.037776,0.32979,...,2.87646,3.667892,0.032812,0.100076,-0.619882,0.987267,1.207631,2.048469,sunnyvale_ca,0
4,0.939301,0.717026,0.92441,2001,21.998783,17.759643,-0.114134,1.77151,1.18176,2.536729,...,2.290799,3.106763,1.330379,2.403113,-0.274577,-0.384821,0.953053,1.321175,newyork_ny,1


In [98]:
# Write the regression-ready frame next to the notebook.
# NOTE(review): no index=False is passed, so the row index is presumably written as an
# extra unnamed leading column; confirm whether downstream code relies on it before changing.
OUTPUT_CSV = 'regression_ready_df.csv'
scaled_df.to_csv(OUTPUT_CSV)