# Visualizing Income Inequality in the United States

Authors: Sang Truong and Dr. Humberto Barreto, Department of Economics and Management

DePauw University, Greencastle, Indiana, 46135, Summer 2019

References:

*   Color generator: https://www.strangeplanet.fr/work/gradient-generator/index.php

*   AmChart documentation:  https://docs.amcharts.com/3/javascriptcharts/AmGraph

*   Jackblun's graph: https://jackblun.github.io/Globalinc/html/fig_1980.html

# Deflator generator

In [0]:
import pandas as pd
import json
from collections import OrderedDict
from google.colab import drive
from scipy import stats
# Mount Google Drive and point at the project's input folder
drive.mount('/content/gdrive')
in_path = 'gdrive/My Drive/Colab Notebooks/code/'

# Load the CPI table and the raw records used to build the deflator
cpi = pd.read_csv(in_path + 'cpi_deflator.csv')
raw_deflator = pd.read_csv(in_path + "raw_deflator.csv")

# Keep only rows with RENTGRS != 0 that have RELATE == 1
# (the household head, per the original note)
has_rent = raw_deflator.RENTGRS != 0
is_head = raw_deflator.RELATE == 1
raw_deflator = raw_deflator[has_rent & is_head]

# Generate constants
# State codes (STATEFIP) from 1 through 56; the codes 3, 7, 14, 43 and 52
# are not part of the list.
stateList = [fips for fips in range(1, 57) if fips not in (3, 7, 14, 43, 52)]

# Sample years: 1970, 1980, 1990, then every year from 2000 through 2017
yearList = [1970, 1980, 1990] + list(range(2000, 2018))

# Generate deflator grid: one row per entry of yearList (positional index
# 0..len(yearList)-1) and one column per state code in stateList. Each cell
# holds the median RENTGRS of that state's records in that year, or NaN when
# there are no records for the pair.
#
# A single groupby replaces the nested year/state loop, which re-filtered the
# whole frame and took the median of every column for each cell. It also
# yields a float grid instead of an object-dtype one.
deflator = (raw_deflator
            .groupby(['YEAR', 'STATEFIP'])['RENTGRS']
            .median()
            .unstack('STATEFIP')
            .reindex(index = yearList, columns = stateList)
            .reset_index(drop = True))

# Drop the 'STATEFIP' axis label left behind by unstack so the columns are the
# plain state codes
deflator.columns = list(stateList)

# Calculate COLI for every state column: 0.44 * median rent + 0.56.
# The state columns are the first len(stateList) columns of the grid; casting
# to float keeps the arithmetic independent of how the grid was filled.
n_states = len(stateList)
coli = 0.44 * deflator.iloc[:, :n_states].astype(float) + 0.56

# Compute average of COLI across the states, one value per year (row)
average = coli.mean(axis = 1)

# Normalize COLI with average COLI. The AVERAGE column keeps the
# un-normalized yearly mean.
deflator = coli.div(average, axis = 0)
deflator['AVERAGE'] = average

# Label the rows with their year and also expose the year as a column,
# which the regression step uses as its x variable
deflator.index = yearList
deflator['Year'] = yearList

deflator.to_csv(in_path + 'original_COLI.csv')
# The sheet is read back on purpose: after the round trip the state column
# labels are strings, which is how the regression step looks them up
# (deflator[str(i)]), and every value is parsed as a number again.
deflator = pd.read_csv(in_path + 'original_COLI.csv', index_col = 0)

# Generate regression grid: one row per state code with flat column labels.
# (Passing a nested list for `columns` would build a one-level MultiIndex.)
reg = pd.DataFrame(index = stateList, columns = ['Coefficient', 'Intercept', 'Rsquared'])

# Fit COLI = Coefficient * Year + Intercept for every state and keep the
# slope, the intercept and the squared correlation coefficient
for i in stateList:
  fit = stats.linregress(deflator['Year'], deflator[str(i)])
  reg.loc[i, 'Coefficient'] = fit.slope
  reg.loc[i, 'Intercept'] = fit.intercept
  reg.loc[i, 'Rsquared'] = fit.rvalue**2

# Export the regression table
reg.to_csv(in_path + 'reg.csv')
  
# # Generate prediction grid
# pred = pd.DataFrame(index = range (1978, 2019), columns = [['YEAR', 'STATEFIP', 'PRED_COLI']])  
# for y in range (1978, 2019):
#   line = pred[pred.index == y].copy()
#   line.name = y
#   #50 repetition, for 51 states, since we already have 1 line. 
#   for s in range (0, 50): pred = pred.append(line)
  
# pred.reset_index(drop = True, inplace = True)    

# i = 0
# for y in range (1978, 2019):
#   for s in stateList:
#     pred.loc[i, 'YEAR'] = y
#     pred.loc[i, 'STATEFIP'] = s
#     # If condition to use the original data for year in YearList
#     if y in yearList: pred.loc[i, 'PRED_COLI'] = deflator.loc[y, str(s)]
#     else: pred.loc[i, 'PRED_COLI'] = (reg.loc[s, 'Coefficient'])*y + (reg.loc[s, 'Intercept'])    
#     i = i + 1
    
# for i in range(0, len(pred)):
#   for y in range(1978, 2019):
#     pred[pred.YEAR == y].mean().YEAR
#     pred.iloc[i, 2] =  pred.iloc[i, 2]

# # # Export deflator file
# # pred.to_csv(in_path + 'pred.csv')

# # CAN'T MERGE DATAFRAME HERE DUE TO SOME CRYPTIC ERROR -- GO TO EXCEL AND MERGE IT -- FIGURE OUT LATER.
# # There must be something wrong about the datatype of 2 dataframes.
  
# # # Merge predicted COLI with CPI
# # pred = pd.merge(pred, cpi, on = ['YEAR'])

# # # Normalize COLI with CPI
# # for i in range (len(pred)): pred.iloc[i, 2] = pred.iloc[i, 2]/pred.iloc[i, 3]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data processor

## year_d

In [0]:
import pandas as pd
import json
from collections import OrderedDict
from google.colab import drive
# Mount Google Drive and point at the project's input and output folders
drive.mount('/content/gdrive')
out_path = 'gdrive/My Drive/Colab Notebooks/oneYear/year_d/'
in_path = 'gdrive/My Drive/Colab Notebooks/code/'

# Import the raw records, keep the rows with HFLAG != 1,
# then discard the HFLAG column itself
raw = pd.read_csv(in_path + "raw.csv")
raw = raw[raw.HFLAG != 1].drop(columns = ['HFLAG'])

# Import the two code tables (state codes and color codes, per the original
# note) and the deflator table
codeFirst_oneYear, codeThird_oneYear = (
    pd.read_csv(in_path + name, index_col = 0)
    for name in ("codeFirst_oneYear.csv", "codeThird_oneYear.csv"))
deflator = pd.read_csv(in_path + "COLI.csv")

# Generate constants
# State codes (STATEFIP) from 1 through 56; the codes 3, 7, 14, 43 and 52
# are not part of the list.
stateList = [fips for fips in range(1, 57) if fips not in (3, 7, 14, 43, 52)]

# Cumulative-weight thresholds reported for every state (0.50 is the median)
decile = [pct / 100 for pct in (5, 15, 25, 35, 45, 50, 55, 65, 75, 85, 95)]

# Blank placeholder row for the result grid: one '' per state column
value = [''] * len(stateList)

# Iterate through each year
for y in range(1978, 1979):

  # Generate result grid, decile-column: one row per entry of `decile` and one
  # column per state, pre-filled with '' so cells that never get a value stay
  # blank in the exported files
  result = pd.DataFrame(columns=stateList)
  for i in range(len(decile)): result.loc[i] = value

  # Generate year dataframe. reset_index returns an independent frame labelled
  # 0..n-1, so the steps below do not depend on where this year's rows sit
  # inside `raw` and no longer write to a slice of `raw`.
  year = raw[raw.YEAR == y].reset_index(drop = True)

  # Generate effective household size.
  # SIZE is each person's weight: 1 when PERNUM == 1, otherwise 0.7 when
  # AGE > 16 and 0.5 for everyone else.
  size = pd.Series(0.5, index = year.index)
  size[year.AGE > 16] = 0.7
  size[year.PERNUM == 1] = 1.0
  # A household is a PERNUM == 1 row plus the rows that follow it up to the
  # next PERNUM == 1 row (this relies on the row order of raw.csv).
  # EFFHSIZE is the sum of SIZE over the household; only the value on the
  # PERNUM == 1 row is used below.
  household = (year.PERNUM == 1).cumsum()
  year.insert(0, 'SIZE', size)
  year.insert(0, 'EFFHSIZE', size.groupby(household).transform('sum'))

  # Eliminate observations that has PERNUM != 1
  person = year[year.PERNUM == 1].drop(columns = ['PERNUM'])

  # Merge 2 file: for raw, every row that has STATEFIP and YEAR match with
  # that row in deflator will get the same deflator value.
  person = pd.merge(person, deflator, on = ["YEAR", "STATEFIP"])

  # Normalize HHINCOME to DHHINCOME (deflate with COLI and EFFSIZE)
  person.insert(0, 'DHHINCOME',
                person.HHINCOME / (person.DEFLATOR * person.EFFHSIZE))

  # Iterate through each state; c is the state's column in the result grid
  for c, fips in enumerate(stateList):
    # Generate state dataframe, ordered by DHHINCOME. Everything below works
    # on positions of the SORTED rows (the cumulative weight must follow the
    # income order, not the original row order).
    state = person[person.STATEFIP == fips].sort_values('DHHINCOME')

    # A state without records for this year keeps its blank column
    if state.empty: continue

    # Cumulative share of the household weight, total computed once
    incomes = state.DHHINCOME.values
    percent = (state.ASECWTH.cumsum() / state.ASECWTH.sum()).values

    # Calculate decile: the income of the first household whose cumulative
    # share exceeds the threshold
    for r, d in enumerate(decile):
      reached = incomes[percent > d]
      if len(reached) > 0:
        result.iloc[r, c] = reached[0]

  # Transpose result table: column-decile
  result = result.transpose()

  # Type casting result.index (type casting STATEFIP) to integer
  result.index = result.index.map(int)

  # Merge state dataframe with code dataframe
  result = pd.merge(codeFirst_oneYear, result, left_index = True, right_index = True)
  result = pd.merge(result, codeThird_oneYear, left_index = True, right_index = True)

  # Compute state population (sum of ASECWT over every person of the state)
  result.insert(0, 'POP','')
  for fips in stateList:
    result.loc[fips, 'POP'] = year.loc[year.STATEFIP == fips, 'ASECWT'].sum()

  # Normalized state population: multiples of the smallest state, rounded
  smallest_pop = result['POP'].min()
  result.insert(0, 'NORMPOP', '')
  for fips in stateList:
    result.loc[fips, 'NORMPOP'] = round(result.loc[fips, 'POP']/smallest_pop)

  # Replicate each state's dataline with its respected replication number.
  # The copies carry an empty Label so the label appears once per state.
  # All copies are collected first and concatenated once, in the same order
  # repeated appends would produce.
  pieces = [result]
  for fips in stateList:
    rep = int(result.loc[fips,'NORMPOP'] - 1)
    line = result[result.index == fips].copy()
    line.loc[fips, 'Label'] = ''
    pieces.extend([line] * rep)
  result = pd.concat(pieces)

  result = result.drop(columns = ['POP', 'NORMPOP'])

  # Sort the result by median. The median (0.50) is entry 5 of `decile`, i.e.
  # the integer column 5 after the transpose. The string label '5' used
  # originally is still preferred when such a column exists.
  median_col = '5' if '5' in result.columns else 5
  result = result.sort_values(by=[median_col], ascending = True)

  # Rename index column and role
  result = result.rename(index = str, columns = {0: "5p", 1: "15p", 2: "25p", 3: "35p", 4: "45p", 5: "50p",
                                                6: "55p", 7: "65p", 8: "75p", 9: "85p", 10: "95p"})

  # Export result grid
  result.to_csv(out_path + str(y) + '_d.csv')

  # Convert dataframe to JSON, keeping the column order of every record
  records = json.loads(result.to_json(orient = 'records'), object_pairs_hook = OrderedDict)

  # Make JSON format readable
  pretty = json.dumps(records, indent = 4, sort_keys = False)

  # Save JSON to text format
  with open(out_path + str(y) + '_d.txt', 'w') as f:
    f.write(pretty)

  # Glue data with html environment: header template + data + footer template
  filenames = [in_path + 'first_d_oneYear.txt', out_path + str(y) + '_d.txt',
              in_path + 'third.txt']
  with open(out_path + str(y) + '_d.html', 'w') as outfile:
    for name in filenames:
      with open(name) as infile:
        outfile.write(infile.read())

Mounted at /content/gdrive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


## year_p

There are 5 differences between d_files and p_files:

*   Deciles
*   Loop to construct result grid sheet (0 to 98 instead of 0 to 11)
*   Use first_p instead of first_d (look at pCodeGenerator excel file for detail about p_code generator)
*   out_path: use oneYear/year_p/ instead of oneYear/year_d/
*   Change name when export data

In [0]:
import pandas as pd
import json
from collections import OrderedDict
from google.colab import drive
# Mount Google Drive and point at the project's input and output folders
drive.mount('/content/gdrive')
out_path = 'gdrive/My Drive/Colab Notebooks/oneYear/year_p/'
in_path = 'gdrive/My Drive/Colab Notebooks/code/'

# Import the two code tables (state codes and color codes, per the original
# note), the deflator table and the raw records
codeFirst_oneYear, codeThird_oneYear = (
    pd.read_csv(in_path + name, index_col = 0)
    for name in ("codeFirst_oneYear.csv", "codeThird_oneYear.csv"))
deflator = pd.read_csv(in_path + "COLI.csv")
# NOTE(review): unlike the year_d cell, rows with HFLAG == 1 are not removed
# here -- confirm this is intended.
raw = pd.read_csv(in_path + "raw.csv")

# Generate constants
# State codes (STATEFIP) from 1 through 56; the codes 3, 7, 14, 43 and 52
# are not part of the list.
stateList = [fips for fips in range(1, 57) if fips not in (3, 7, 14, 43, 52)]

# Cumulative-weight thresholds 0.02, 0.03, ..., 0.99 (98 values)
decile = [pct / 100 for pct in range(2, 100)]

# Blank placeholder row for the result grid: one '' per state column
value = [''] * len(stateList)

# Iterate through each year
for y in range(1978, 1979):

  # Generate result grid, decile-column: one row per entry of `decile` and one
  # column per state, pre-filled with '' so cells that never get a value stay
  # blank in the exported files
  result = pd.DataFrame(columns = stateList)
  for i in range(len(decile)): result.loc[i] = value

  # Generate year dataframe
  year = raw[raw.YEAR == y]

  # Eliminate observations that has PERNUM != 1
  person = year[year.PERNUM == 1]

  # Merge 2 file: for raw, every row that has STATEFIP and YEAR match with
  # that row in deflator will get the same deflator value.
  person = pd.merge(person, deflator, on = ["YEAR","STATEFIP"])

  # Generate deflated household income column (HHINCOME / DEFLATOR).
  # Columns are addressed by name, so no particular column order is needed.
  person['DHHINCOME'] = person.HHINCOME / person.DEFLATOR

  # Iterate through each state; c is the state's column in the result grid
  for c, fips in enumerate(stateList):
    # Generate state dataframe, sorted by HHINCOME
    state = person[person.STATEFIP == fips].sort_values('HHINCOME')

    # A state without records for this year keeps its blank column
    if state.empty: continue

    # Cumulative share of the household weight. Both the running sum and the
    # total use ASECWTH, so the share ends at 1; the total is computed once.
    incomes = state.DHHINCOME.values
    percent = (state.ASECWTH.cumsum() / state.ASECWTH.sum()).values

    # Calculate decile: the deflated income of the first household whose
    # cumulative share exceeds the threshold
    for r, d in enumerate(decile):
      reached = incomes[percent > d]
      if len(reached) > 0:
        result.iloc[r, c] = reached[0]

  # Transpose result table: column-decile
  result = result.transpose()

  # Type casting result.index (type casting STATEFIP) to integer
  result.index = result.index.map(int)

  # Merge state dataframe with code dataframe
  result = pd.merge(codeFirst_oneYear, result, left_index = True, right_index = True)
  result = pd.merge(result, codeThird_oneYear, left_index = True, right_index = True)

  # State population: sum of ASECWT over every person of the state, matched
  # to the rows by their state code rather than by row position
  pop_by_state = year.groupby('STATEFIP')['ASECWT'].sum()
  result['POP'] = [pop_by_state.get(fips, 0) for fips in result.index]

  # Normalized population: multiples of the smallest state, rounded
  smallest_pop = result['POP'].min()
  result['NORMPOP'] = [round(pop/smallest_pop) for pop in result['POP']]

  # Replicate each state's row NORMPOP - 1 times. All copies are collected
  # first and concatenated once, in the same order repeated appends would give.
  pieces = [result]
  for fips in stateList:
    rep = int(result.loc[int(fips),'NORMPOP'] - 1)
    # The following statement need .copy() at the end for explicit reason
    # More information: https://www.dataquest.io/blog/settingwithcopywarning/
    line = result[result.index == fips].copy()
    # Blank the cell at column position 99 on the copies
    # (presumably the state label, as in the year_d cell -- TODO confirm)
    line.iloc[0, 99] = ''
    pieces.extend([line] * rep)
  result = pd.concat(pieces)

  # result.to_csv(out_path+str(y)+'withPop.csv')
  result = result.drop(columns = ['POP', 'NORMPOP'])

  # Sort the result by median (entry 48 of `decile` is 0.5)
  result = result.sort_values(48, ascending = True)

  # Rename index column and role: grid row k holds the (k + 2)-th percentile,
  # i.e. 0 -> "2p", 1 -> "3p", ..., 97 -> "99p"
  percentile_names = {k: str(k + 2) + "p" for k in range(98)}
  result = result.rename(index = str, columns = percentile_names)

  # Export result grid
  result.to_csv(out_path+str(y) + '_p.csv')

  # Convert dataframe to JSON, keeping the column order of every record
  records = json.loads(result.to_json(orient = 'records'), object_pairs_hook = OrderedDict)

  # Make JSON format readable
  pretty = json.dumps(records, indent = 4, sort_keys = False)

  # Save JSON to text format
  with open(out_path + str(y) + '_p.txt', 'w') as f:
    f.write(pretty)

  # Glue data with html environment: header template + data + footer template
  filenames = [in_path + 'first_p_oneYear.txt', out_path + str(y) + '_p.txt',
               in_path + 'third.txt']
  with open(out_path + str(y) + '_p.html', 'w') as outfile:
    for name in filenames:
      with open(name) as infile:
        outfile.write(infile.read())

## state_d

In [0]:
import pandas as pd
import json
from collections import OrderedDict
from google.colab import drive
# Mount Google Drive and point at the project's input and output folders
drive.mount('/content/gdrive')
out_path = 'gdrive/My Drive/Colab Notebooks/oneState/state_d/'
in_path = 'gdrive/My Drive/Colab Notebooks/code/'

# Import the two code tables (state codes and color codes, per the original
# note) and the deflator table
codeFirst_oneState, codeThird_oneState = (
    pd.read_csv(in_path + name, index_col = 0)
    for name in ("codeFirst_oneState.csv", "codeThird_oneState.csv"))
deflator = pd.read_csv(in_path +"COLI.csv")

# Import the raw records, keep the rows with HFLAG != 1,
# then discard the HFLAG column itself
raw = pd.read_csv(in_path + "raw.csv")
raw = raw[raw.HFLAG != 1].drop(columns = ['HFLAG'])

# Generate constants
# State codes (STATEFIP) from 1 through 56; the codes 3, 7, 14, 43 and 52
# are not part of the list.
stateList = [fips for fips in range(1, 57) if fips not in (3, 7, 14, 43, 52)]

# stateList = [12]

# Cumulative-weight thresholds reported for every year (0.50 is the median)
decile = [pct / 100 for pct in (5, 15, 25, 35, 45, 50, 55, 65, 75, 85, 95)]

# Every year from 1978 through 2018
yearList = list(range(1978, 2019))

# Blank placeholder row for the result grid: one '' per year column
value = [''] * len(yearList)

# Iterate through each state
for s in stateList:

  # Generate result grid, decile-column: one row per entry of `decile` and one
  # column per year, pre-filled with '' so cells that never get a value stay
  # blank in the exported files
  result = pd.DataFrame(columns = yearList)
  for i in range(len(decile)): result.loc[i] = value

  # Generate state dataframe
  state = raw[raw.STATEFIP == s]

  # Eliminate observations that has PERNUM != 1
  person = state[state.PERNUM == 1]

  # Merge 2 file: for raw, every row that has STATEFIP and YEAR match with
  # that row in deflator will get the same deflator value.
  person = pd.merge(person, deflator, on = ["YEAR","STATEFIP"])

  # Generate deflated household income column (HHINCOME / DEFLATOR).
  # Columns are addressed by name, so no particular column order is needed.
  person['DHHINCOME'] = person.HHINCOME / person.DEFLATOR

  # Iterate through each year; c is the year's column in the result grid
  for c, yr in enumerate(yearList):
    # Generate year dataframe, sorted by HHINCOME
    year = person[person.YEAR == yr].sort_values('HHINCOME')

    # A year without records for this state keeps its blank column
    if year.empty: continue

    # Cumulative share of the household weight. Both the running sum and the
    # total use ASECWTH, so the share ends at 1; the total is computed once.
    incomes = year.DHHINCOME.values
    percent = (year.ASECWTH.cumsum() / year.ASECWTH.sum()).values

    # Calculate decile: the deflated income of the first household whose
    # cumulative share exceeds the threshold
    for r, d in enumerate(decile):
      reached = incomes[percent > d]
      if len(reached) > 0:
        result.iloc[r, c] = reached[0]

  # Transpose result table: column-decile
  result = result.transpose()

  # Type casting result.index (type casting YEAR) to integer
  result.index = result.index.map(int)

  # Merge state dataframe with code dataframe
  result = pd.merge(codeFirst_oneState, result, left_index = True, right_index = True)
  result = pd.merge(result, codeThird_oneState, left_index = True, right_index = True)

  # Yearly population of the state: sum of ASECWT over every person, matched
  # to the rows by their year rather than by row position
  pop_by_year = state.groupby('YEAR')['ASECWT'].sum()
  result['POP'] = [pop_by_year.get(yr, 0) for yr in result.index]

  # Normalized population: ten times the multiple of the smallest year, rounded
  smallest_pop = result['POP'].min()
  result['NORMPOP'] = [round(10*pop/smallest_pop) for pop in result['POP']]
#     result.iloc[i, 15] = result.iloc[i, 14]/(result['POP'].min())

  # Replicate each year's row NORMPOP - 1 times. All copies are collected
  # first and concatenated once, in the same order repeated appends would give.
  pieces = [result]
  for yr in yearList:
    rep = int(result.loc[int(yr),'NORMPOP'] - 1)
    # The following statement need .copy() at the end for explicit reason
    # More information: https://www.dataquest.io/blog/settingwithcopywarning/
    line = result[result.index == yr].copy()
    # Remove the name of the state (so that the name does not repeat too many time)
    line.iloc[0, 12] = ''
    pieces.extend([line] * rep)
  result = pd.concat(pieces)

#   # result.to_csv(out_path+str(y)+'withPop.csv')
  result = result.drop(columns = ['POP', 'NORMPOP'])

  # Sort the result by year
  result = result.sort_values("Year", ascending = True)

  # Rename index column and role
  result = result.rename(index = str, columns = {0: "5p", 1: "15p", 2: "25p",
                                                 3: "35p", 4: "45p", 5: "50p",
                                                 6: "55p", 7: "65p", 8: "75p",
                                                 9: "85p", 10: "95p"})

  # Export result grid
  result.to_csv(out_path+str(s) + '_d.csv')

  # Convert dataframe to JSON, keeping the column order of every record
  records = json.loads(result.to_json(orient = 'records'), object_pairs_hook = OrderedDict)

  # Make JSON format readable
  pretty = json.dumps(records, indent = 4, sort_keys = False)

  # Save JSON to text format
  with open(out_path + str(s) + '_d.txt', 'w') as f:
    f.write(pretty)

  # Glue data with html environment: header template + data + footer template
  filenames = [in_path + 'first_d_oneState.txt', out_path + str(s) + '_d.txt',
               in_path + 'third.txt']
  with open(out_path + str(s) + '_d.html', 'w') as outfile:
    for name in filenames:
      with open(name) as infile:
        outfile.write(infile.read())

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## state_p

In [0]:
import pandas as pd
import json
from collections import OrderedDict
from google.colab import drive
# Mount Google Drive and point at the project's input and output folders
drive.mount('/content/gdrive')
out_path = 'gdrive/My Drive/Colab Notebooks/oneState/state_p/'
in_path = 'gdrive/My Drive/Colab Notebooks/code/'

# Import the two code tables (state codes and color codes, per the original
# note), the deflator table and the raw records
codeFirst_oneState, codeThird_oneState = (
    pd.read_csv(in_path + name, index_col = 0)
    for name in ("codeFirst_oneState.csv", "codeThird_oneState.csv"))
deflator = pd.read_csv(in_path + "COLI.csv")
# NOTE(review): unlike the state_d cell, rows with HFLAG == 1 are not removed
# here -- confirm this is intended.
raw = pd.read_csv(in_path + "raw.csv")

# Generate constants
# State codes (STATEFIP) from 1 through 56; the codes 3, 7, 14, 43 and 52
# are not part of the list.
stateList = [fips for fips in range(1, 57) if fips not in (3, 7, 14, 43, 52)]

# stateList = [11]

# Cumulative-weight thresholds 0.02, 0.03, ..., 0.99 (98 values)
decile = [pct / 100 for pct in range(2, 100)]

# Every year from 1978 through 2018
yearList = list(range(1978, 2019))

# Blank placeholder row for the result grid: one '' per year column
value = [''] * len(yearList)

# Iterate through each state
for s in stateList:

  # Generate result grid, decile-column: one row per entry of `decile` and one
  # column per year, pre-filled with '' so cells that never get a value stay
  # blank in the exported files
  result = pd.DataFrame(columns = yearList)
  for i in range(len(decile)): result.loc[i] = value

  # Generate state dataframe
  state = raw[raw.STATEFIP == s]

  # Eliminate observations that has PERNUM != 1
  person = state[state.PERNUM == 1]

  # Merge 2 file: for raw, every row that has STATEFIP and YEAR match with
  # that row in deflator will get the same deflator value.
  person = pd.merge(person, deflator, on = ["YEAR","STATEFIP"])

  # Generate deflated household income column (HHINCOME / DEFLATOR).
  # Columns are addressed by name, so no particular column order is needed.
  person['DHHINCOME'] = person.HHINCOME / person.DEFLATOR

  # Iterate through each year; c is the year's column in the result grid
  for c, yr in enumerate(yearList):
    # Generate year dataframe, sorted by HHINCOME
    year = person[person.YEAR == yr].sort_values('HHINCOME')

    # A year without records for this state keeps its blank column
    if year.empty: continue

    # Cumulative share of the household weight. Both the running sum and the
    # total use ASECWTH, so the share ends at 1; the total is computed once.
    incomes = year.DHHINCOME.values
    percent = (year.ASECWTH.cumsum() / year.ASECWTH.sum()).values

    # Calculate decile: the deflated income of the first household whose
    # cumulative share exceeds the threshold
    for r, d in enumerate(decile):
      reached = incomes[percent > d]
      if len(reached) > 0:
        result.iloc[r, c] = reached[0]

  # Transpose result table: column-decile
  result = result.transpose()

  # Type casting result.index (type casting YEAR) to integer
  result.index = result.index.map(int)

  # Merge state dataframe with code dataframe
  result = pd.merge(codeFirst_oneState, result, left_index = True, right_index = True)
  result = pd.merge(result, codeThird_oneState, left_index = True, right_index = True)

  # Yearly population of the state: sum of ASECWT over every person, matched
  # to the rows by their year rather than by row position
  pop_by_year = state.groupby('YEAR')['ASECWT'].sum()
  result['POP'] = [pop_by_year.get(yr, 0) for yr in result.index]

  # Normalized population: multiples of the smallest year, rounded
  smallest_pop = result['POP'].min()
  result['NORMPOP'] = [round(pop/smallest_pop) for pop in result['POP']]

  # Replicate each year's row NORMPOP - 1 times. All copies are collected
  # first and concatenated once, in the same order repeated appends would give.
  pieces = [result]
  for yr in yearList:
    rep = int(result.loc[int(yr),'NORMPOP'] - 1)
    # The following statement need .copy() at the end for explicit reason
    # More information: https://www.dataquest.io/blog/settingwithcopywarning/
    line = result[result.index == yr].copy()
    # Remove the name of the state (so that the name does not repeat too many time)
    line.iloc[0, 12] = ''
    pieces.extend([line] * rep)
  result = pd.concat(pieces)

#   # result.to_csv(out_path+str(y)+'withPop.csv')
  result = result.drop(columns = ['POP', 'NORMPOP'])

  # Sort the result by year
  result = result.sort_values("Year", ascending = True)

  # Rename index column and role: grid row k holds the (k + 2)-th percentile,
  # i.e. 0 -> "2p", 1 -> "3p", ..., 97 -> "99p"
  percentile_names = {k: str(k + 2) + "p" for k in range(98)}
  result = result.rename(index = str, columns = percentile_names)

  # Export result grid
  result.to_csv(out_path+str(s) + '_p.csv')

  # Convert dataframe to JSON, keeping the column order of every record
  records = json.loads(result.to_json(orient = 'records'), object_pairs_hook = OrderedDict)

  # Make JSON format readable
  pretty = json.dumps(records, indent = 4, sort_keys = False)

  # Save JSON to text format
  with open(out_path + str(s) + '_p.txt', 'w') as f:
    f.write(pretty)

  # Glue data with html environment: header template + data + footer template
  filenames = [in_path + 'first_p_oneState.txt', out_path + str(s) + '_p.txt',
               in_path + 'third.txt']
  with open(out_path + str(s) + '_p.html', 'w') as outfile:
    for name in filenames:
      with open(name) as infile:
        outfile.write(infile.read())

  # TODO: Adjust max income
  # TODO: Adjust household size