In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Original CDC and RIBCOT


In [2]:
# Load the two source tables from the local data/ directory, in this order.
cdcdata, ribcot = (
    pd.read_csv(csv_path)
    for csv_path in ('data/filtered_cities500_wide.csv',
                     'data/RegionalInterestByConditionOverTime.csv')
)

In [3]:
# Report (rows, columns) for each freshly loaded table.
for loaded_frame in (cdcdata, ribcot):
    print(loaded_frame.shape)

(83, 9)
(210, 128)


In [4]:
# Summarise the CDC table's schema, then preview the first RIBCOT rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
cdcdata.info()
print(ribcot.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               83 non-null     object 
 1   CityName            83 non-null     object 
 2   population2010      83 non-null     int64  
 3   cityfips            83 non-null     int64  
 4   bingedrinking       83 non-null     float64
 5   smokingrate         83 non-null     float64
 6   nophysicalactivity  83 non-null     float64
 7   obesityrate         83 non-null     float64
 8   sleepdeprivation    83 non-null     float64
dtypes: float64(5), int64(2), object(2)
memory usage: 6.0+ KB
None
                  dma  geoCode  2004+cancer  2004+cardiovascular  2004+stroke  \
0  Portland-Auburn ME      500           44                    6           17   
1         New York NY      501           47                    6           13   
2       Binghamton NY      502           48      

In [6]:
# Distinct state abbreviations present in the CDC table, followed by their count.
cdc_state = cdcdata['State'].unique()
print(cdc_state, len(cdc_state), sep='\n')

['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'DC' 'FL' 'GA' 'HI' 'IA' 'IL' 'IN' 'KS'
 'KY' 'LA' 'MD' 'MI' 'MO' 'MS' 'MT' 'NC' 'NE' 'NV' 'NY' 'OH' 'OK' 'OR'
 'PA' 'SC' 'SD' 'TN' 'TX' 'UT' 'WA' 'WI']
36


In [7]:
# Distinct DMA names in the RIBCOT table, followed by how many there are.
ribcot_dma = ribcot['dma'].unique()
for dma_summary in (ribcot_dma, len(ribcot_dma)):
    print(dma_summary)

['Portland-Auburn ME' 'New York NY' 'Binghamton NY' 'Macon GA'
 'Philadelphia PA' 'Detroit MI' 'Boston MA-Manchester NH' 'Savannah GA'
 'Pittsburgh PA' 'Ft. Wayne IN' 'Cleveland-Akron (Canton) OH'
 'Washington DC (Hagerstown MD)' 'Baltimore MD'
 'Flint-Saginaw-Bay City MI' 'Buffalo NY' 'Cincinnati OH' 'Erie PA'
 'Charlotte NC' 'Greensboro-High Point-Winston Salem NC' 'Charleston SC'
 'Augusta GA' 'Providence RI-New Bedford MA' 'Columbus GA'
 'Burlington VT-Plattsburgh NY' 'Atlanta GA' 'Albany GA' 'Utica NY'
 'Indianapolis IN' 'Miami-Ft. Lauderdale FL' 'Louisville KY'
 'Tallahassee FL-Thomasville GA' 'Tri-Cities TN-VA'
 'Albany-Schenectady-Troy NY' 'Hartford & New Haven CT'
 'Orlando-Daytona Beach-Melbourne FL' 'Columbus OH' 'Youngstown OH'
 'Bangor ME' 'Rochester NY' 'Tampa-St. Petersburg (Sarasota) FL'
 'Traverse City-Cadillac MI' 'Lexington KY' 'Dayton OH'
 'Springfield-Holyoke MA' 'Norfolk-Portsmouth-Newport News VA'
 'Greenville-New Bern-Washington NC' 'Columbia SC' 'Toledo OH'
 'W

In [8]:
# Remove the parenthesised secondary market so the Washington DMA name ends
# in "DC" (literal match, not a regex -- the parentheses are plain text).
dma_trimmed = ribcot['dma'].str.replace(' (Hagerstown MD)', '', regex=False)
ribcot['dma'] = dma_trimmed
print(dma_trimmed.unique())

['Portland-Auburn ME' 'New York NY' 'Binghamton NY' 'Macon GA'
 'Philadelphia PA' 'Detroit MI' 'Boston MA-Manchester NH' 'Savannah GA'
 'Pittsburgh PA' 'Ft. Wayne IN' 'Cleveland-Akron (Canton) OH'
 'Washington DC' 'Baltimore MD' 'Flint-Saginaw-Bay City MI' 'Buffalo NY'
 'Cincinnati OH' 'Erie PA' 'Charlotte NC'
 'Greensboro-High Point-Winston Salem NC' 'Charleston SC' 'Augusta GA'
 'Providence RI-New Bedford MA' 'Columbus GA'
 'Burlington VT-Plattsburgh NY' 'Atlanta GA' 'Albany GA' 'Utica NY'
 'Indianapolis IN' 'Miami-Ft. Lauderdale FL' 'Louisville KY'
 'Tallahassee FL-Thomasville GA' 'Tri-Cities TN-VA'
 'Albany-Schenectady-Troy NY' 'Hartford & New Haven CT'
 'Orlando-Daytona Beach-Melbourne FL' 'Columbus OH' 'Youngstown OH'
 'Bangor ME' 'Rochester NY' 'Tampa-St. Petersburg (Sarasota) FL'
 'Traverse City-Cadillac MI' 'Lexington KY' 'Dayton OH'
 'Springfield-Holyoke MA' 'Norfolk-Portsmouth-Newport News VA'
 'Greenville-New Bern-Washington NC' 'Columbia SC' 'Toledo OH'
 'West Palm Beach-F

In [9]:
def get_last_two_chars(dma):
  """
  Extracts the trailing two-character code of a DMA string.

  Trailing whitespace is ignored, so a padded name such as 'Macon GA '
  still yields 'GA'. Values that are not strings (for example None, or the
  float NaN that pandas uses for missing cells) are returned unchanged
  instead of raising, which keeps Series.map usable on columns with gaps.

  Args:
    dma: The DMA string, e.g. 'Portland-Auburn ME'.

  Returns:
    The last two characters of the right-stripped DMA string, the whole
    stripped string if it has fewer than two characters, or the input
    itself when it is not a string.
  """
  if not isinstance(dma, str):
    # Missing values have no length or slice; pass them through untouched.
    return dma
  # Slicing already returns the whole string when it is shorter than two
  # characters, so no explicit length check is needed.
  return dma.rstrip()[-2:]


In [10]:
# Derive each row's State value from the tail of its DMA name.
ribcot['State'] = [get_last_two_chars(dma_name) for dma_name in ribcot['dma']]
print(ribcot.head())


                  dma  geoCode  2004+cancer  2004+cardiovascular  2004+stroke  \
0  Portland-Auburn ME      500           44                    6           17   
1         New York NY      501           47                    6           13   
2       Binghamton NY      502           48                    3           16   
3            Macon GA      503           44                   14           14   
4     Philadelphia PA      504           52                    7           16   

   2004+depression  2004+rehab  2004+vaccine  2004+diarrhea  2004+obesity  \
0               39          21            31             14            29   
1               38          16            33             12            27   
2               50          12            37             24            31   
3               37          19            49             14            29   
4               41          23            36             14            30   

   ...  2017+cancer  2017+cardiovascular  2017+str

In [11]:
ribcot_state = ribcot['State'].unique()
print(ribcot_state)
print(len(ribcot_state))

# Compare ribcot_state and cdc_state.
# Each set is built once and the results are printed sorted: the display order
# of a raw set of strings depends on string hashing, which Python randomises
# per process, so unsorted output would change between kernel restarts.
ribcot_state_set = set(ribcot_state)
cdc_state_set = set(cdc_state)

common_states = ribcot_state_set & cdc_state_set
print(f"Common states: {sorted(common_states)}")

different_ribcot_states = ribcot_state_set - cdc_state_set
print(f"States in ribcot but not in cdc: {sorted(different_ribcot_states)}")

different_cdc_states = cdc_state_set - ribcot_state_set
print(f"States in cdc but not in ribcot: {sorted(different_cdc_states)}")


['ME' 'NY' 'GA' 'PA' 'MI' 'NH' 'IN' 'OH' 'DC' 'MD' 'NC' 'SC' 'MA' 'FL'
 'KY' 'VA' 'CT' 'TN' 'WV' 'TX' 'IL' 'KS' 'MO' 'AL' 'MN' 'LA' 'WI' 'IA'
 'OK' 'AR' 'MS' 'NE' 'ND' 'SD' 'AK' 'HI' 'CO' 'AZ' 'MT' 'ID' 'WY' 'UT'
 'CA' 'NM' 'OR' 'WA' 'NV']
47
Common states: {'GA', 'TN', 'SD', 'NC', 'AZ', 'DC', 'MT', 'OK', 'UT', 'OH', 'MI', 'SC', 'NV', 'AR', 'CO', 'MD', 'NY', 'OR', 'IA', 'CA', 'LA', 'AK', 'WI', 'IN', 'FL', 'KS', 'TX', 'WA', 'AL', 'KY', 'IL', 'PA', 'NE', 'HI', 'MS', 'MO'}
States in ribcot but not in cdc: {'MN', 'VA', 'ID', 'ME', 'WV', 'ND', 'NH', 'WY', 'NM', 'CT', 'MA'}
States in cdc but not in ribcot: set()


Rhode Island, Delaware, Vermont, and New Jersey are not in RIBCOT; US is not a real code.

In [12]:
# Keep only the columns whose name contains the '+' separator
# (the "<year>+<condition>" measurement columns).
plus_columns = list(filter(lambda column_name: '+' in column_name, ribcot.columns))
print(plus_columns)


['2004+cancer', '2004+cardiovascular', '2004+stroke', '2004+depression', '2004+rehab', '2004+vaccine', '2004+diarrhea', '2004+obesity', '2004+diabetes', '2005+cancer', '2005+cardiovascular', '2005+stroke', '2005+depression', '2005+rehab', '2005+vaccine', '2005+diarrhea', '2005+obesity', '2005+diabetes', '2006+cancer', '2006+cardiovascular', '2006+stroke', '2006+depression', '2006+rehab', '2006+vaccine', '2006+diarrhea', '2006+obesity', '2006+diabetes', '2007+cancer', '2007+cardiovascular', '2007+stroke', '2007+depression', '2007+rehab', '2007+vaccine', '2007+diarrhea', '2007+obesity', '2007+diabetes', '2008+cancer', '2008+cardiovascular', '2008+stroke', '2008+depression', '2008+rehab', '2008+vaccine', '2008+diarrhea', '2008+obesity', '2008+diabetes', '2009+cancer', '2009+cardiovascular', '2009+stroke', '2009+depression', '2009+rehab', '2009+vaccine', '2009+diarrhea', '2009+obesity', '2009+diabetes', '2010+cancer', '2010+cardiovascular', '2010+stroke', '2010+depression', '2010+rehab', '

In [13]:
# Unpivot to long form: the identifier columns are repeated for every
# "<year>+<condition>" column, whose name lands in 'temp' and whose cell
# lands in 'value'.
identifier_columns = ['dma', 'State', 'geoCode']
ribcot_long = ribcot.melt(
    id_vars=identifier_columns,
    value_vars=plus_columns,
    var_name='temp',
    value_name='value',
)

for long_summary in (ribcot_long.head(), ribcot_long.shape):
    print(long_summary)

                  dma State  geoCode         temp  value
0  Portland-Auburn ME    ME      500  2004+cancer     44
1         New York NY    NY      501  2004+cancer     47
2       Binghamton NY    NY      502  2004+cancer     48
3            Macon GA    GA      503  2004+cancer     44
4     Philadelphia PA    PA      504  2004+cancer     52
(26460, 5)


In [14]:
# Break each "<year>+<condition>" label apart once, then store the pieces:
# the text after '+' becomes Condition, the text before it becomes an int Year.
year_condition_parts = ribcot_long['temp'].str.split('+')
ribcot_long['Condition'] = year_condition_parts.str[1]
ribcot_long['Year'] = year_condition_parts.str[0].map(int)
print(ribcot_long.head())

                  dma State  geoCode         temp  value Condition  Year
0  Portland-Auburn ME    ME      500  2004+cancer     44    cancer  2004
1         New York NY    NY      501  2004+cancer     47    cancer  2004
2       Binghamton NY    NY      502  2004+cancer     48    cancer  2004
3            Macon GA    GA      503  2004+cancer     44    cancer  2004
4     Philadelphia PA    PA      504  2004+cancer     52    cancer  2004


In [15]:
# 'temp' has already been split into Condition and Year, so it is dropped.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
ribcot_long = ribcot_long.drop(columns=['temp'], errors='ignore')
# Back to wide form: one row per (dma, State, geoCode, Year) and one column
# per Condition; reset_index() turns the row keys back into ordinary columns.
ribcot_wide = ribcot_long.pivot(index=['dma', 'State', 'geoCode', 'Year'],
                                columns='Condition',
                                values='value').reset_index()




In [16]:
# Preview the reshaped table, then its (rows, columns) and its column index.
print(ribcot_wide.head(), ribcot_wide.shape, ribcot_wide.columns, sep='\n')

Condition                    dma State  geoCode  Year  cancer  cardiovascular  \
0          Abilene-Sweetwater TX    TX      662  2004      27               0   
1          Abilene-Sweetwater TX    TX      662  2005      51              29   
2          Abilene-Sweetwater TX    TX      662  2006      43              38   
3          Abilene-Sweetwater TX    TX      662  2007      55              32   
4          Abilene-Sweetwater TX    TX      662  2008      54               7   

Condition  depression  diabetes  diarrhea  obesity  rehab  stroke  vaccine  
0                  67        45        32       28     17      16       50  
1                  10        21        30       29     57      36       23  
2                  21        31        57       32     36      20       36  
3                  32        61        26       32     72      64       28  
4                  30        43        55       21     85      61       24  
(2940, 13)
Index(['dma', 'State', 'geoCode', 'Year'

In [17]:
# Print dtypes and non-null counts for the reshaped table (used to check for missing values).
ribcot_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   dma             2940 non-null   object
 1   State           2940 non-null   object
 2   geoCode         2940 non-null   int64 
 3   Year            2940 non-null   int64 
 4   cancer          2940 non-null   int64 
 5   cardiovascular  2940 non-null   int64 
 6   depression      2940 non-null   int64 
 7   diabetes        2940 non-null   int64 
 8   diarrhea        2940 non-null   int64 
 9   obesity         2940 non-null   int64 
 10  rehab           2940 non-null   int64 
 11  stroke          2940 non-null   int64 
 12  vaccine         2940 non-null   int64 
dtypes: int64(11), object(2)
memory usage: 298.7+ KB


In [18]:
# Write the reshaped table to data/ribcot.csv; index=False leaves the row index out of the file.
ribcot_wide.to_csv('data/ribcot.csv', index=False)


No null entries; RIBCOT cleaning complete.