In [29]:
import pandas as pd

# Read the 2011 and 2016 county-level CSV files into separate DataFrames.
data_2011, data_2016 = (
    pd.read_csv(csv_name)
    for csv_name in ('2011_CountyData.csv', '2016_CountyData.csv')
)

def remove_percentage_sign(df):
    """Strip the literal text '% ' from the column labels of ``df``.

    Every occurrence of '% ' (a percent sign followed by one space) is removed
    from each column name. The frame is modified in place, so existing call
    sites that ignore the return value keep working unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned. The ``.str`` accessor is
        used, so the labels are expected to be strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be chained.
    """
    # regex=False: '% ' is plain text, so skip regex interpretation and get the
    # same result regardless of the installed pandas version's default.
    df.columns = df.columns.str.replace('% ', '', regex=False)
    return df

# Remove percentage signs from column names in both data frames
# (the helper renames the columns of the frame it is given, in place).
remove_percentage_sign(data_2011)
remove_percentage_sign(data_2016)

# Ensure FIPS is the key column: use it as a string in both frames.
data_2011 = data_2011.assign(FIPS=data_2011['FIPS'].astype(str))

# Drop rows that are empty in every column, then normalise FIPS. The detour
# through int removes any float formatting (e.g. '1001.0') before the string
# cast. NOTE(review): presumably FIPS is parsed as float because of the blank
# rows -- confirm against the raw CSV.
data_2016 = (
    data_2016
    .dropna(how='all')
    .assign(FIPS=lambda frame: frame['FIPS'].astype(int).astype(str))
)

# Merge the data on FIPS. Non-key columns that exist in both years receive a
# _2011 / _2016 suffix. The inner join keeps only FIPS codes present in both
# years, and validate='one_to_one' raises MergeError on duplicated FIPS codes
# instead of silently multiplying rows.
merged_data = pd.merge(
    data_2011,
    data_2016,
    on='FIPS',
    how='inner',
    suffixes=('_2011', '_2016'),
    validate='one_to_one',
)




In [32]:
# Summarise the 2011 frame (dtype and non-null count per column).
# In the recorded output 'Limited Access' has 0 non-null values for 2011.
data_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3190 entries, 0 to 3189
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FIPS                     3190 non-null   object 
 1   State                    3190 non-null   object 
 2   County                   3140 non-null   object 
 3   Limited Access           0 non-null      float64
 4   Diabetes                 3190 non-null   float64
 5   Rural                    3189 non-null   float64
 6   Household Income         3189 non-null   float64
 7   African American         3190 non-null   float64
 8   Asian                    3190 non-null   float64
 9   Hispanic                 3190 non-null   float64
 10  Binge Drinking           2657 non-null   float64
 11  Physical Inactivity      3190 non-null   float64
 12  Mentally Unhealthy Days  2994 non-null   float64
 13  Obese                    3190 non-null   float64
 14  Excessive Drinking      

In [33]:
# Summarise the 2016 frame (dtype and non-null count per column).
# In the recorded output 'Binge Drinking' and 'Physical Inactivity' have
# 0 non-null values for 2016.
data_2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3192 entries, 0 to 3391
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FIPS                     3192 non-null   object 
 1   State                    3192 non-null   object 
 2   County                   3141 non-null   object 
 3   Limited Access           3192 non-null   float64
 4   Diabetic                 3191 non-null   float64
 5   Rural                    3192 non-null   float64
 6   Household Income         3190 non-null   float64
 7   African American         3191 non-null   float64
 8   Asian                    3191 non-null   float64
 9   Hispanic                 3191 non-null   float64
 10  Binge Drinking           0 non-null      float64
 11  Physical Inactivity      0 non-null      float64
 12  Mentally Unhealthy Days  3191 non-null   float64
 13  Obese                    3192 non-null   float64
 14  Excessive Drinking       3191

In [34]:
# List the merged column names. Columns shared by both years carry a
# _2011/_2016 suffix; 'Diabetes' (2011) and 'Diabetic' (2016) differ in name,
# so they appear unsuffixed in the recorded output.
merged_data.columns

Index(['FIPS', 'State_2011', 'County_2011', 'Limited Access_2011', 'Diabetes',
       'Rural_2011', 'Household Income_2011', 'African American_2011',
       'Asian_2011', 'Hispanic_2011', 'Binge Drinking_2011',
       'Physical Inactivity_2011', 'Mentally Unhealthy Days_2011',
       'Obese_2011', 'Excessive Drinking_2011', 'PCP Ratio_2011',
       'Fair/Poor_2011', 'Physically Inactive_2011', 'State_2016',
       'County_2016', 'Limited Access_2016', 'Diabetic', 'Rural_2016',
       'Household Income_2016', 'African American_2016', 'Asian_2016',
       'Hispanic_2016', 'Binge Drinking_2016', 'Physical Inactivity_2016',
       'Mentally Unhealthy Days_2016', 'Obese_2016', 'Excessive Drinking_2016',
       'PCP Ratio_2016', 'Fair/Poor_2016', 'Physically Inactive_2016'],
      dtype='object')

In [35]:
# Summarise the merged frame (dtype and non-null count per column).
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3190 entries, 0 to 3189
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   FIPS                          3190 non-null   object 
 1   State_2011                    3190 non-null   object 
 2   County_2011                   3140 non-null   object 
 3   Limited Access_2011           0 non-null      float64
 4   Diabetes                      3190 non-null   float64
 5   Rural_2011                    3189 non-null   float64
 6   Household Income_2011         3189 non-null   float64
 7   African American_2011         3190 non-null   float64
 8   Asian_2011                    3190 non-null   float64
 9   Hispanic_2011                 3190 non-null   float64
 10  Binge Drinking_2011           2657 non-null   float64
 11  Physical Inactivity_2011      3190 non-null   float64
 12  Mentally Unhealthy Days_2011  2994 non-null   float64
 13  Obe

In [36]:
# Calculate changes: every new column is the 2016 value minus the 2011 value.
# Keys are the new column names; values are (2016 column, 2011 column) pairs.
# NOTE(review): two pairs cross column names. 'Binge Drinking' and
# 'Physical Inactivity' are entirely null for 2016 in the recorded .info()
# output, so the 2016 side uses 'Excessive Drinking' and 'Physically Inactive'
# instead -- confirm the paired measures are comparable.
change_sources = {
    'change_rural': ('Rural_2016', 'Rural_2011'),
    'change_obese': ('Obese_2016', 'Obese_2011'),
    'change_binge_drinking': ('Excessive Drinking_2016', 'Binge Drinking_2011'),
    'change_physical_inactivity': ('Physically Inactive_2016', 'Physical Inactivity_2011'),
    'change_mentally_unhealthy_days': ('Mentally Unhealthy Days_2016', 'Mentally Unhealthy Days_2011'),
    'change_fair_poor_health': ('Fair/Poor_2016', 'Fair/Poor_2011'),
    'change_income': ('Household Income_2016', 'Household Income_2011'),
}
for change_col, (col_2016, col_2011) in change_sources.items():
    merged_data[change_col] = merged_data[col_2016] - merged_data[col_2011]



In [38]:
# Select relevant columns for the regression model: the FIPS/State/County
# labels, 2011 columns, and the 2016-minus-2011 change columns.
# NOTE(review): 'PCP Ratio_2011' has object dtype in the recorded .info()
# output, so it will likely need converting to a number before model fitting.
# NOTE(review): 'change_income' is not selected here -- confirm that is intended.
regression_columns = [
    'FIPS', 'State_2011', 'County_2011',
    'Rural_2011', 'Household Income_2011', 'African American_2011',
    'Asian_2011', 'Hispanic_2011', 'Obese_2011', 'PCP Ratio_2011',
    'change_rural', 'change_obese', 'change_binge_drinking',
    'change_physical_inactivity', 'change_mentally_unhealthy_days',
    'change_fair_poor_health',
]

# .copy() makes regression_data an independent frame, so adding or editing
# columns on it later does not raise SettingWithCopyWarning.
regression_data = merged_data[regression_columns].copy()

# Save the regression data to a CSV file
regression_data.to_csv('regression_data.csv', index=False)

print("Regression data has been saved to 'regression_data.csv'.")


Regression data has been saved to 'regression_data.csv'.


In [40]:
# Summarise the regression frame (dtype and non-null count per column).
# In the recorded output 'PCP Ratio_2011' is object dtype rather than float.
regression_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3190 entries, 0 to 3189
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   FIPS                            3190 non-null   object 
 1   State_2011                      3190 non-null   object 
 2   County_2011                     3140 non-null   object 
 3   Rural_2011                      3189 non-null   float64
 4   Household Income_2011           3189 non-null   float64
 5   African American_2011           3190 non-null   float64
 6   Asian_2011                      3190 non-null   float64
 7   Hispanic_2011                   3190 non-null   float64
 8   Obese_2011                      3190 non-null   float64
 9   PCP Ratio_2011                  3190 non-null   object 
 10  change_rural                    3189 non-null   float64
 11  change_obese                    3190 non-null   float64
 12  change_binge_drinking           26