# Exploratory Data Analysis (EDA) and Feature Engineering

This notebook focuses on:
1. **Exploratory Data Analysis (EDA)**:
   - Understanding data distributions and relationships between features.
   - Identifying correlations and trends relevant to life expectancy.

2. **Feature Engineering**:
   - Cleaning and transforming the dataset.
   - Creating new features to improve predictive power.

The final dataset will be saved for use in the machine learning pipeline.

In [6]:
# Importing all the necessary packages

import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

%matplotlib inline

In [3]:
init_df = pd.read_csv('../data/cleaned/combined_all_years_cleaned.csv')

In [8]:
init_df.head()

Unnamed: 0,County,State,State_FIPS,County_FIPS,disability_rate,white_population,total_population,age_60_and_over,poverty_rate,housing_cost_burden,...,unemployment_rate,median_household_income,asian_population,per_capita_income,total_population_race,high_school_only_percentage,median_home_value,black_population,age_18_and_under_percentage,Year
0,Abbeville County,South Carolina,45,1,19.7,17759.0,25387.0,14.7,20.8,221.0,...,13.7,35067.0,17759.0,17876.0,25387.0,34.8,90300.0,7144.0,10.2,2012
1,Acadia Parish,Louisiana,22,1,17.9,48543.0,61611.0,10.9,19.1,339.0,...,8.6,38686.0,48543.0,19369.0,61611.0,42.2,88900.0,11184.0,9.9,2012
2,Accomack County,Virginia,51,1,11.8,22586.0,33454.0,17.5,20.2,199.0,...,7.0,40780.0,22586.0,22909.0,33454.0,39.3,153800.0,9620.0,8.0,2012
3,Ada County,Idaho,16,1,9.6,363070.0,394961.0,9.3,12.2,4338.0,...,8.5,55499.0,363070.0,27522.0,394961.0,20.8,193300.0,4057.0,9.2,2012
4,Adair County,Iowa,19,1,13.7,7457.0,7628.0,18.6,9.9,43.0,...,4.0,47872.0,7457.0,25147.0,7628.0,45.5,96700.0,11.0,7.1,2012


In [11]:
init_df.columns

Index(['County', 'State', 'State_FIPS', 'County_FIPS', 'disability_rate',
       'white_population', 'total_population', 'age_60_and_over',
       'poverty_rate', 'housing_cost_burden', 'bachelors_and_above_percentage',
       'median_gross_rent', 'age_15_to_44_percentage', 'gini_index',
       'native_american_population', 'less_than_9th_grade_percentage',
       'health_insurance_rate', 'unemployment_rate', 'median_household_income',
       'asian_population', 'per_capita_income', 'total_population_race',
       'high_school_only_percentage', 'median_home_value', 'black_population',
       'age_18_and_under_percentage', 'Year'],
      dtype='object')

In [17]:
# Now let us drop the columns that are not relevant for our data analysis

df = init_df.drop(columns=['State_FIPS', 'County_FIPS'])
df.head()

Unnamed: 0,County,State,disability_rate,white_population,total_population,age_60_and_over,poverty_rate,housing_cost_burden,bachelors_and_above_percentage,median_gross_rent,...,unemployment_rate,median_household_income,asian_population,per_capita_income,total_population_race,high_school_only_percentage,median_home_value,black_population,age_18_and_under_percentage,Year
0,Abbeville County,South Carolina,19.7,17759.0,25387.0,14.7,20.8,221.0,11.7,593.0,...,13.7,35067.0,17759.0,17876.0,25387.0,34.8,90300.0,7144.0,10.2,2012
1,Acadia Parish,Louisiana,17.9,48543.0,61611.0,10.9,19.1,339.0,9.3,541.0,...,8.6,38686.0,48543.0,19369.0,61611.0,42.2,88900.0,11184.0,9.9,2012
2,Accomack County,Virginia,11.8,22586.0,33454.0,17.5,20.2,199.0,17.3,741.0,...,7.0,40780.0,22586.0,22909.0,33454.0,39.3,153800.0,9620.0,8.0,2012
3,Ada County,Idaho,9.6,363070.0,394961.0,9.3,12.2,4338.0,37.8,821.0,...,8.5,55499.0,363070.0,27522.0,394961.0,20.8,193300.0,4057.0,9.2,2012
4,Adair County,Iowa,13.7,7457.0,7628.0,18.6,9.9,43.0,11.5,511.0,...,4.0,47872.0,7457.0,25147.0,7628.0,45.5,96700.0,11.0,7.1,2012


In [33]:
important_features = [
    'life_expectancy',             
    'median_household_income',
    'poverty_rate',
    'health_insurance_rate',
    'bachelors_and_above_percentage',
    'age_60_and_over',
    'disability_rate',
    'gini_index',
    'median_home_value',
    'housing_cost_burden',
    'per_capita_income'
]

In [35]:
sns.pairplot(data = df[important_features])

KeyError: "['life_expectancy'] not in index"