In [None]:
# Dependencies
import pandas as pd

In [None]:
# Load the raw cancer CSV into a DataFrame and preview the first rows
cancer_data_path = "Resources/original_data/cancer_data.csv"
cancer_df = pd.read_csv(cancer_data_path)
cancer_df.head()

In [None]:
# Discard the columns that are not used further down in this notebook
unused_columns = ['CancerType', 'Year', 'Sex', 'AgeAdjustedRate']
cancer_df = cancer_df.drop(columns=unused_columns)
cancer_df.head()

In [None]:
# Align the column names with the DataFrame used in the other notebook
column_aliases = {'Area': 'us_state', 'County': 'county'}
renamed_cancer_df = cancer_df.rename(columns=column_aliases)
renamed_cancer_df.head()

In [None]:
# Strip every apostrophe (') from the text in the listed columns
cols_to_check = ['us_state', 'county', 'CaseCount', 'Population']
apostrophe_to_empty = {"'":''}

without_apostrophes = renamed_cancer_df[cols_to_check].replace(apostrophe_to_empty, regex=True)
renamed_cancer_df[cols_to_check] = without_apostrophes
renamed_cancer_df.head()

In [None]:
# Remove the trailing administrative designation (County, Census Area, Parish,
# Borough, Municipality, City and Borough, City) from the county column.
#
# Why a single anchored pattern instead of a dict of substrings:
#   * "Borough" was previously misspelled "Burough", so boroughs were never stripped
#     (the misspelling is still accepted via B[ou]rough for backward compatibility).
#   * Longer suffixes are listed first so " City and Borough" is removed as a whole
#     rather than being truncated by the shorter " Borough" alternative.
#   * Anchoring at the end of the string (and consuming trailing whitespace) keeps
#     names such as "James City County" -> "James City" intact and avoids leaving
#     stray spaces behind.
COUNTY_SUFFIX_PATTERN = (
    r"\s+(?:City and B[ou]rough|Census Area|Municipality|B[ou]rough|County|Parish|City)\s*$"
)

renamed_cancer_df['county'] = renamed_cancer_df['county'].replace(
    to_replace=COUNTY_SUFFIX_PATTERN, value='', regex=True
)
renamed_cancer_df.head()

In [None]:
# Convert full state names in us_state to their two-letter postal abbreviations.
#
# The lookup is keyed by the real state name. The earlier version relied on the
# substitution order of a regex dict ('Virginia' -> 'VA' first, then 'West VA' -> 'WV'),
# which leaves "West VA" in the data whenever pandas evaluates each pattern against
# the original values rather than the partially replaced ones.
STATE_ABBREVIATIONS = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arkansas': 'AR', 'Arizona': 'AZ', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# One whole-word alternation, longest names first, so "West Virginia" wins over
# "Virginia" and "Arkansas" is never partially matched as "Kansas".
# State names contain only letters and spaces, so no regex escaping is needed.
state_name_pattern = (
    r'\b(?:' + '|'.join(sorted(STATE_ABBREVIATIONS, key=len, reverse=True)) + r')\b'
)

renamed_cancer_df['us_state'] = renamed_cancer_df['us_state'].str.replace(
    state_name_pattern,
    lambda match: STATE_ABBREVIATIONS[match.group(0)],
    regex=True,
)

renamed_cancer_df




In [None]:
# Drop rows whose CaseCount is reported as "Data Suppressed".
# .copy() makes the result an independent DataFrame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
has_case_count = renamed_cancer_df['CaseCount'] != 'Data Suppressed'
renamed_cancer_df = renamed_cancer_df.loc[has_case_count].copy()
renamed_cancer_df

In [None]:
# Number of years covered by each CaseCount value (data is for a 5 year period)
YEARS_IN_PERIOD = 5

# Convert CaseCount to integer and derive the average number of cases per year.
# .assign builds a new DataFrame instead of writing into the filtered frame,
# which avoids SettingWithCopyWarning; the second lambda sees the already
# converted CaseCount because assign evaluates keyword arguments in order.
renamed_cancer_df = renamed_cancer_df.assign(
    CaseCount=lambda df: df['CaseCount'].astype(int),
    avg_cancer_cases=lambda df: df['CaseCount'] / YEARS_IN_PERIOD,
)

renamed_cancer_df.head()

In [None]:
# Convert Population to integer and compute the average yearly cases per
# 100,000 residents. Note: despite the column name, percent_pop_per_100k is a
# rate per 100,000 people, not a percentage; the name is kept because it is
# part of the exported CSV.
# .assign builds a new DataFrame instead of writing into the filtered frame,
# which avoids SettingWithCopyWarning.
renamed_cancer_df = renamed_cancer_df.assign(
    Population=lambda df: df['Population'].astype(int),
    percent_pop_per_100k=lambda df: df['avg_cancer_cases'] / df['Population'] * 100000,
)

renamed_cancer_df.head()

In [None]:
# Smallest value in the per-100k column
rate_per_100k = renamed_cancer_df['percent_pop_per_100k']
rate_per_100k.min()

In [None]:
# Largest value in the per-100k column
rate_per_100k = renamed_cancer_df['percent_pop_per_100k']
rate_per_100k.max()

In [None]:
# Categorize the per-100k rate as low, medium or high using equal-width bins.
#
# The bin edges are derived from the data (bins=<number of labels>) rather than
# hard-coded from the rounded min/max. pd.cut intervals are open on the left, so
# with hand-typed edges a minimum at or below the first edge, or a maximum above
# the last edge, is silently classified as NaN. With an integer bin count pandas
# spans the observed range and widens it slightly so both extremes are included.
category = ['low', 'medium', 'high']

# .assign builds a new DataFrame rather than writing into a filtered frame,
# which avoids SettingWithCopyWarning.
renamed_cancer_df = renamed_cancer_df.assign(
    cancer_classification=pd.cut(
        renamed_cancer_df['percent_pop_per_100k'],
        bins=len(category),
        labels=category,
    )
)
renamed_cancer_df.head()

In [None]:
# Count how many rows fall into each classification
classification_counts = renamed_cancer_df['cancer_classification'].value_counts()
classification_counts

In [None]:
# Remove the raw counts and the intermediate average, keeping the remaining columns
intermediate_columns = ['CaseCount', 'Population', 'avg_cancer_cases']
clean_cancer_df = renamed_cancer_df.drop(columns=intermediate_columns)
clean_cancer_df

In [None]:
# Export the cleaned data as a CSV with a header row and without the index
clean_data_path = "Resources/clean_data/cancerdata.csv"
clean_cancer_df.to_csv(clean_data_path, index=False, header=True)