In [1]:
# importing the dependencies
import pandas as pd
import numpy


In [2]:
# Making the dataframe to inspect the data
df_cancer = pd.read_csv("Data/Countytable.csv")
df_cancer.head()

Unnamed: 0,Cancer header,County,County population,Note,Rate,Cancer type,Sex
0,All cancer types combined,Mower,39807,,410.7,All cancer types combined,Everyone
1,All cancer types combined,Todd,24494,,414.3,All cancer types combined,Everyone
2,All cancer types combined,Fillmore,20949,,416.5,All cancer types combined,Everyone
3,All cancer types combined,Lac qui Parle,6719,,422.4,All cancer types combined,Everyone
4,All cancer types combined,Koochiching,12515,,422.7,All cancer types combined,Everyone


In [3]:
# getting rid of some columns and renaming some columns
df_cancer.drop(["Cancer header", "Note","Cancer type","Sex"], axis = 1,inplace = True)
df_cancer.head()


Unnamed: 0,County,County population,Rate
0,Mower,39807,410.7
1,Todd,24494,414.3
2,Fillmore,20949,416.5
3,Lac qui Parle,6719,422.4
4,Koochiching,12515,422.7


In [4]:
df_cancer = df_cancer.rename(columns = {"Rate": "Rate per 100,000"})
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
0,Mower,39807,410.7
1,Todd,24494,414.3
2,Fillmore,20949,416.5
3,Lac qui Parle,6719,422.4
4,Koochiching,12515,422.7


In [5]:
# sort by county name 
df_cancer.sort_values(by = "County",inplace = True)
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
26,Aitkin,15834,449.5
64,Anoka,350253,486.5
69,Becker,34011,491.9
70,Beltrami,46403,492.3
7,Benton,40129,428.1


In [6]:
# setting the index
df_cancer.reset_index(drop= True, inplace = True)
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
0,Aitkin,15834,449.5
1,Anoka,350253,486.5
2,Becker,34011,491.9
3,Beltrami,46403,492.3
4,Benton,40129,428.1


In [7]:
# checking for null values
df_cancer.isnull().sum()

County               0
County population    0
Rate per 100,000     0
dtype: int64

In [8]:
# checking the duplicated values
df_cancer.duplicated()

0     False
1     False
2     False
3     False
4     False
      ...  
83    False
84    False
85    False
86    False
87    False
Length: 88, dtype: bool

In [9]:
#checking the data types
df_cancer.dtypes

County                object
County population     object
Rate per 100,000     float64
dtype: object

In [10]:
# change the datatype of county population to integer
df_cancer['County population'] = df_cancer['County population'].str.replace(',', '').astype(float)
#df_cancer[['County population']].apply(pd.to_numeric) 
#df_cancer.dtypes
df_cancer.head()


Unnamed: 0,County,County population,"Rate per 100,000"
0,Aitkin,15834.0,449.5
1,Anoka,350253.0,486.5
2,Becker,34011.0,491.9
3,Beltrami,46403.0,492.3
4,Benton,40129.0,428.1


In [11]:
df_cancer.dtypes

County                object
County population    float64
Rate per 100,000     float64
dtype: object

In [12]:
# jsonify the dataframe
#importing the dependencies
import json
from json import loads, dumps

In [13]:
# saving the data in an output file as json format
df_cancer.to_json('Output/cancer(2015-2019).json', orient='records')

In [14]:
#reading the healthoutcome excel file 
df_healthoutcome = pd.read_excel("Data/HealthOutcomeAndFactors.xlsx",header = 1)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,Z-Score,Rank,Z-Score.1,Rank.1
0,,,,,,,
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0


In [15]:
#Renaming the columns
df_healthoutcome.rename( columns = {"Z-Score":"HealthOutcome(Z-Score)", 
                                    "Rank":"HealthOutcome(Rank)","Z-Score.1":"HealthFactor(Z-Score)",
                                    "Rank.1":"HealthFactor(Rank)"}, inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,,,,,,,
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0


In [16]:
df_healthoutcome.drop(index = 0,inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0
5,27009.0,Minnesota,Benton,-0.022054,47.0,0.114975,59.0


In [17]:
# resetting the index
df_healthoutcome.reset_index( drop = True, inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
1,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
2,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
3,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0
4,27009.0,Minnesota,Benton,-0.022054,47.0,0.114975,59.0


In [18]:
# dropping column 
df_healthoutcome.drop( columns = ["State"], inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,27001.0,Aitkin,0.43145,69.0,0.761015,82.0
1,27003.0,Anoka,-0.426424,23.0,-0.413081,17.0
2,27005.0,Becker,0.472333,71.0,0.169356,63.0
3,27007.0,Beltrami,1.717782,86.0,0.763052,83.0
4,27009.0,Benton,-0.022054,47.0,0.114975,59.0


In [19]:
# Saving the file in json format
df_healthoutcome.to_json('Output/healthoutcome(2023).json', orient='records')

In [20]:
#reading the asthma data
df_asthma = pd.read_csv("Data/MN-asthma-county.csv")
df_asthma.head()


Unnamed: 0,__Outcome,__Year,_County,"Age-adjusted rate per 10,000",Note,Count of cases
0,Hospitalizations,2018-2020,Winona,0.5,Unstable rate due to small population,9
1,Hospitalizations,2018-2020,Todd,0.6,Unstable rate due to small population,7
2,Hospitalizations,2018-2020,Wright,1.1,,45
3,Hospitalizations,2018-2020,Wabasha,1.1,Unstable rate due to small population,7
4,Hospitalizations,2018-2020,Waseca,1.1,Unstable rate due to small population,7


In [21]:
# Getting the column names
df_asthma.columns

Index(['__Outcome', '__Year', '_County', 'Age-adjusted rate per 10,000',
       'Note', 'Count of cases'],
      dtype='object')

In [22]:
# dropping some columns
df_asthma.drop(['__Outcome', '__Year','Note'], axis = 1,inplace = True)
df_asthma.head()

Unnamed: 0,_County,"Age-adjusted rate per 10,000",Count of cases
0,Winona,0.5,9
1,Todd,0.6,7
2,Wright,1.1,45
3,Wabasha,1.1,7
4,Waseca,1.1,7


In [23]:
# Renaming the columns
df_asthma.rename(columns = {"_County": "County"}, inplace = True)
df_asthma.head()

Unnamed: 0,County,"Age-adjusted rate per 10,000",Count of cases
0,Winona,0.5,9
1,Todd,0.6,7
2,Wright,1.1,45
3,Wabasha,1.1,7
4,Waseca,1.1,7


In [24]:
# sorting data by county names
df_asthma.sort_values(by = "County", inplace = True)
df_asthma.head()

Unnamed: 0,County,"Age-adjusted rate per 10,000",Count of cases
22,Aitkin,2.0,9
42,Anoka,2.6,262
46,Becker,2.7,26
59,Beltrami,3.7,55
52,Benton,3.2,37


In [25]:
# Resetting the index
df_asthma.reset_index(drop= True, inplace = True)
df_asthma


Unnamed: 0,County,"Age-adjusted rate per 10,000",Count of cases
0,Aitkin,2,9
1,Anoka,2.6,262
2,Becker,2.7,26
3,Beltrami,3.7,55
4,Benton,3.2,37
...,...,...,...
83,Watonwan,*,*
84,Wilkin,*,*
85,Winona,0.5,9
86,Wright,1.1,45


In [26]:
# checking the length of dataframe
len(df_asthma)

88

In [27]:
#Checking for null values
df_asthma.isnull().sum()

County                          0
Age-adjusted rate per 10,000    0
Count of cases                  0
dtype: int64

In [28]:
# checking for data types 
df_asthma.dtypes

County                          object
Age-adjusted rate per 10,000    object
Count of cases                  object
dtype: object

In [29]:
df_asthma['Age-adjusted rate per 10,000'] = pd.to_numeric(df_asthma['Age-adjusted rate per 10,000']
                                                                           ,errors='coerce')
df_asthma.dtypes

County                           object
Age-adjusted rate per 10,000    float64
Count of cases                   object
dtype: object

In [30]:
df_asthma['Count of cases'] = pd.to_numeric(df_asthma['Count of cases'], errors='coerce')
                                                                           
df_asthma.dtypes

County                           object
Age-adjusted rate per 10,000    float64
Count of cases                  float64
dtype: object

In [31]:
df_asthma


Unnamed: 0,County,"Age-adjusted rate per 10,000",Count of cases
0,Aitkin,2.0,9.0
1,Anoka,2.6,262.0
2,Becker,2.7,26.0
3,Beltrami,3.7,55.0
4,Benton,3.2,37.0
...,...,...,...
83,Watonwan,,
84,Wilkin,,
85,Winona,0.5,9.0
86,Wright,1.1,45.0


In [32]:
# saving the file in json form
df_asthma.to_json('Output/asthma(2018-2020).json', orient='records')