In [1]:
# importing the dependencies
import pandas as pd
import numpy


In [2]:
# Making the dataframe to inspect the data
df_cancer = pd.read_csv("Data/Countytable.csv")
df_cancer.head()

Unnamed: 0,Cancer header,County,County population,Note,Rate,Cancer type,Sex
0,All cancer types combined,Mower,39807,,410.7,All cancer types combined,Everyone
1,All cancer types combined,Todd,24494,,414.3,All cancer types combined,Everyone
2,All cancer types combined,Fillmore,20949,,416.5,All cancer types combined,Everyone
3,All cancer types combined,Lac qui Parle,6719,,422.4,All cancer types combined,Everyone
4,All cancer types combined,Koochiching,12515,,422.7,All cancer types combined,Everyone


In [3]:
# getting rid of some columns and renaming some columns
df_cancer.drop(["Cancer header", "Note","Cancer type","Sex"], axis = 1,inplace = True)
df_cancer.head()


Unnamed: 0,County,County population,Rate
0,Mower,39807,410.7
1,Todd,24494,414.3
2,Fillmore,20949,416.5
3,Lac qui Parle,6719,422.4
4,Koochiching,12515,422.7


In [4]:
df_cancer = df_cancer.rename(columns = {"Rate": "Rate per 100,000"})
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
0,Mower,39807,410.7
1,Todd,24494,414.3
2,Fillmore,20949,416.5
3,Lac qui Parle,6719,422.4
4,Koochiching,12515,422.7


In [5]:
# sort by county name 
df_cancer.sort_values(by = "County",inplace = True)
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
26,Aitkin,15834,449.5
64,Anoka,350253,486.5
69,Becker,34011,491.9
70,Beltrami,46403,492.3
7,Benton,40129,428.1


In [6]:
# setting the index
df_cancer.reset_index(drop= True, inplace = True)
df_cancer.head()

Unnamed: 0,County,County population,"Rate per 100,000"
0,Aitkin,15834,449.5
1,Anoka,350253,486.5
2,Becker,34011,491.9
3,Beltrami,46403,492.3
4,Benton,40129,428.1


In [7]:
# checking for null values
df_cancer.isnull().sum()

County               0
County population    0
Rate per 100,000     0
dtype: int64

In [8]:
df_cancer.duplicated()

0     False
1     False
2     False
3     False
4     False
      ...  
83    False
84    False
85    False
86    False
87    False
Length: 88, dtype: bool

In [9]:
df_cancer.dtypes

County                object
County population     object
Rate per 100,000     float64
dtype: object

In [10]:
# change the datatype of county population to integer
df_cancer['County population'] = df_cancer['County population'].str.replace(',', '').astype(float)
#df_cancer[['County population']].apply(pd.to_numeric) 
#df_cancer.dtypes
df_cancer.head()


Unnamed: 0,County,County population,"Rate per 100,000"
0,Aitkin,15834.0,449.5
1,Anoka,350253.0,486.5
2,Becker,34011.0,491.9
3,Beltrami,46403.0,492.3
4,Benton,40129.0,428.1


In [11]:
df_cancer.dtypes

County                object
County population    float64
Rate per 100,000     float64
dtype: object

In [12]:
# jsonify the dataframe
#importing the dependencies
import json
from json import loads, dumps

In [13]:
result = df_cancer.to_json(orient ='records')
result

'[{"County":"Aitkin","County population":15834.0,"Rate per 100,000":449.5},{"County":"Anoka","County population":350253.0,"Rate per 100,000":486.5},{"County":"Becker","County population":34011.0,"Rate per 100,000":491.9},{"County":"Beltrami","County population":46403.0,"Rate per 100,000":492.3},{"County":"Benton","County population":40129.0,"Rate per 100,000":428.1},{"County":"Big Stone","County population":4996.0,"Rate per 100,000":466.2},{"County":"Blue Earth","County population":66795.0,"Rate per 100,000":456.3},{"County":"Brown","County population":25163.0,"Rate per 100,000":496.8},{"County":"Carlton","County population":35633.0,"Rate per 100,000":466.7},{"County":"Carver","County population":101949.0,"Rate per 100,000":452.1},{"County":"Cass","County population":29268.0,"Rate per 100,000":493.3},{"County":"Chippewa","County population":11953.0,"Rate per 100,000":536.9},{"County":"Chisago","County population":55315.0,"Rate per 100,000":472.0},{"County":"Clay","County population":63

In [14]:
df_cancer.to_json('Output/cancer.json(2015-2019)', orient='records', lines=True)

In [15]:
#reading the healthoutcome excel file 
df_healthoutcome = pd.read_excel("Data/HealthOutcomeAndFactors.xlsx",header = 1)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,Z-Score,Rank,Z-Score.1,Rank.1
0,,,,,,,
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0


In [16]:
#Renaming the columns
df_healthoutcome.rename( columns = {"Z-Score":"HealthOutcome(Z-Score)", 
                                    "Rank":"HealthOutcome(Rank)","Z-Score.1":"HealthFactor(Z-Score)",
                                    "Rank.1":"HealthFactor(Rank)"}, inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,,,,,,,
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0


In [18]:
df_healthoutcome.drop(index = 0,inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
1,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
2,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
3,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
4,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0
5,27009.0,Minnesota,Benton,-0.022054,47.0,0.114975,59.0


In [20]:
df_healthoutcome.reset_index( drop = True, inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,State,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,27001.0,Minnesota,Aitkin,0.43145,69.0,0.761015,82.0
1,27003.0,Minnesota,Anoka,-0.426424,23.0,-0.413081,17.0
2,27005.0,Minnesota,Becker,0.472333,71.0,0.169356,63.0
3,27007.0,Minnesota,Beltrami,1.717782,86.0,0.763052,83.0
4,27009.0,Minnesota,Benton,-0.022054,47.0,0.114975,59.0


In [22]:
df_healthoutcome.drop( columns = ["State"], inplace = True)
df_healthoutcome.head()

Unnamed: 0,FIPS,County,HealthOutcome(Z-Score),HealthOutcome(Rank),HealthFactor(Z-Score),HealthFactor(Rank)
0,27001.0,Aitkin,0.43145,69.0,0.761015,82.0
1,27003.0,Anoka,-0.426424,23.0,-0.413081,17.0
2,27005.0,Becker,0.472333,71.0,0.169356,63.0
3,27007.0,Beltrami,1.717782,86.0,0.763052,83.0
4,27009.0,Benton,-0.022054,47.0,0.114975,59.0


In [23]:
df_healthoutcome.to_json('Output/healthoutcome.json(2023)', orient='records', lines=True)