In [1]:
#Dependencies
import pandas as pd

In [2]:
# Set the file paths of the four raw Colorado inputs (population, crime counts,
# voting results, unemployment). The paths are relative, so they are resolved
# against the directory the notebook is run from.
colorado_county_population_path="../../Data_files/raw_data/colorado_raw_data/colorado_population_bycounty.csv"
colorado_county_crime_path="../../Data_files/raw_data/colorado_raw_data/colorado_county_crimecounts.csv"
colorado_county_vote_path = "../../Data_files/raw_data/colorado_raw_data/county_vote_CO.csv" 
colorado_county_unemployment_path = "../../Data_files/raw_data/colorado_raw_data/colorado_unemployment.csv"

In [3]:
# Read each csv file into its own raw dataframe.
# header=3 / header=5 give the 0-indexed file row that holds the column names;
# pandas skips the rows above it (presumably title/preamble lines of the export — TODO confirm).
colorado_county_raw_population_df=pd.read_csv(colorado_county_population_path,header=3)
colorado_county_raw_crime_df=pd.read_csv(colorado_county_crime_path)
colorado_county_raw_vote_df=pd.read_csv(colorado_county_vote_path)
colorado_county_raw_unemployment_df= pd.read_csv(colorado_county_unemployment_path,header=5)

## Cleanup Process of Colorado Population Data

In [4]:
# Show the last 10 rows: the file ends with note/citation/source lines
# that are not county data and have no population values
colorado_county_raw_population_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
60,".Summit County, Colorado",27994.0,27994.0,28073.0,28042.0,28298.0,28728.0,29317.0,30045.0,30552.0,30820.0,30817.0,31011.0
61,".Teller County, Colorado",23350.0,23359.0,23469.0,23369.0,23445.0,23356.0,23433.0,23388.0,24058.0,24674.0,25113.0,25388.0
62,".Washington County, Colorado",4814.0,4812.0,4821.0,4762.0,4701.0,4743.0,4733.0,4786.0,4834.0,4921.0,4888.0,4908.0
63,".Weld County, Colorado",252825.0,252827.0,254207.0,258827.0,264131.0,270165.0,277112.0,285819.0,295712.0,305885.0,314815.0,324492.0
64,".Yuma County, Colorado",10043.0,10043.0,10049.0,10134.0,10100.0,10132.0,10184.0,10037.0,10055.0,9959.0,9947.0,10019.0
65,Note: The estimates are based on the 2010 Cens...,,,,,,,,,,,,
66,Suggested Citation:,,,,,,,,,,,,
67,Annual Estimates of the Resident Population fo...,,,,,,,,,,,,
68,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,,
69,Release Date: March 2020,,,,,,,,,,,,


In [5]:
# Build the cleaned population table in one chain, leaving the raw dataframe untouched.
# Overwriting the raw frame made this cell fail on a second run (drop(0) raised a
# KeyError because row 0 was already gone); reading from the unmodified raw frame
# makes the cell safe to re-run.
colorado_county_population_df=(
    colorado_county_raw_population_df
    # Keep the county label column and the 2018 population estimate
    .loc[:,["Unnamed: 0","2018"]]
    # Row 0 precedes the first county (presumably the state-wide total — TODO confirm);
    # rows 65-69 are the note/citation/source/release-date footer lines
    .drop([0,65,66,67,68,69])
    # Rename columns
    .rename(columns={"Unnamed: 0":"County","2018":"Population"})
)

In [6]:
# Preview the first rows of the trimmed and renamed population table
colorado_county_population_df.head()

Unnamed: 0,County,Population
1,".Adams County, Colorado",511354
2,".Alamosa County, Colorado",16248
3,".Arapahoe County, Colorado",651797
4,".Archuleta County, Colorado",13730
5,".Baca County, Colorado",3584


## Cleanup Process of Colorado Crime Data

In [7]:
# Leave only the bare county name, e.g. ".Adams County, Colorado" -> "Adams".
# This works on the column itself: assigning to the `row` yielded by iterrows()
# is not guaranteed to write back to the dataframe (the row can be a copy), so
# the previous loop could silently leave the names unchanged.
# regex=False makes both patterns literal text; "." would otherwise match any character.
colorado_county_population_df["County"]=(
    colorado_county_population_df["County"]
    .str.replace(" County, Colorado","",regex=False)
    .str.replace(".","",regex=False)
)

In [8]:
# Preview the first rows of the raw crime table
colorado_county_raw_crime_df.head()

Unnamed: 0,Metropolitan/Nonmetropolitan,County,Violent\ncrime,Property\ncrime
0,Metropolitan Counties,Arapahoe,350.0,1865
1,,Boulder,78.0,672
2,,Clear Creek,24.0,56
3,,Douglas,366.0,2489
4,,Elbert,19.0,3


In [9]:
# Keep the county name and the two crime-count columns
# (these column labels contain a line break, hence the "\n")
colorado_county_crime_df=colorado_county_raw_crime_df.loc[
    :,["County","Violent\ncrime","Property\ncrime"]
]
colorado_county_crime_df.head()

Unnamed: 0,County,Violent\ncrime,Property\ncrime
0,Arapahoe,350.0,1865
1,Boulder,78.0,672
2,Clear Creek,24.0,56
3,Douglas,366.0,2489
4,Elbert,19.0,3


In [10]:
# Drop row no.46
# NOTE(review): the index is hard-coded and the row's content is not shown in this notebook;
# presumably it is a non-county row (e.g. a blank or section line) — confirm against the raw CSV
colorado_county_crime_df=colorado_county_crime_df.drop(46)

In [11]:
# Remove surrounding whitespace from the county names so they match the other tables.
# .str.strip() handles the whole column at once and keeps missing values as NaN,
# whereas calling county.strip() row by row raises on a non-string value.
colorado_county_crime_df["County"]=colorado_county_crime_df["County"].str.strip()

In [12]:
# Swap the line-broken CSV headers for single-line column names
colorado_county_crime_df=colorado_county_crime_df.rename(
    columns={
        "Violent\ncrime":"Violent Crime",
        "Property\ncrime":"Property Crime",
    }
)

## Process of Merging Colorado Crime and Population Data

In [13]:
# Join crime counts with population on the county name.
# This is an inner join (the pandas default): a county missing from either table is left out.
colorado_county_combined_df=colorado_county_crime_df.merge(
    colorado_county_population_df,
    how="inner",
    on="County",
)
colorado_county_combined_df.head()

Unnamed: 0,County,Violent Crime,Property Crime,Population
0,Arapahoe,350.0,1865,651797
1,Boulder,78.0,672,324636
2,Clear Creek,24.0,56,9604
3,Douglas,366.0,2489,342989
4,Elbert,19.0,3,26303


In [14]:
# Check the datatypes: columns reported as object hold text and
# must be converted to numbers before doing arithmetic on them
colorado_county_combined_df.dtypes

County             object
Violent Crime     float64
Property Crime     object
Population         object
dtype: object

In [15]:
# Remove the "," thousands separators and change the datatype to numeric.
# The separator is only stripped while a column still holds text, so the cell can be
# re-run: previously a second run raised AttributeError because the .str accessor
# was used on columns that were already numeric.
for column in ["Population","Property Crime"]:
    values=colorado_county_combined_df[column]
    if not pd.api.types.is_numeric_dtype(values):
        # regex=False: treat "," as literal text
        values=values.str.replace(",","",regex=False)
    colorado_county_combined_df[column]=pd.to_numeric(values)

In [16]:
# Crime rate per 1,000 inhabitants = crime count / (population / 1000), rounded to 3 decimals
population_in_thousands = colorado_county_combined_df["Population"]/1000
vc_rate = (colorado_county_combined_df["Violent Crime"]/population_in_thousands).round(3)
pc_rate = (colorado_county_combined_df["Property Crime"]/population_in_thousands).round(3)

In [17]:
# Attach both rates to the combined table as new columns
colorado_county_combined_df=colorado_county_combined_df.assign(**{
    "Violent Crime per 1,000 inhabitants":vc_rate,
    "Property Crime per 1,000 inhabitants":pc_rate,
})

In [18]:
# Keep only the per-1,000 rates; remove the two total-count columns
colorado_county_combined_df=colorado_county_combined_df.drop(
    columns=["Violent Crime","Property Crime"]
)
colorado_county_combined_df.head()

Unnamed: 0,County,Population,"Violent Crime per 1,000 inhabitants","Property Crime per 1,000 inhabitants"
0,Arapahoe,651797,0.537,2.861
1,Boulder,324636,0.24,2.07
2,Clear Creek,9604,2.499,5.831
3,Douglas,342989,1.067,7.257
4,Elbert,26303,0.722,0.114


## Cleanup Process of Colorado Voting Data

In [19]:
# Preview the first rows of the raw voting table
colorado_county_raw_vote_df.head()

Unnamed: 0,county,dem_gov_2014_%,rep_gov_2014_%,dem_house_2012_%,rep_house_2012_%,dem_house_2014_%,rep_house_2014_%,dem_house_2016_%,rep_house_2016_%,dem_pres_2012_%,rep_pres_2012_%,dem_pres_2016_%,rep_pres_2016_%,dem_senate_2014_%,rep_senate_2014_%,dem_senate_2016_%,rep_senate_2016_%,label_3
0,Adams,0.501126,0.444168,0.526886,0.401861,0.51163,0.470264,0.514574,0.42628,0.571123,0.402724,0.498552,0.413482,0.476951,0.448761,0.526509,0.41236,DEM
1,Alamosa,0.509386,0.433133,0.486207,0.458622,0.425093,0.515413,0.495194,0.462774,0.567535,0.402829,0.459576,0.438968,0.474155,0.456665,0.529977,0.415089,DEM
2,Arapahoe,0.520078,0.439368,0.485311,0.45,0.455291,0.490463,0.457761,0.478489,0.539042,0.439863,0.52759,0.386252,0.480201,0.464848,0.54004,0.409837,DEM
3,Archuleta,0.39425,0.55521,0.339208,0.595602,0.297932,0.634433,0.356941,0.597823,0.397832,0.574993,0.340646,0.581006,0.372272,0.569045,0.379206,0.5615,Swing
4,Baca,0.217578,0.725616,0.247859,0.698237,0.178631,0.754591,0.142857,0.815436,0.221747,0.740266,0.131444,0.814213,0.189887,0.739107,0.269878,0.687091,REP


In [20]:
# Keep only the county name and its party label
colorado_county_raw_vote_df=colorado_county_raw_vote_df.loc[:,["county","label_3"]]

In [21]:
# Give the vote table readable column names
colorado_county_vote_df = colorado_county_raw_vote_df.rename(
    columns={"county": "County", "label_3": "Party"}
)

# Translate each party label into its colour; any label other than
# REP / Swing / DEM gets an empty string
party_colors = {"REP": "Red", "Swing": "Purple", "DEM": "Blue"}
colorado_county_vote_df["Affiliated Party Color"] = colorado_county_vote_df["Party"].map(
    lambda party: party_colors.get(party, "")
)
colorado_county_vote_df.head()

Unnamed: 0,County,Party,Affiliated Party Color
0,Adams,DEM,Blue
1,Alamosa,DEM,Blue
2,Arapahoe,DEM,Blue
3,Archuleta,Swing,Purple
4,Baca,REP,Red


In [22]:
# Only the county name and its colour are needed for the later merges
colorado_county_color_df = colorado_county_vote_df.loc[:, ["County", "Affiliated Party Color"]]

colorado_county_color_df.head()

Unnamed: 0,County,Affiliated Party Color
0,Adams,Blue
1,Alamosa,Blue
2,Arapahoe,Blue
3,Archuleta,Purple
4,Baca,Red


## Cleanup Process of Colorado Unemployment Data

In [23]:
# Display the last 5 rows to check for footer lines after the county data
colorado_county_raw_unemployment_df.tail()

Unnamed: 0,Rank,Area,Unemployment Rate
61,62,Baca County,1.8%
62,63,Cheyenne County,1.8%
63,64,Kiowa County,1.8%
64,"Source: LAUS Unit, LAUS system output file",,
65,Downloaded: 05/01/2020 2:05 PM,,


In [24]:
# Drop the two footer rows (source / download date) and the "Rank" column in one call
colorado_county_unemployment_df=colorado_county_raw_unemployment_df.drop(
    index=[64,65],
    columns="Rank",
)
colorado_county_unemployment_df

Unnamed: 0,Area,Unemployment Rate
0,Huerfano County,6.4%
1,Fremont County,5.0%
2,Otero County,4.9%
3,Rio Grande County,4.8%
4,Pueblo County,4.7%
...,...,...
59,Phillips County,1.9%
60,Yuma County,1.9%
61,Baca County,1.8%
62,Cheyenne County,1.8%


In [25]:
# Clean the unemployment table in a single chain
colorado_county_unemployment_df=(
    colorado_county_unemployment_df
    .assign(**{
        # "3.3%" -> 3.3 : strip the percent sign, then convert to a number
        "Unemployment Rate":lambda frame: pd.to_numeric(
            frame["Unemployment Rate"].str.replace("%","")
        ),
        # "Adams County" -> "Adams"
        "Area":lambda frame: frame["Area"].str.replace(" County",""),
    })
    # Use the same key column name as the other tables
    .rename(columns={"Area":"County"})
    # Order alphabetically by county and renumber the rows
    .sort_values(by="County")
    .reset_index(drop=True)
)
colorado_county_unemployment_df

Unnamed: 0,County,Unemployment Rate
0,Adams,3.3
1,Alamosa,3.9
2,Arapahoe,3.1
3,Archuleta,3.3
4,Baca,1.8
...,...,...
59,Summit,2.1
60,Teller,3.6
61,Washington,2.4
62,Weld,2.9


## Process of Merging Remaining Data

In [26]:
# Join party colour and unemployment rate on the county name (inner join by default)
vote_unemployment_merge = colorado_county_color_df.merge(
    colorado_county_unemployment_df,
    on="County",
)
vote_unemployment_merge.tail()

Unnamed: 0,County,Affiliated Party Color,Unemployment Rate
59,Summit,Blue,2.1
60,Teller,Red,3.6
61,Washington,Red,2.4
62,Weld,Purple,2.9
63,Yuma,Red,1.9


In [27]:
# Add population and crime rates to the vote/unemployment table, matching on county name
colorado_county_merge = vote_unemployment_merge.merge(
    colorado_county_combined_df,
    on="County",
)
colorado_county_merge.head()

Unnamed: 0,County,Affiliated Party Color,Unemployment Rate,Population,"Violent Crime per 1,000 inhabitants","Property Crime per 1,000 inhabitants"
0,Arapahoe,Blue,3.1,651797,0.537,2.861
1,Archuleta,Purple,3.3,13730,3.569,6.846
2,Baca,Red,1.8,3584,0.837,0.558
3,Bent,Purple,3.1,5808,5.51,32.197
4,Boulder,Blue,2.7,324636,0.24,2.07


In [28]:
# Write the merged table to the clean-data folder (index=False leaves the row index
# out of the file), then display the merged dataframe
colorado_county_merge.to_csv("../../Data_files/clean_data/colorado_clean_data/colorado_county_data(combined_info).csv", index=False)
colorado_county_merge

Unnamed: 0,County,Affiliated Party Color,Unemployment Rate,Population,"Violent Crime per 1,000 inhabitants","Property Crime per 1,000 inhabitants"
0,Arapahoe,Blue,3.1,651797,0.537,2.861
1,Archuleta,Purple,3.3,13730,3.569,6.846
2,Baca,Red,1.8,3584,0.837,0.558
3,Bent,Purple,3.1,5808,5.51,32.197
4,Boulder,Blue,2.7,324636,0.24,2.07
5,Chaffee,Purple,2.6,20041,0.549,2.994
6,Cheyenne,Red,1.8,1859,0.0,0.538
7,Clear Creek,Blue,2.9,9604,2.499,5.831
8,Conejos,Blue,4.1,8181,0.611,3.789
9,Crowley,Red,4.2,5957,1.847,7.051
