# Observations and Insights

<hr>

In [None]:
# Dependencies and Setup (John)
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import calendar

# CalFresh data file path (DeJuan)
calfresh_data_path = "Resources/CalFresh Data 19 - Current.csv"
# Census data file path (Jackson)
census_data_path = "Resources/co-est2019-alldata.csv"
# us-counties data file path (Siddharth)
all_counties_path = "Resources/us-counties.csv"
# Additional CSV inputs kept under Resources/. The covid_merge and Top/Bottom 10
# files are loaded with pd.read_csv further down in this notebook.
# NOTE(review): CA_COVID_Counties_Path is not read in the cells reviewed here --
# confirm it is still needed.
CA_COVID_Counties_Path = "Resources/CA_COVID_Counties_Data.csv"
covid_merge_path = "Resources/covid_merge.csv"
Top_10_Counties_path = "Resources/Top 10 Counties By Infection.csv"
Bottom_10_Counties_path = 'Resources/Bottom 10 Counties By Infection.csv'


In [None]:
# Load and clean the CalFresh dataset (John)
calfresh_data = pd.read_csv(calfresh_data_path)

# Rename columns to shorter labels.
# The unemployment header may carry a trailing tab in the raw CSV. It is normalised
# to 'Unemployment Monthly' (not a different name) because the rest of this cell
# looks the column up under that label; when the header has no tab this entry is
# simply a no-op.
calfresh_data.rename(columns={'Calendar Year': 'Year',
                              'Unemployment Monthly\t': 'Unemployment Monthly',
                              'CalFresh Households': 'Households',
                              'CalFresh Persons': 'Persons',
                              'EBT_FSP_dollars': 'EBT Cash'}, inplace=True)

# Drop the columns at positions 2, 7 and 8, which are not needed. (John)
calfresh_data.drop(calfresh_data.columns[[2, 7, 8]], axis=1, inplace=True)

# NOTE: row 0 is still present, exactly as before. Earlier attempts such as
# `calfresh_data.drop(labels=[0], axis=0)` appeared to do nothing because
# DataFrame.drop returns a NEW frame unless the result is assigned back or
# inplace=True is passed.

# Remove commas (thousands separators) from every string cell. (John)
calfresh_df = pd.DataFrame(calfresh_data)
calfresh_df.replace(',', '', regex=True, inplace=True)

# Convert the count columns to integers. (John)
calfresh_df = calfresh_df.astype({'Households': int, 'Persons': int, 'EBT Cash': int})

# From the fourth column onward: strip '%' and ',' and cast to float.
# A raw string is used so the pattern contains no invalid '\%' escape sequence.
# NOTE(review): if the integer count columns sit at position 3 or later, this float
# cast overrides the int conversion above -- confirm that is intended.
calfresh_df[calfresh_df.columns[3:]] = calfresh_df[calfresh_df.columns[3:]
                                                  ].replace(r'[%,]', '', regex=True).astype(float)
# Express unemployment as a fraction (percent / 100), rounded to 3 decimals.
calfresh_df['Unemployment Monthly'] = calfresh_df['Unemployment Monthly'].div(100).round(3)

# Keep every year except 2019, i.e. 2020 onward in this dataset (DHall)
calfresh_data20_21 = calfresh_df[calfresh_df['Year'] != 2019]

# Export the filtered data to the Resources folder (DHall)
calfresh_data20_21.to_csv('Resources/calfresh_data20_21.csv')

calfresh_data20_21

In [None]:
# CA County Data (Jackson)
# Data cleanup: keep California rows only, drop the FIPS column, and replace the
# date with separate year / month / day columns.
# The first CSV column becomes a parsed date index here; the code below expects it
# to be named `date`.
all_counties = pd.read_csv(all_counties_path, index_col=0, parse_dates=True)

# Drop FIPS column
all_counties_df = all_counties.drop(columns=["fips"])

# Isolate counties to CA. reset_index() (not inplace) returns a fresh frame with the
# date index moved back into a regular column, so the filtered selection is never
# mutated in place (which can raise SettingWithCopyWarning).
CA_counties_df = all_counties_df[all_counties_df["state"] == "California"].reset_index()

# Split the date text on '-' into year / month / day.
CA_counties_df = CA_counties_df.astype({'date': str})
CA_counties_df[["year", "month", "day"]] = CA_counties_df["date"].str.split("-", expand=True)
CA_counties_df = CA_counties_df.astype({'deaths': int, 'year': int, 'month': int, 'day': int})

# Replace the month number with its English name (1 -> 'January', ...).
CA_counties_df['month'] = CA_counties_df['month'].apply(lambda x: calendar.month_name[x])

# Remove the original date column. drop() is used instead of pop() so the removed
# Series is not echoed as this cell's output.
CA_counties_df = CA_counties_df.drop(columns=['date'])

#CA_counties_df

In [None]:
# Monthly case totals per county: cast `cases` to int, sum within each
# (county, year, month) group, then turn the group keys back into ordinary columns.
# NOTE(review): if `cases` is a cumulative running total per day, summing the daily
# rows overstates the monthly figure -- confirm against the data source.
county_cases = (
    CA_counties_df.astype({"cases": int})
    .groupby(['county', 'year', 'month'])["cases"]
    .sum()
    .reset_index()
)
county_cases

In [None]:
# Monthly death totals per county, aggregated over (county, year, month).
county_deaths = (
    CA_counties_df.astype({"deaths": int})
    .groupby(['county', 'year', 'month'])["deaths"]
    .sum()
    .reset_index()
)

# Left-join the death totals onto the case totals; both frames use the same
# county / year / month key columns.
county_merge = county_cases.merge(county_deaths, how='left', on=['county', 'year', 'month'])

# Capitalise the column names so they match the CalFresh data when merging (DHall)
county_merge = county_merge.rename(columns={'county': 'County',
                                            'year': 'Year',
                                            'month': 'Month',
                                            'cases': 'Cases',
                                            'deaths': 'Deaths'})

county_merge

In [None]:
# Load the already-merged dataset from CSV. No merge is performed in this cell;
# presumably the file holds the CalFresh data joined with county_merge on county,
# year and month -- confirm how it was produced.
covid_merge = pd.read_csv(covid_merge_path)

# Preview the first 100 rows.
covid_merge.head(100)


In [None]:
# Load the Census estimates and rename the columns used here to initial caps.
# NOTE(review): the Census Bureau's original co-est2019-alldata.csv is reportedly
# Latin-1 encoded; if this read raises UnicodeDecodeError, pass encoding='latin-1'.
census_data = pd.read_csv(census_data_path)
census_data = census_data.rename(columns={'STNAME': 'State',
                                          'CTYNAME': 'County',
                                          'POPESTIMATE2019': 'Population'})
census_df = pd.DataFrame(census_data)

# Keep only the California rows (Siddharth)
census_ca = census_df[census_df['State'] == 'California']

# Only County and Population are retained; State and DEATHS2019 are left out.
census_filtered = census_ca[['County', 'Population']]

# Write Census Data to CSV
census_filtered.to_csv('Resources/census_filtered.csv')

census_filtered.head()

In [None]:
# Load the precomputed "Top 10 Counties By Infection" table: 14-month overall data
# (Jan 2020 - Feb 2021) for the counties worst hit by Covid cases.
Top_10_Counties = pd.read_csv(Top_10_Counties_path)
# Display the table as the cell output.
Top_10_Counties   

In [None]:
# Create and save a bar plot of the Covid infection rate for the 10 most infected counties (DHALL)
Top10COV = Top_10_Counties[["County", "Rate Of COVID Infection"]]
Top10COV = Top10COV.set_index("County")

Top10COV.plot(kind="bar", figsize=(10, 5))

plt.title("10 Most Covid Infected Counties")
plt.ylabel("Infection Rate %")
plt.xlabel("County")

# Lay out and save BEFORE plt.show(): once show() returns the current figure may
# already be closed, in which case savefig() would write an empty image.
plt.tight_layout()
plt.savefig('Resources/Top10COV.png')
plt.show()

In [None]:
# Load the precomputed "Bottom 10 Counties By Infection" table: 14-month overall data
# (Jan 2020 - Feb 2021) for the counties least hit by Covid cases.
Bottom_10_Counties = pd.read_csv(Bottom_10_Counties_path)
# Display the table as the cell output.
Bottom_10_Counties

In [None]:
# Create and save a bar plot of the Covid infection rate for the 10 least infected counties (DHALL)
Bot10COV = Bottom_10_Counties[["County", "Rate Of COVID Infection"]]
Bot10COV = Bot10COV.set_index("County")

Bot10COV.plot(kind="bar", figsize=(10, 5))

plt.title("10 Least Covid Infected Counties")
plt.ylabel("Infection Rate %")
plt.xlabel("County")

# Lay out and save BEFORE plt.show(): once show() returns the current figure may
# already be closed, in which case savefig() would write an empty image.
plt.tight_layout()
plt.savefig('Resources/Bot10COV.png')
plt.show()

In [None]:
# Load the combined CalFresh, census and county Covid dataset from CSV (DHALL)
# The path uses a forward slash like every other path in this notebook: a backslash
# separator only resolves on Windows, and '\M' is an invalid string escape.
Merged_CalFresh_Covid_Census_Data_path = 'Resources/Merged CalFresh_Covid_Census_Data.csv'

All_Merged_data = pd.read_csv(Merged_CalFresh_Covid_Census_Data_path)

# Preview the first rows.
All_Merged_data.head()


# Summary Statistics

## Line Chart 

In [None]:
# Show CA state covid cases over months using line plot. Years on x-axis and population on y-axis
# Timeframe: Jan 2020 to March 2021


In [None]:
# Graph trends amongst various counties on a single plot. Pick 5, including at least one urban area.
# Timeframe: Jan 2020 to March 2021


In [None]:
# Correlation Rise of Covid and CalFresh (maybe unemployment) vs number of people receiving CalFresh aid.

### Bar Chart

In [None]:
# Number of Covid cases in all counties. Sort descending.

In [None]:
# Number of Covid cases in top 10 counties. Sort descending.