# Observations and Insights

<hr>

In [1]:
# Dependencies and Setup (John)
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import calendar

# CalFresh data file path (DeJuan)
calfresh_data_path = "Resources/CalFresh Data 19 - Current.csv"
# Census data file path (Jackson)
census_data_path = "Resources/co-est2019-alldata.csv"
# us-counties data file path (Siddharth)
all_counties_path = "Resources/us-counties.csv"


In [2]:
# Read the all three datasets (John)
calfresh_data = pd.read_csv(calfresh_data_path)

# rename columns.
calfresh_data.rename(columns = {'Calendar Year' : 'Year', 'Unemployment Monthly	' : 'Unemployment', 
                                'CalFresh Households' : 'Households', 'CalFresh Persons' : 'Persons', 
                                'EBT_FSP_dollars' : 'EBT Cash'}, inplace = True)
 
# Use .drop function. Removed columns 7-8 as not needed, and index 0 (row 1). (John)
calfresh_data.drop(calfresh_data.columns[[2, 7, 8]], axis = 1, inplace = True)

## ** Ask TA why index 0 can't be dropped ---------------- ** ##
#calfresh_data.drop(labels=[0], axis=0)
#calfresh_data.drop(calfresh_data.index [[ 0]])
## ** ---------------------------------------------------- ** ##

# remove commas in the entire dataframe and convert number strings to a number int. (John)
calfresh_df = pd.DataFrame(calfresh_data)
calfresh_df.replace(',','', regex=True, inplace=True)

# Cconvert specific columns to number dtype (John)
calfresh_df = calfresh_df.astype({'Households': int, 'Persons': int, 'EBT Cash': int})

# convert unemployment from obj to float, removed "%", divide by 100 and round to 3 digits.
calfresh_df[calfresh_df.columns[3:]] = calfresh_df[calfresh_df.columns[3:]
                                                  ].replace('[\%,]', '', regex=True).astype(float)
calfresh_df['Unemployment Monthly'] = calfresh_df['Unemployment Monthly'].div(100).round(3)

#Filter calfreash data by 2020 and 2021 (DHall)
calfresh_data20_21 = calfresh_df[calfresh_df['Year']!=2019]

#Export Updated CSV to Resource Folder (DHall)
calfresh_data20_21.to_csv('Resources/calfresh_data20_21.csv')

calfresh_data20_21

Unnamed: 0,County,Month,Year,Unemployment Monthly,Households,Persons,EBT Cash
708,Statewide,January,2020,0.043,2176109.0,4075962.0,500169011.0
709,Alameda,January,2020,0.031,65302.0,112881.0,13678807.0
710,Alpine,January,2020,0.046,77.0,126.0,14377.0
711,Amador,January,2020,0.047,1496.0,2588.0,305508.0
712,Butte,January,2020,0.056,17028.0,28648.0,3503997.0
...,...,...,...,...,...,...,...
1529,Tulare,February,2021,0.114,45068.0,101481.0,17778143.0
1530,Tuolumne,February,2021,0.087,3170.0,4907.0,786603.0
1531,Ventura,February,2021,0.068,36565.0,67207.0,11082712.0
1532,Yolo,February,2021,0.068,0.0,0.0,3572432.0


In [3]:
#CA County Data (Jackson)
#Data cleanup: Separate date into 3 columns, MONTH, DATE, YEAR. Isolate counties to CA. Drop FIPS column. 
all_counties = pd.read_csv(all_counties_path,index_col=0, parse_dates=True)

#Drop FIPS column
all_counties_df = pd.DataFrame(all_counties).drop(["fips"], axis=1)

#Isolate counties to CA 
CA_counties_df = all_counties_df[all_counties_df["state"] == "California"]

CA_counties_df.reset_index(inplace=True)
CA_counties_df = CA_counties_df.astype({'date': str})
CA_counties_df[["year", "month", "day"]] = CA_counties_df["date"].str.split("-", expand = True)
CA_counties_df = CA_counties_df.astype({'deaths': int, 'year': int, 'month': int, 'day': int})
CA_counties_df['month'] =  CA_counties_df['month'].apply(lambda x: calendar.month_name[x])

CA_counties_df.pop('date')  # Deleted original date column

#CA_counties_df

0        2020-01-25
1        2020-01-26
2        2020-01-26
3        2020-01-27
4        2020-01-27
            ...    
23292    2021-04-22
23293    2021-04-22
23294    2021-04-22
23295    2021-04-22
23296    2021-04-22
Name: date, Length: 23297, dtype: object

In [4]:
# groupby multiple column and sum by cases; convert groupby into dataframe using .to_frame()
#county_group = CA_counties_df.groupby(['county', 'year', 'month'])["cases"].apply(lambda x : x.astype(
#    int).sum()).to_frame().reset_index()

county_cases = CA_counties_df.groupby(['county', 'year', 'month'])["cases"].apply(lambda x : x.astype(
    int).sum()).to_frame().reset_index()
county_cases.head()

Unnamed: 0,county,year,month,cases
0,Alameda,2020,April,30470
1,Alameda,2020,August,463338
2,Alameda,2020,December,1247644
3,Alameda,2020,July,268879
4,Alameda,2020,June,136574


In [5]:
county_deaths = CA_counties_df.groupby(['county', 'year', 'month'])["deaths"].apply(lambda x : x.astype(
    int).sum()).to_frame().reset_index()
# merge two county data sets on county, year and month
county_merge = pd.merge(county_cases, county_deaths, how='left', left_on =[
    'county', 'year', 'month'], right_on =['county', 'year', 'month'])
#CA_counties_df.to_csv("Resources/CA_County_Data.csv")
county_merge.head()

Unnamed: 0,county,year,month,cases,deaths
0,Alameda,2020,April,30470,988
1,Alameda,2020,August,463338,6785
2,Alameda,2020,December,1247644,17659
3,Alameda,2020,July,268879,4909
4,Alameda,2020,June,136574,3458


In [7]:
# merge two CalFresh and county_merge datasets on county, year and month
covid_merge = pd.merge(calfresh_data20_21, county_merge, how='left', left_on =[
    'County', 'Year', 'Month'], right_on =['county', 'year', 'month'])
#CA_counties_df.to_csv("Resources/CA_County_Data.csv")
covid_merge.head()

## ** -----  NEED TO FILL/REPLACE NaN DATA WITH FILLER  ----- ##

Unnamed: 0,County,Month,Year,Unemployment Monthly,Households,Persons,EBT Cash,county,year,month,cases,deaths
0,Statewide,January,2020,0.043,2176109.0,4075962.0,500169011.0,,,,,
1,Alameda,January,2020,0.031,65302.0,112881.0,13678807.0,,,,,
2,Alpine,January,2020,0.046,77.0,126.0,14377.0,,,,,
3,Amador,January,2020,0.047,1496.0,2588.0,305508.0,,,,,
4,Butte,January,2020,0.056,17028.0,28648.0,3503997.0,,,,,


In [None]:
# Filter to show necessary columns adn rename columns to use initial Caps.
census_data = pd.read_csv(census_data_path)
census_data.rename(columns = {'STNAME' : 'State', 'CTYNAME' : 'County', 
                              'POPESTIMATE2019' : 'Population'}, inplace = True)
census_df = pd.DataFrame(census_data)

# filter rows in state that equal California and required columns (Siddharth)
census_ca = census_df.loc[census_df['State'] == 'California']
census_filtered = census_ca[['State', 'County', 'Population', 'DEATHS2019']]
census_filtered.head()

In [None]:
# Combine CalFresh, census and counties data into a single dataset (John)


# Summary Statistics

## Line Chart 

In [None]:
# Show CA state covid cases over months using line plot. Years on x-axis and population on y-axis
# Timeframe: Jan 2020 to March 2021


In [None]:
# Graph trends amoungst various counties on a single plot. Pick 5, at least one urban area
# Timeframe: Jan 2020 to March 2021


In [None]:
# Correlation Rise of Covid and CalFresh (maybe unemployment) vs number of people receiving CalFresh aid.

### Bar Chart

In [None]:
# Number of Covid cases in all counties. Sort decending

In [None]:
# Number of Covid cases in top 10 counties. Sort decending