# Observations and Insights

<hr>

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# CalFresh data file path (DeJuan)
calfresh_data_path = "Resources/CalFresh Data 19 - Current.csv"
# Census data file path (Jackson)
census_data_path = "Resources/co-est2019-alldata.csv"
# us-counties data file path (Siddharth)
all_counties_path = "Resources/us-counties.csv"


In [2]:
# Read the all three datasets (D,J,S)
# Lots of problems reading the file. After awhile, figured out 
# it was encoded ANSI and not utf-8. After re-coding, using notepad. it worked. lost 30 min.
calfresh_data = pd.read_csv(calfresh_data_path)

# rename columns.
calfresh_data.rename(columns = {'Calendar Year' : 'Year', 'Unemployment Monthly	' : 'Unemployment', 
                                'CalFresh Households' : 'Households', 'CalFresh Persons' : 'Persons', 
                                'EBT_FSP_dollars' : 'EBT Cash'}, inplace = True)

calfresh_data.head()

Unnamed: 0,County,Month,Date,Year,Unemployment Monthly,Households,Persons,Applications Recevied,Online Applications Received,EBT Cash
0,Statewide,January,Jan-19,2019,4.8%,1906041,3764747,165779,72917,936417639
1,Alameda,January,Jan-19,2019,0.3%,53529,93512,5515,2428,24484650
2,Alpine,January,Jan-19,2019,4.7%,70,114,4,1,27662
3,Amador,January,Jan-19,2019,4.9%,1462,2573,154,40,607423
4,Butte,January,Jan-19,2019,6.6%,16535,29634,2043,1085,7170854


In [3]:
#CA County Data
#Data cleanup: Separate date into 3 columns, MONTH, DATE, YEAR. Isolate counties to CA. Drop FIPS column. 

all_counties = pd.read_csv(all_counties_path,index_col=0, parse_dates=True)
all_counties.head()


Unnamed: 0_level_0,county,state,fips,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-21,Snohomish,Washington,53061.0,1,0.0
2020-01-22,Snohomish,Washington,53061.0,1,0.0
2020-01-23,Snohomish,Washington,53061.0,1,0.0
2020-01-24,Cook,Illinois,17031.0,1,0.0
2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [4]:
#Drop FIPS column
all_counties_df = pd.DataFrame(all_counties)

all_counties_df = all_counties_df.drop(["fips"], axis=1)
all_counties_df

Unnamed: 0_level_0,county,state,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21,Snohomish,Washington,1,0.0
2020-01-22,Snohomish,Washington,1,0.0
2020-01-23,Snohomish,Washington,1,0.0
2020-01-24,Cook,Illinois,1,0.0
2020-01-24,Snohomish,Washington,1,0.0
...,...,...,...,...
2021-04-22,Sweetwater,Wyoming,4151,37.0
2021-04-22,Teton,Wyoming,3715,9.0
2021-04-22,Uinta,Wyoming,2157,12.0
2021-04-22,Washakie,Wyoming,897,26.0


In [5]:
#Isolate counties to CA 

CA_counties_df = all_counties_df[all_counties_df["state"] == "California"]
CA_counties_df


Unnamed: 0_level_0,county,state,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-25,Orange,California,1,0.0
2020-01-26,Los Angeles,California,1,0.0
2020-01-26,Orange,California,1,0.0
2020-01-27,Los Angeles,California,1,0.0
2020-01-27,Orange,California,1,0.0
...,...,...,...,...
2021-04-22,Tulare,California,49469,831.0
2021-04-22,Tuolumne,California,4091,64.0
2021-04-22,Ventura,California,80398,1002.0
2021-04-22,Yolo,California,13606,200.0


In [6]:
#Separate date into three columns 
CA_counties_df.reset_index(inplace=True)

CA_counties_df["Day"] = CA_counties_df["date"].dt.day
CA_counties_df["Month"] = CA_counties_df["date"].dt.month
CA_counties_df["Year"] = CA_counties_df["date"].dt.year
CA_counties_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CA_counties_df["Day"] = CA_counties_df["date"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CA_counties_df["Month"] = CA_counties_df["date"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CA_counties_df["Year"] = CA_counties_df["date"].dt.year


Unnamed: 0,date,county,state,cases,deaths,Day,Month,Year
0,2020-01-25,Orange,California,1,0.0,25,1,2020
1,2020-01-26,Los Angeles,California,1,0.0,26,1,2020
2,2020-01-26,Orange,California,1,0.0,26,1,2020
3,2020-01-27,Los Angeles,California,1,0.0,27,1,2020
4,2020-01-27,Orange,California,1,0.0,27,1,2020
...,...,...,...,...,...,...,...,...
23292,2021-04-22,Tulare,California,49469,831.0,22,4,2021
23293,2021-04-22,Tuolumne,California,4091,64.0,22,4,2021
23294,2021-04-22,Ventura,California,80398,1002.0,22,4,2021
23295,2021-04-22,Yolo,California,13606,200.0,22,4,2021


In [7]:
CA_counties_df.rename(columns = {"date":"Date",
                                "county":"County",
                                "state":"State",
                                "cases":"Cases",
                                "deaths":"Deaths"}, inplace=True)
CA_counties_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Date,County,State,Cases,Deaths,Day,Month,Year
0,2020-01-25,Orange,California,1,0.0,25,1,2020
1,2020-01-26,Los Angeles,California,1,0.0,26,1,2020
2,2020-01-26,Orange,California,1,0.0,26,1,2020
3,2020-01-27,Los Angeles,California,1,0.0,27,1,2020
4,2020-01-27,Orange,California,1,0.0,27,1,2020
...,...,...,...,...,...,...,...,...
23292,2021-04-22,Tulare,California,49469,831.0,22,4,2021
23293,2021-04-22,Tuolumne,California,4091,64.0,22,4,2021
23294,2021-04-22,Ventura,California,80398,1002.0,22,4,2021
23295,2021-04-22,Yolo,California,13606,200.0,22,4,2021


In [8]:
CA_counties_df.to_csv("Resources/CA_County_Data.csv")

In [9]:
#### Had initial problems. Discovered that all US .gov docs are encoded ANSI, 
# or have fixed length columns. You must open in note pad and save as utf-8.
census_data = pd.read_csv(census_data_path)

# rename columns to use initial Caps.
census_data.rename(columns = {'STNAME' : 'State', 'CTYNAME' : 'County', 
                              'POPESTIMATE2019' : 'Population'}, inplace = True)
census_df = pd.DataFrame(census_data)
census_df.drop(census_df.columns[[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 
                                  22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 
                                  39, 40, 41, 42, 43, 44, 45, 46, 47]], axis = 1, inplace = True)

# filter rows in state that equal California
census_ca = census_df.loc[census_df['State'] == 'California']

census_ca.head()
#print('\nResult census_ca :\n'', 

# Create new pd.DataFrame using only State, County, 

# Checking if update works.

Unnamed: 0,State,County,Population,DEATHS2019,NATURALINC2010,NATURALINC2011,NATURALINC2012,NATURALINC2013,NATURALINC2014,NATURALINC2015,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
190,California,California,39512223,282520,66008,271383,257916,251925,254859,246582,...,-5.151429,1.276797,1.495016,1.649031,2.203551,1.984957,0.500044,-0.629909,-2.130954,-3.276681
191,California,Alameda County,1671329,10944,2726,10144,9772,9849,10248,9887,...,-6.96567,5.134201,8.497481,10.150136,11.190365,10.411032,3.989666,-0.196911,-0.945009,-1.848964
192,California,Alpine County,1129,5,-2,-3,-1,-1,-3,-7,...,31.559964,-57.675244,15.4335,16.085791,-41.666667,2.781641,-22.59887,62.094532,-20.0,32.461677
193,California,Amador County,39752,427,-5,-177,-147,-167,-138,-132,...,11.395076,-4.40149,-7.93066,-8.843796,6.680664,11.822607,14.531292,32.49164,24.867195,11.824602
194,California,Butte County,219186,2527,106,264,156,17,333,145,...,-50.048384,-1.050181,3.461542,3.493706,6.968328,4.41373,6.84023,11.052323,7.768471,-49.590123


In [10]:
# Filter to show necessary columns
census_filtered = census_ca[['State', 'County', 'Population', 'DEATHS2019']]
census_filtered.head()

Unnamed: 0,State,County,Population,DEATHS2019
190,California,California,39512223,282520
191,California,Alameda County,1671329,10944
192,California,Alpine County,1129,5
193,California,Amador County,39752,427
194,California,Butte County,219186,2527


In [11]:
# Remove 'Date' Column from CSV, not usable for analysis. Is not an actual date, is only the Month and year joined. (DHall)
del calfresh_data['Date']

calfresh_data.head()

Unnamed: 0,County,Month,Year,Unemployment Monthly,Households,Persons,Applications Recevied,Online Applications Received,EBT Cash
0,Statewide,January,2019,4.8%,1906041,3764747,165779,72917,936417639
1,Alameda,January,2019,0.3%,53529,93512,5515,2428,24484650
2,Alpine,January,2019,4.7%,70,114,4,1,27662
3,Amador,January,2019,4.9%,1462,2573,154,40,607423
4,Butte,January,2019,6.6%,16535,29634,2043,1085,7170854


In [12]:
#Filter calfreash data by 2020 and 2021 (DHall)
calfresh_data20_21 = calfresh_data[calfresh_data['Year']!=2019]

calfresh_data20_21.head()

Unnamed: 0,County,Month,Year,Unemployment Monthly,Households,Persons,Applications Recevied,Online Applications Received,EBT Cash
708,Statewide,January,2020,4.3%,2176109,4075962,187228,86463,500169011
709,Alameda,January,2020,3.1%,65302,112881,6341,2939,13678807
710,Alpine,January,2020,4.6%,77,126,7,0,14377
711,Amador,January,2020,4.7%,1496,2588,171,41,305508
712,Butte,January,2020,5.6%,17028,28648,1901,1014,3503997


In [13]:
#Export Updated CSV to Resource Folder (DHall)
calfresh_data20_21.to_csv('Resources/calfresh_data20_21.csv')

In [6]:

# Combine CalFresh, census and counties data into a single dataset (JC)


# Create pd.DataFrame for population density by county. Timeframe 1-2020 to 3-2021

# Create pd.DataFrame for CalFresh recipients by county. Timeframe 1-2020 to 3-2021

# Create pd.DataFrame for total Covid cases by county. Timeframe 1-2020 to 3-2021
# Display the data table for preview


# Summary Statistics

## Line Chart 

In [7]:
# Show CA state covid cases over months using line plot. Years on x-axis and population on y-axis
# Timeframe: Jan 2020 to March 2021


In [8]:
# Graph trends amoungst various counties on a single plot. Pick 5, at least one urban area
# Timeframe: Jan 2020 to March 2021


In [9]:
# Correlation Rise of Covid and CalFresh (maybe unemployment) vs number of people receiving CalFresh aid.

### Bar Chart

In [10]:
# Number of Covid cases in all counties. Sort decending

In [11]:
# Number of Covid cases in top 10 counties. Sort decending