In [None]:
# Import all the packages

#%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sts
import numpy as np
import requests
import json

In [None]:
# Read the statewide case file into a data frame and display it

df_by_county = pd.read_csv(filepath_or_buffer="statewide_cases.csv")
df_by_county

In [None]:
# Collapse the daily rows to one row per county by taking each column's maximum.
# For "newcountconfirmed" that maximum is the highest single-day case count the county has recorded.

summary_columns = ["totalcountconfirmed", "totalcountdeaths", "newcountconfirmed"]
df_county = df_by_county.groupby("county").max().loc[:, summary_columns]
df_county

In [None]:
# Remove the "Out Of Country" and "Unassigned" rows, then turn the county index back into a column

county_df = df_county.drop(index=["Out Of Country", "Unassigned"]).reset_index()
county_df

In [None]:
# Summary statistics of the per-county data frame

county_summary = county_df.describe()
county_summary

In [None]:
# Total new confirmed cases per county over the first seven days (2020-03-18 through 2020-03-24).
# Both placeholder rows ("Out Of Country" and "Unassigned") are removed when they are present.

first_week_dates = [
    "2020-03-18",
    "2020-03-19",
    "2020-03-20",
    "2020-03-21",
    "2020-03-22",
    "2020-03-23",
    "2020-03-24",
]
first_seven_df = (
    df_by_county.loc[df_by_county["date"].isin(first_week_dates)]
    # Aggregate only the column that is needed instead of summing every column.
    .groupby("county")[["newcountconfirmed"]]
    .sum()
    .rename(columns={"newcountconfirmed": "Total cases over first 7 days"})
)
# errors="ignore": a placeholder row may not exist in this date window, and a missing
# label must not abort the cell with a KeyError.
first_seven = first_seven_df.drop(index=["Out Of Country", "Unassigned"], errors="ignore")
first_seven.head()

In [None]:
# Total new confirmed cases per county for the "last seven days" window
# (2020-11-07 through 2020-11-13), without the "Out Of Country" and "Unassigned" rows

last_week_dates = [
    "2020-11-07",
    "2020-11-08",
    "2020-11-09",
    "2020-11-10",
    "2020-11-11",
    "2020-11-12",
    "2020-11-13",
]
in_last_week = df_by_county["date"].isin(last_week_dates)
last_seven_df = (
    df_by_county.loc[in_last_week]
    .groupby("county")
    .sum()[["newcountconfirmed"]]
    .rename(columns={"newcountconfirmed": "Total cases over last 7 days"})
)
last_seven = last_seven_df.drop(index=["Out Of Country", "Unassigned"])
last_seven.head()

In [None]:
# Join the first-week and last-week totals on their shared county index so the two
# windows can be compared side by side (only counties present in both are kept)

merge_seven_days = first_seven.merge(
    last_seven,
    how="inner",
    left_index=True,
    right_index=True,
)
merge_seven_days

In [None]:
# Sum the numeric columns across all counties for each date to show how the counts change per day.
# numeric_only=True keeps the string "county" column out of the aggregation; without it,
# pandas >= 2.0 concatenates every county name into one long string per date.

per_day = df_by_county.groupby("date").sum(numeric_only=True)
per_day

In [None]:
# Line plot of each county's total confirmed cases

y_values = county_df["totalcountconfirmed"]
x_values = np.arange(len(y_values))
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(x_values, y_values)
ax.set_xlabel("California counties")
ax.set_ylabel("Total number of confirmed cases")
ax.set_title("Total number of cases of COVID-19 by counties in California")
# One tick per county, labelled with the county name
ax.set_xticks(x_values)
ax.set_xticklabels(county_df["county"], rotation=90)
plt.show()

In [None]:
# Line plot of each county's total confirmed deaths

y_values = county_df["totalcountdeaths"]
x_values = np.arange(len(y_values))
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(x_values, y_values)
ax.set(
    xlabel="California counties",
    ylabel="Total number of confirmed deaths",
    title="Total number of deaths from COVID-19 by counties in California",
)
# One tick per county, labelled with the county name
ax.set_xticks(x_values)
ax.set_xticklabels(county_df["county"], rotation=90)
plt.show()

In [None]:
# Line plot of each county's highest single-day count of new confirmed cases.
# county_df is built from a per-county .max(), so "newcountconfirmed" here is the peak
# daily value for the county, not the value for any one particular date.

x_values = np.arange(len(county_df["newcountconfirmed"]))
y_values = county_df["newcountconfirmed"]
plt.figure(figsize=(12,5))
plt.plot(x_values, y_values)
plt.xlabel("California counties")
plt.ylabel("Highest single-day new cases")
plt.title("Highest single-day new cases of COVID-19 by counties in California")
plt.xticks(x_values, county_df["county"], rotation=90)
plt.show()

In [None]:
# Bar chart of each county's total confirmed cases

x_axis = np.arange(len(county_df["totalcountconfirmed"]))
y_axis = county_df["totalcountconfirmed"]
plt.figure(figsize=(12,5))
plt.bar(x_axis, y_axis)
plt.xlabel("California counties")
plt.ylabel("Total number of confirmed cases")
plt.title("Total number of cases of COVID-19 by counties in California")
# Tick positions come from this cell's own x_axis, so the cell does not depend on a
# variable left behind by an earlier plotting cell.
plt.xticks(x_axis, county_df["county"], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of each county's total confirmed deaths

x_axis = np.arange(len(county_df["totalcountdeaths"]))
y_axis = county_df["totalcountdeaths"]
plt.figure(figsize=(12,5))
plt.bar(x_axis, y_axis)
plt.xlabel("California counties")
plt.ylabel("Total number of confirmed deaths")
plt.title("Total number of deaths from COVID-19 by counties in California")
# Tick positions come from this cell's own x_axis, so the cell does not depend on a
# variable left behind by an earlier plotting cell.
plt.xticks(x_axis, county_df["county"], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of each county's highest single-day count of new confirmed cases.
# county_df is built from a per-county .max(), so "newcountconfirmed" here is the peak
# daily value for the county, not the value for any one particular date.

x_axis = np.arange(len(county_df["newcountconfirmed"]))
y_axis = county_df["newcountconfirmed"]
plt.figure(figsize=(12,5))
plt.bar(x_axis, y_axis)
plt.xlabel("California counties")
plt.ylabel("Highest single-day new cases")
plt.title("Highest single-day new cases of COVID-19 by counties in California")
# Tick positions come from this cell's own x_axis, so the cell does not depend on a
# variable left behind by an earlier plotting cell.
plt.xticks(x_axis, county_df["county"], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Line plot comparing each county's total new cases in the first seven days (red)
# with its total in the last seven days (blue)

x_axis = np.arange(len(merge_seven_days["Total cases over first 7 days"]))
first_axis = merge_seven_days["Total cases over first 7 days"]
last_axis = merge_seven_days["Total cases over last 7 days"]
plt.figure(figsize=(12,5))
plt.plot(x_axis, first_axis, color="red", label="First 7 days")
plt.plot(x_axis, last_axis, color="blue", label="Last 7 days")
# Label the ticks from the merged frame's own county index: the merge keeps only counties
# present in both windows, so county_df["county"] can differ in length or order.
plt.xticks(x_axis, merge_seven_days.index, rotation=90)
plt.xlabel("Counties")
plt.ylabel("Total number of cases")
plt.title("Total number of cases of COVID-19 for first seven days and last seven days")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Fit a straight line of total deaths against total cases using every county (LA county included)

x_axis = county_df["totalcountconfirmed"]
y_axis = county_df["totalcountdeaths"]
regression = sts.linregress(x_axis, y_axis)
slope, intercept, r_value, p_value, std_err = regression
# Fitted deaths for each county's case count; reused when the regression line is drawn
lin_eq = intercept + slope * x_axis

In [None]:
# Scatter plot of total cases against total deaths with the fitted regression line drawn on top

x_axis = county_df["totalcountconfirmed"]
y_axis = county_df["totalcountdeaths"]
line_eq = f"y={round(slope,2)}x + {round(intercept,2)}"
fig, ax = plt.subplots()
ax.scatter(x_axis, y_axis, marker="o")
# lin_eq holds the fitted values computed by the regression cell
ax.plot(x_axis, lin_eq, "r-")
ax.annotate(line_eq, (100000, 5), fontsize=15, color="red")
ax.set_xlabel("Total Cases")
ax.set_ylabel("Total Deaths")
ax.set_title("Total Cases vs Total Deaths")
print(f"The correlation coefficient is {round(r_value,3)}.")

In [None]:
# Flag potential outlier counties by total confirmed cases using the 1.5 * IQR rule

total_cases = county_df["totalcountconfirmed"]
quartiles = total_cases.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of COVID-19 cases by county is: {lowerq}")
print(f"The upper quartile of COVID-19 cases by county is: {upperq}")
print(f"The interquartile range of COVID-19 by county cases is: {iqr}")
print(f"The median of COVID-19 cases by county is: {quartiles[0.5]}")

# Fences sit 1.5 * IQR outside the quartiles; both fences are applied so that
# lower_bound is no longer computed and then ignored.
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} or above {upper_bound} could be outliers.")
outliers = county_df.loc[(total_cases < lower_bound) | (total_cases >= upper_bound)]
outliers

In [None]:
# Fit a straight line of total deaths against total cases with Los Angeles county left out.
# NOTE(review): only Los Angeles is removed here; any other counties flagged by the
# outlier check remain in the fit.

not_los_angeles = county_df["county"] != "Los Angeles"
x_axis = county_df.loc[not_los_angeles, "totalcountconfirmed"]
y_axis = county_df.loc[not_los_angeles, "totalcountdeaths"]
regression = sts.linregress(x_axis, y_axis)
slope, intercept, r_value, p_value, std_err = regression
# Fitted deaths for each remaining county's case count; reused when the regression line is drawn
lin_eq = intercept + slope * x_axis

In [None]:
# Scatter plot of total cases against total deaths with Los Angeles county left out,
# plus the regression line fitted on the same subset

x_axis = county_df.loc[(county_df["county"] != "Los Angeles")]["totalcountconfirmed"]
y_axis = county_df.loc[(county_df["county"] != "Los Angeles")]["totalcountdeaths"]
plt.scatter(x_axis, y_axis, marker="o", color="red")
# lin_eq holds the fitted values computed by the preceding regression cell
plt.plot(x_axis,lin_eq,"b-")
line_eq = "y=" + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.annotate(line_eq,(20000,5),fontsize=15,color="blue")
plt.xlabel("Total Cases")
plt.ylabel("Total Deaths")
# The title names only Los Angeles because that is the only county the filter above removes.
plt.title("Total Cases vs Total Deaths, excluding Los Angeles")
print(f"The correlation coefficient is {round(r_value,3)}.")

In [None]:
# Line plot of the new confirmed cases recorded on each date in the data,
# with the x axis counting days from the first date

y_axis = per_day["newcountconfirmed"]
x_axis = np.arange(len(y_axis))
fig, ax = plt.subplots()
ax.plot(x_axis, y_axis)
ax.set(
    xlabel="Days since beginning of Pandemic",
    ylabel="Total new cases of COVID-19",
    title="Total number of cases of COVID-19 each day",
)
plt.show()

In [None]:
# Welch's t-test (equal_var=False) comparing the mean of the daily new cases summed over
# all rows in the data with the mean of the daily new cases for Los Angeles county alone.
# NOTE(review): population1 comes from per_day, which sums every county (Los Angeles
# included); confirm whether Los Angeles was meant to be excluded from it.

ALPHA = 0.05  # significance level used to decide whether to reject the null hypothesis

population1 = per_day["newcountconfirmed"]
print(f"The mean of the first data set is {round(population1.mean(),3)}")
population2 = df_by_county.loc[df_by_county["county"] == "Los Angeles"]["newcountconfirmed"]
print(f"The mean of the second data set is {round(population2.mean(),3)}")
print("The null hypothesis is that the means are the same")
print("The alternate hypothesis is that the means are different")
t_test_result = sts.ttest_ind(population1, population2, equal_var=False)
print(t_test_result)
# State the conclusion from the computed p-value instead of asserting it unconditionally.
if t_test_result.pvalue < ALPHA:
    print("We can conclude that the means are different")
else:
    print("We cannot conclude that the means are different")