# Importing Packages and Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
import seaborn as sb 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only input directory and print the full path of
# every file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the US confirmed-cases CSV from the input directory into `df`.
df = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_us_confirmed_cases.csv'
)


In [None]:
# Show the first five rows to get a feel for the layout of the raw table.
df.head(n=5)

## Data Cleaning

In [None]:
# Remove the row whose index label is 0.
# NOTE(review): presumably this row is a second header line rather than a
# day of counts - confirm against the CSV.
df = df.drop(index=0)

In [None]:
# The 'Province_State' column is renamed to 'Date' and its values are parsed
# into pandas datetimes so the .dt accessor can be used later on.
df = (
    df.rename(columns={'Province_State': 'Date'})
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
df.head(n=5)

In [None]:
# Reshape from wide to long: every column other than 'Date' is unpivoted, so
# its header becomes a value in 'State' and its cell becomes a value in 'Count'.
df = pd.melt(df, id_vars=["Date"], var_name="State", value_name="Count")
df.head(n=5)

In [None]:
# Add a 'Month' column holding the calendar month number taken from 'Date'.
df = df.assign(Month=df['Date'].dt.month)
df.head(n=5)

In [None]:
# Make sure 'Month' and 'Count' are numeric so they can be summed and plotted.
df = df.assign(
    Month=pd.to_numeric(df['Month']),
    Count=pd.to_numeric(df['Count']),
)

In [None]:
# Some labels in the 'State' column carry a numeric suffix such as ".1" or
# ".2" (presumably added when the CSV header repeats a state name).
# Stripping that suffix lets rows for the same state be combined.
def get_states(state: str) -> str:
    """Return the state name with any trailing ``.<digits>`` suffix removed.

    Only a purely numeric suffix after the LAST dot is dropped, so a name
    that itself contains a period (e.g. ``"St. Croix"``) is left intact
    instead of being cut at its first dot.

    Parameters
    ----------
    state : str
        A label such as ``"Alabama"`` or ``"Alabama.3"``.

    Returns
    -------
    str
        The label without the numeric duplicate suffix.
    """
    base, dot, suffix = state.rpartition(".")
    # `dot` is empty when the label has no period at all.
    if dot and suffix.isdigit():
        return base
    return state

# Derive a 'States' column by running every 'State' label through get_states.
df = df.assign(States=df['State'].apply(get_states))
df.head(n=5)


### Which state has the highest number of confirmed cases across all months? 

In [None]:
# Collapse the long table to one row per state: 'Count' is totalled and the
# state name is kept as an ordinary column alongside the group index.
aggregation_functions = dict(Count='sum', States='first')
df_new = df.groupby(df['States']).agg(aggregation_functions)
df_new.head(10)



In [None]:
# Order the per-state totals from the largest 'Count' to the smallest.
df_new_sorted = df_new.sort_values(by='Count', ascending=False)

In [None]:
# Bar chart of the ten states with the largest summed counts.
df_new_top_ten = df_new_sorted.head(10)
top_states = df_new_top_ten['States']

plt.bar(top_states, df_new_top_ten['Count'])
# One tick per state, written vertically so the names do not overlap.
plt.xticks(top_states, rotation='vertical', size=10)
plt.title('Total Confirmed Cases for the \nTop Ten States', size=16)
plt.xlabel('States')
plt.ylabel('Total Confirmed Cases')
plt.show()

### A question that arises: why do certain states have such a high number of confirmed cases? Is this due to population size or other external factors? To explore this, the case totals can be plotted alongside each state's population.

## Load in US metadata

In [None]:
# Read the US metadata CSF from the same dataset folder and preview it.
df2 = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_us_metadata.csv'
)
df2.head(n=5)

## Data Cleaning

In [None]:
# Build the same 'States' key on the metadata so it can be lined up with the
# case data. get_states is the helper already defined in the case-data
# cleaning section earlier in this notebook; it is reused here instead of
# being defined a second time, so there is a single definition to maintain.
df2['States'] = df2['Province_State'].apply(get_states)
df2.head()

In [None]:
# One row per state: total the 'Population' values and keep the state name
# as an ordinary column alongside the group index.
aggregation_functions2 = dict(Population='sum', States='first')
df2_new = df2.groupby(df2['States']).agg(aggregation_functions2)
df2_new.head(n=5)

In [None]:
# Put the per-state case totals and populations side by side. Both frames are
# indexed by state (they come from a groupby on 'States'), so concatenating
# the unsorted frames along the columns aligns their rows on that index.
df_combined = pd.concat([df_new, df2_new], axis='columns')
df_combined['Population'] = pd.to_numeric(df_combined['Population'])
# Largest case totals first.
df_combined_sorted = df_combined.sort_values(by='Count', ascending=False)
df_combined_sorted.head(n=5)

In [None]:
# Both source frames contributed a 'States' column; keep only the first
# occurrence of each column label and discard the repeats.
is_first_occurrence = ~df_combined_sorted.columns.duplicated()
df_combined_sorted = df_combined_sorted.loc[:, is_first_occurrence]
df_combined_sorted.head(n=5)

In [None]:
# Top ten states by confirmed cases (black bars, left axis) with each state's
# population overlaid as a green line on a second y-axis (right axis).
df_com_top_ten = df_combined_sorted.head(10)
States = df_com_top_ten['States']
Cases = df_com_top_ten['Count']
Population = df_com_top_ten['Population']

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()  # second y-axis that shares ax1's x-axis

ax1.bar(States, Cases, color='k')
ax2.plot(States, Population, 'g-')

ax1.set_title('Total Confirmed Cases for the \nTop Ten States', size=16)
ax1.set_xlabel('States', size=14)
ax1.set_ylabel('Number of Confirmed Cases', color='k', size=14)
ax2.set_ylabel('Population', color='g', size=12)

# Style the tick labels that the plot already generated rather than
# overwriting their text with set_xticklabels(): replacing label text without
# first fixing the tick positions makes matplotlib emit a warning and would
# mislabel the bars if the number of ticks ever differed from the labels.
ax1.tick_params(axis='x', labelrotation=90, labelsize=10)
plt.show()

### An interesting observation: Texas has a smaller population than California but still accounts for a higher number of COVID cases.

## Plot Texas confirmed cases for each month

In [None]:
# Look at Texas on its own: a histogram with 'Month' on the x-axis and
# 'Count' on the y-axis.
import plotly.express as px

# NOTE(review): 'Month' holds only the calendar month number, so rows from the
# same month of different years would fall into the same bar - confirm that
# the date range makes this the intended grouping.
is_texas = df['States'] == 'Texas'
df_texas = df.loc[is_texas]

plot = px.histogram(
    df_texas,
    x="Month",
    y="Count",
    title='Total Number of Confirmed Cases Texas',
)
plot.show()


### Further analysis can be conducted to understand the peak in cases in November. Is this due to external factors like policies or social distancing?