In [1]:
from urllib.request import urlretrieve

In [2]:
urlretrieve('https://hub.jovian.ml/wp-content/uploads/2020/09/italy-covid-daywise.csv', 
            'italy-covid-daywise.csv')

('italy-covid-daywise.csv', <http.client.HTTPMessage at 0x7f510c05dd30>)

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# df means dataframe

covid_df = pd.read_csv('italy-covid-daywise.csv')
covid_df.tail().style.background_gradient(cmap = "Greens")

Unnamed: 0,date,new_cases,new_deaths,new_tests
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,
247,2020-09-03,1326.0,6.0,


In [10]:
type(covid_df)

pandas.core.frame.DataFrame

In [3]:
covid_df

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,


In [14]:
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float64
 2   new_deaths  248 non-null    float64
 3   new_tests   135 non-null    float64
dtypes: float64(3), object(1)
memory usage: 7.9+ KB


In [15]:
covid_df.describe()

Unnamed: 0,new_cases,new_deaths,new_tests
count,248.0,248.0,135.0
mean,1094.818548,143.133065,31699.674074
std,1554.508002,227.105538,11622.209757
min,-148.0,-31.0,7841.0
25%,123.0,3.0,25259.0
50%,342.0,17.0,29545.0
75%,1371.75,175.25,37711.0
max,6557.0,971.0,95273.0


In [16]:
covid_df.columns

Index(['date', 'new_cases', 'new_deaths', 'new_tests'], dtype='object')

In [17]:
covid_df.shape

(248, 4)

In [18]:
covid_data_dict = {
    'date' : ['2020-08-30', '2020-08-31', '2020-09-01',  '2020-09-02', '2020-09-03'],
    'new_cases' : [1444, 1365, 996, 975, 1326],
    'new_deaths' : [1, 4, 6, 8, 6],
    'new_tests' : [53541, 42583, 54395, None, None]
}

In [19]:
covid_data_dict['new_cases']

[1444, 1365, 996, 975, 1326]

In [20]:
covid_df['new_cases']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64

In [21]:
# Each column is represented using a data structure called Series, which is essentially a numpy array with some
# extra methods and properties.

type(covid_df['new_cases'])

pandas.core.series.Series

In [22]:
covid_df['new_cases'][246]

975.0

Pandas also provides the .at method to directly retrieve at a specific row & column.

In [23]:
covid_df.at[246, 'new_cases']

975.0

In [24]:
covid_df.at[246, 'new_tests']

nan

In [25]:
# or 

covid_df.new_cases[246]

975.0

We can also pass a list of columns within the indexing notation [] to access a subset of the data frame with just the given columns.

In [34]:
cases_df = covid_df[['date', 'new_cases']]
cases_df.head()

Unnamed: 0,date,new_cases
0,2019-12-31,0.0
1,2020-01-01,0.0
2,2020-01-02,0.0
3,2020-01-03,0.0
4,2020-01-04,0.0


In [36]:
covid_df_copy = covid_df.copy()
covid_df_copy

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,


Note, however, that the new data frame cases_df is simply a "view" of the original data frame covid_df i.e. they both point to the same data in the computer's memory, and changing any values inside one of them will also change the respective values in the other. Sharing data between data frames makes data manipulation in Pandas blazing fast, and you needn't worry about the overhead of copying thousands or millions of rows every time you want to crate a new data frame by operating on an existing one.

In [37]:
# to access a specific row, we use .loc method

covid_df.loc[243]

date          2020-08-30
new_cases           1444
new_deaths             1
new_tests          53541
Name: 243, dtype: object

In [38]:
covid_df.iloc[243]

date          2020-08-30
new_cases           1444
new_deaths             1
new_tests          53541
Name: 243, dtype: object

In [39]:
type(covid_df.at[0, 'new_tests'])

numpy.float64

In [40]:
covid_df

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,


In [41]:
# to find the first index that doesn't contain a NaN value using first_valid_index method of a series.

covid_df.new_tests.first_valid_index()

111

In [42]:
covid_df.loc[108:113]

Unnamed: 0,date,new_cases,new_deaths,new_tests
108,2020-04-17,3786.0,525.0,
109,2020-04-18,3493.0,575.0,
110,2020-04-19,3491.0,480.0,
111,2020-04-20,3047.0,433.0,7841.0
112,2020-04-21,2256.0,454.0,28095.0
113,2020-04-22,2729.0,534.0,44248.0


In [43]:
# to get random sample from the dataframe

covid_df.sample(10)

Unnamed: 0,date,new_cases,new_deaths,new_tests
138,2020-05-17,875.0,153.0,33505.0
175,2020-06-23,221.0,23.0,23225.0
100,2020-04-09,3836.0,540.0,
4,2020-01-04,0.0,0.0,
134,2020-05-13,1402.0,172.0,37049.0
79,2020-03-19,4207.0,473.0,
34,2020-02-03,0.0,0.0,
74,2020-03-14,2547.0,252.0,
124,2020-05-03,1900.0,474.0,27047.0
177,2020-06-25,577.0,-31.0,29421.0


In [8]:
# to count NaN values in a column
covid_df.new_tests.isna().sum()

113

TypeError: 'DataFrame' object is not callable