# Анализ данных по Ковид-19

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# загрузка данных
data = pd.read_csv('covid_19_data.csv')
data.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [5]:
data.shape

(156292, 8)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156292 entries, 0 to 156291
Data columns (total 8 columns):
SNo                156292 non-null int64
ObservationDate    156292 non-null object
Province/State     111979 non-null object
Country/Region     156292 non-null object
Last Update        156292 non-null object
Confirmed          156292 non-null float64
Deaths             156292 non-null float64
Recovered          156292 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 9.5+ MB


In [7]:
# проверка ключей
data.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

In [9]:
# удаление ненужных столбцов по ключам
# 1 способ (с присвоением в переменную) drop columns 1 and rows
# data = data.drop(['SNo', 'Last Update'], axis=1)
# 2 способ (без присвоением в переменную)
# data.drop(['SNo', 'Last Update'], axis=1, inplace=True)
data.head()

KeyError: "['SNo' 'Last Update'] not found in axis"

In [11]:
# проверка на дубликаты
data.duplicated(['ObservationDate', 'Country/Region', 'Province/State']).sum()

4

In [12]:
# удаление дубликатов
data = data.drop_duplicates(['ObservationDate', 'Country/Region', 'Province/State'])

In [13]:
data.duplicated(['ObservationDate', 'Country/Region', 'Province/State']).sum()

0

In [16]:
# изучение страны
country_array = data['Country/Region'].unique()

for country in np.sort(country_array):
    print(country)

 Azerbaijan
('St. Martin',)
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burma
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cape Verde
Cayman Islands
Central African Republic
Chad
Channel Islands
Chile
Colombia
Comoros
Congo (Brazzaville)
Congo (Kinshasa)
Costa Rica
Croatia
Cuba
Curacao
Cyprus
Czech Republic
Denmark
Diamond Princess
Djibouti
Dominica
Dominican Republic
East Timor
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France
French Guiana
Gabon
Gambia
Gambia, The
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Holy See
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Ivory Coast
Jamaic

In [17]:
# изучаем Others
# применяем маскирование
data[data['Country/Region'] == 'Others']

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
933,02/07/2020,Cruise Ship,Others,61.0,0.0,0.0
1005,02/08/2020,Cruise Ship,Others,61.0,0.0,0.0
1077,02/09/2020,Diamond Princess cruise ship,Others,64.0,0.0,0.0
1143,02/10/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1215,02/11/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1286,02/12/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1359,02/13/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1433,02/14/2020,Diamond Princess cruise ship,Others,218.0,0.0,0.0
1506,02/15/2020,Diamond Princess cruise ship,Others,285.0,0.0,0.0
1578,02/16/2020,Diamond Princess cruise ship,Others,355.0,0.0,0.0


In [18]:
# приводим в порядок даты
data['ObservationDate']

0         01/22/2020
1         01/22/2020
2         01/22/2020
3         01/22/2020
4         01/22/2020
             ...    
156287    11/15/2020
156288    11/15/2020
156289    11/15/2020
156290    11/15/2020
156291    11/15/2020
Name: ObservationDate, Length: 156288, dtype: object

In [20]:
data['ObservationDate'][0]

'01/22/2020'

In [22]:
# пример метода конвертации даты
pd.to_datetime('01/22/2020').date()

datetime.date(2020, 1, 22)

In [25]:
#  конвертация временных данных в тип timeStamp
data['ObservationDate'] = pd.to_datetime(data['ObservationDate'])
data['ObservationDate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0        2020-01-22
1        2020-01-22
2        2020-01-22
3        2020-01-22
4        2020-01-22
            ...    
156287   2020-11-15
156288   2020-11-15
156289   2020-11-15
156290   2020-11-15
156291   2020-11-15
Name: ObservationDate, Length: 156288, dtype: datetime64[ns]

In [26]:
data['ObservationDate'][0]

Timestamp('2020-01-22 00:00:00')

In [29]:
# извлечение даты из Timestamp (исключение временных данных 00:00:00)
data['Date'] = data['ObservationDate'].apply(lambda el: el.date())


In [28]:
# пример работы метода apply()

# функция
# f = lambda x: x*2

# s =pd.Series([10,20,30]).apply(f)
# s

0    20
1    40
2    60
dtype: int64

In [30]:
data['Date']

0         2020-01-22
1         2020-01-22
2         2020-01-22
3         2020-01-22
4         2020-01-22
             ...    
156287    2020-11-15
156288    2020-11-15
156289    2020-11-15
156290    2020-11-15
156291    2020-11-15
Name: Date, Length: 156288, dtype: object

In [31]:
data['Date'][0]

datetime.date(2020, 1, 22)