## Exercise 01 : Basic operations

In [1]:
import pandas as pd

## Read the data and create a dataframe views with two columns: datetime and user

In [2]:
# Path to the raw log (relative to this notebook; an absolute path works too)
# and the names to assign to its two columns.
file_path = '../data/feed-views.log'
col_names = ['datetime', 'user']

# Load the tab-separated log. Column names are supplied explicitly, so
# header=None spells out that every line of the file is read as a data row.
views = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=col_names,
)
views

Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem
...,...,...
1071,2020-05-21 18:45:20.441142,valentina
1072,2020-05-21 23:03:06.457819,maxim
1073,2020-05-21 23:23:49.995349,pavel
1074,2020-05-21 23:49:22.386789,artem


In [3]:
# Inspect the column dtypes and non-null counts of the loaded dataframe
views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   datetime  1076 non-null   object
 1   user      1076 non-null   object
dtypes: object(2)
memory usage: 16.9+ KB


In [4]:
# Parse the 'datetime' column into real timestamps, then re-check the dtypes
views['datetime'] = pd.to_datetime(views['datetime'])
views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  1076 non-null   datetime64[ns]
 1   user      1076 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 16.9+ KB


In [5]:
# Break the timestamp down into one column per unit of time,
# using the Series .dt accessor on the 'datetime' column
views['year'] = views['datetime'].dt.year
views['month'] = views['datetime'].dt.month
views['day'] = views['datetime'].dt.day
views['hour'] = views['datetime'].dt.hour
views['minute'] = views['datetime'].dt.minute
views['second'] = views['datetime'].dt.second
views

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52
...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020,5,21,18,45,20
1072,2020-05-21 23:03:06.457819,maxim,2020,5,21,23,3,6
1073,2020-05-21 23:23:49.995349,pavel,2020,5,21,23,23,49
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22


## Create the new column daytime

In [6]:
# Add a 'daytime' column that buckets each view's hour into a part of the day.
cut_labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']

# Bin edges in hours. With right=False each interval is left-closed and
# right-open, e.g. [0, 4) is 'night' and [4, 7) is 'early morning'.
cut_bins = [0, 4, 7, 11, 17, 20, 24]

views['daytime'] = pd.cut(views['hour'], bins=cut_bins, labels=cut_labels, right=False)

# Replace the default integer index with the 'user' column.
# The guard keeps this cell re-runnable: once 'user' has become the index it
# is no longer a column, and calling set_index('user') again raises KeyError.
if 'user' in views.columns:
    views = views.set_index('user')
views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
...,...,...,...,...,...,...,...,...
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening


## Calculate the number of elements in dataframe

In [7]:
# Count the records: count() reports the number of non-null values per column
views.count()

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

In [8]:
# Count how many records fall into each time-of-day category
views['daytime'].value_counts()

evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: daytime, dtype: int64

## Sort values in dataframe by hour, minute, and second in ascending order (simultaneously and not one by one)

In [9]:
# Sort by time of day in ascending order: hour first, ties broken by minute,
# then by second (one multi-key sort, not three separate sorts)
views_sort = views.sort_values(by=['hour', 'minute', 'second'], ascending=True)
views_sort

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening
anatoliy,2020-05-09 23:53:55.599821,2020,5,9,23,53,55,evening
pavel,2020-05-09 23:54:54.260791,2020,5,9,23,54,54,evening
valentina,2020-05-14 23:58:56.754866,2020,5,14,23,58,56,evening


## Calculate the minimum and maximum for the hours and the mode for the daytime categories

In [10]:
# Latest hour among the records categorised as 'night'
max_night_hour = views.loc[views['daytime'] == 'night', 'hour'].max()
max_night_hour

3

In [11]:
# Show every record whose hour equals max_night_hour
views.loc[views['hour'] == max_night_hour]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-19 03:23:35.471598,2020,4,19,3,23,35,night
konstantin,2020-04-19 03:23:55.473926,2020,4,19,3,23,55,night
konstantin,2020-04-19 03:33:07.757714,2020,4,19,3,33,7,night


In [12]:
# Earliest hour among the records categorised as 'morning'
min_morning_hour = views.loc[views['daytime'] == 'morning', 'hour'].min()
min_morning_hour

8

In [13]:
# Show every record whose hour equals min_morning_hour
views.loc[views['hour'] == min_morning_hour]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning


In [14]:
# Find the mode (most frequent value) of the 'hour' column
views['hour'].mode()

0    22
Name: hour, dtype: int64

In [15]:
# Find the mode (most frequent value) of the 'daytime' column
views['daytime'].mode()

0    evening
Name: daytime, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening']

## Show the 3 earliest hours in the morning and the corresponding usernames and the 3 latest hours and the usernames

In [16]:
# The 3 earliest 'morning' records, ranked by hour, then minute, then second
(
    views
    .loc[views['daytime'] == 'morning']
    .nsmallest(n=3, columns=['hour', 'minute', 'second'])
)

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning
alexander,2020-05-15 09:02:24.999438,2020,5,15,9,2,24,morning


In [17]:
# The 3 latest 'morning' records, ranked by hour, then minute, then second
(
    views
    .loc[views['daytime'] == 'morning']
    .nlargest(n=3, columns=['hour', 'minute', 'second'])
)

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-18 10:57:37.331258,2020,4,18,10,57,37,morning
konstantin,2020-05-09 10:56:59.161519,2020,5,9,10,56,59,morning
maxim,2020-04-18 10:56:55.833899,2020,4,18,10,56,55,morning


## Use the method describe() to get the basic statistics for the columns

In [18]:
# Use describe() to get basic statistics and keep the result in data_describe
data_describe = views.describe()
data_describe

Unnamed: 0,year,month,day,hour,minute,second
count,1076.0,1076.0,1076.0,1076.0,1076.0,1076.0
mean,2020.0,4.870818,13.552974,16.249071,29.629182,29.500929
std,0.0,0.335557,4.906567,6.95549,17.689388,17.405506
min,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020.0,5.0,15.0,22.0,46.0,45.0
max,2020.0,5.0,30.0,23.0,59.0,59.0


In [19]:
# Interquartile range of 'hour': the 75% row minus the 25% row of the
# describe() table, stored in iqr
iqr = data_describe.loc['75%', 'hour'] - data_describe.loc['25%', 'hour']
iqr

9.0