In [1]:
import pandas as pd

## create a dataframe views with two columns: datetime and user by reading feed-views.log

In [2]:
# Load the tab-separated feed log into `views`. The file has no header row,
# so the two columns are named explicitly.
try:
    views = pd.read_csv(
        "../data/feed-views.log",
        header=None,
        names=['datetime', 'user'],
        engine="python",
        sep='\t',
    )
except IOError as e:
    # Print the error and re-raise so the cell fails visibly and no later cell
    # runs without the data.
    # NOTE(review): quit()/exit() are not a dependable way to abort inside a
    # notebook kernel, hence the re-raise — confirm for your frontend.
    print(e)
    raise

In [3]:
# Trim surrounding whitespace from the raw timestamp strings and convert the
# column to datetime64[ns]; the resulting dtypes are printed as a check.
try:
    # Vectorized string accessor instead of a per-element Python lambda.
    views['datetime'] = views['datetime'].str.strip().astype('datetime64[ns]')
    print(views.dtypes)
except ValueError as e:
    # Report the parsing problem, then re-raise so the failure is not hidden
    # from the cells that follow.
    print(f"Exception: {e}")
    raise

datetime    datetime64[ns]
user                object
dtype: object


In [4]:
# Split the timestamp into one column per component; each new column is named
# after the `.dt` attribute it is read from.
timestamps = views['datetime'].dt
for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    views[field] = getattr(timestamps, field)
views

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52
...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020,5,21,18,45,20
1072,2020-05-21 23:03:06.457819,maxim,2020,5,21,23,3,6
1073,2020-05-21 23:23:49.995349,pavel,2020,5,21,23,23,49
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22


## create the new column daytime

In [5]:
# Hour edges for the daytime buckets. Intervals are closed on the right, and
# include_lowest=True makes the first one also contain hour 0.
dt_bins = [0, 3, 6, 10, 16, 19, 23]
dt_labels = [
    'night',
    'early morning',
    'morning',
    'afternoon',
    'early evening',
    'evening',
]
views['daytime'] = pd.cut(
    views['hour'],
    bins=dt_bins,
    labels=dt_labels,
    include_lowest=True,
    ordered=False,
)
# set_index returns a new frame that is only displayed here; `views` itself
# is not reassigned.
views.set_index('user')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
...,...,...,...,...,...,...,...,...
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening


## calculate the number of elements in your dataframe

In [6]:
# Number of non-missing values in every column.
column_counts = views.count()
print(column_counts)

datetime    1076
user        1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64


In [7]:
# How many rows fall into each daytime category.
daytime_counts = views['daytime'].value_counts()
print(daytime_counts)

evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: daytime, dtype: int64


## sort values in your dataframe by hour, minute, and second in ascending order (simultaneously and not one by one)

In [8]:
# One multi-key sort: hour first, then minute, then second (ascending).
# The frame is reordered in place.
sort_keys = ['hour', 'minute', 'second']
views.sort_values(by=sort_keys, axis=0, inplace=True)

## calculate the minimum and maximum for the hours and the mode for the daytime categories

In [9]:
# Boolean mask of the rows whose daytime category is 'night'.
night_filter = views['daytime'].isin(['night'])
# The mask used to be called `evening_filter` although it selects night rows;
# the old name is kept as an alias in case other cells refer to it.
evening_filter = night_filter
max_n = views.loc[night_filter, 'hour'].max()
print(f"maximum of hour for the rows where the time of day is night: {max_n}")
# Show the first row (in the current sort order) whose hour equals that maximum.
max_n_filter = views['hour'].isin([max_n])
views[max_n_filter].head(1)

maximum of hour for the rows where the time of day is night: 3


Unnamed: 0,datetime,user,year,month,day,hour,minute,second,daytime
46,2020-04-19 03:23:35.471598,konstantin,2020,4,19,3,23,35,night


In [10]:
# Boolean mask of the rows whose daytime category is 'morning'.
morning_filter = views['daytime'] == 'morning'
min_m = views.loc[morning_filter, 'hour'].min()
print(f"minimum of hour for the rows where the time of day is morning: {min_m}")
# Show the first row (in the current sort order) whose hour equals that minimum.
min_m_filter = views['hour'] == min_m
views.loc[min_m_filter].head(1)

minimum of hour for the rows where the time of day is morning: 8


Unnamed: 0,datetime,user,year,month,day,hour,minute,second,daytime
963,2020-05-15 08:16:03.918402,alexander,2020,5,15,8,16,3,morning


In [11]:
# mode() returns a Series of the most frequent value(s); take the first entry.
hour_modes = views['hour'].mode()
mode_h = hour_modes.iloc[0]
print(f"Mode for the hour: {mode_h}")

daytime_modes = views['daytime'].mode()
mode_dt = daytime_modes.iloc[0]
print(f"Mode for the daytime: {mode_dt}")

Mode for the hour: 22
Mode for the daytime: evening


## show the 3 earliest hours in the morning and the corresponding usernames and the 3 latest hours and the usernames using nsmallest() and nlargest()

In [12]:
print("3 earliest hours in the morning and the corresponding usernames:")
# Keep only the morning rows and the two columns of interest, then take the
# three rows with the smallest hour.
morning_hours = views.loc[morning_filter, ['hour', 'user']]
morning_hours.nsmallest(3, 'hour')

3 earliest hours in the morning and the corresponding usernames:


Unnamed: 0,hour,user
963,8,alexander
964,8,alexander
965,9,alexander


In [13]:
print("3 latest hours in the morning and the corresponding usernames:")
# Keep only the morning rows and the two columns of interest, then take the
# three rows with the largest hour.
morning_hours = views.loc[morning_filter, ['hour', 'user']]
morning_hours.nlargest(3, 'hour')

3 latest hours in the morning and the corresponding usernames:


Unnamed: 0,hour,user
976,10,konstantin
756,10,maxim
757,10,maxim


## use the method describe() to get the basic statistics for the columns

In [14]:
# Compute the summary statistics once and reuse them: the interquartile range
# is the distance between the 75% and 25% quantiles reported by describe().
hour_stats = views['hour'].describe()
iqr = hour_stats['75%'] - hour_stats['25%']
print(f"the interquartile range for the hour: {iqr}")

the interquartile range for the hour: 9.0
