# EX01
## create a dataframe views with two columns: datetime and user by reading feed-views.log

In [9]:
import pandas as pd

# Read the tab-separated feed log. Column names are passed explicitly and the
# timestamp column is parsed to datetime while loading.
column_names = ['datetime', 'user']
views = pd.read_csv(
    '../data/feed-views.log',
    sep='\t',
    names=column_names,
    parse_dates=['datetime'],
)

views.head()

Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem


In [10]:
# Split the timestamp into its calendar/clock components, one column per
# component, using the pandas `.dt` accessor.
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    views[component] = getattr(views['datetime'].dt, component)

views.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52


## create the new column daytime

In [None]:
# Bin edges for the parts of the day. With right-closed intervals these edges
# give: 0-3 night, 4-6 early morning, 7-10 morning, 11-16 afternoon,
# 17-19 early evening, 20-23 evening.
bins   = [-1, 3, 6, 10, 16, 19, 23]
labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']

# Label every row with its part of the day based on the hour column.
views['daytime'] = pd.cut(
    views['hour'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True
)

# Move `user` into the index only while it is still a column, so re-running
# this cell does not raise KeyError on the second execution.
if 'user' in views.columns:
    views = views.set_index('user')

print(views[['datetime','hour','daytime']])

                            datetime  hour        daytime
user                                                     
artem     2020-04-17 12:01:08.463179    12      afternoon
artem     2020-04-17 12:01:23.743946    12      afternoon
artem     2020-04-17 12:27:30.646665    12      afternoon
artem     2020-04-17 12:35:44.884757    12      afternoon
artem     2020-04-17 12:35:52.735016    12      afternoon
...                              ...   ...            ...
valentina 2020-05-21 18:45:20.441142    18  early evening
maxim     2020-05-21 23:03:06.457819    23        evening
pavel     2020-05-21 23:23:49.995349    23        evening
artem     2020-05-21 23:49:22.386789    23        evening
artem     2020-05-22 10:36:14.662600    10        morning

[1076 rows x 3 columns]


## calculate the number of elements in your dataframe

In [12]:
# Number of non-null values in every column.
print("Count per column:", views.count(), sep="\n")

# Number of rows in the dataframe.
print("\nTotal number of rows:", len(views), sep="\n")

# Number of rows in each part-of-day category.
print("\nValue counts for 'daytime':", views['daytime'].value_counts(), sep="\n")

Count per column:
datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

Total number of rows:
1076

Value counts for 'daytime':
daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64


## sort values in your dataframe by hour, minute, and second in ascending order (simultaneously and not one by one)

In [13]:
# Sort by hour, minute and second together in a single call. The name is
# rebound to the sorted frame instead of using inplace=True, which keeps the
# step chainable and gives the same sorted result.
views = views.sort_values(by=['hour', 'minute', 'second'], ascending=True)
views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening
anatoliy,2020-05-09 23:53:55.599821,2020,5,9,23,53,55,evening
pavel,2020-05-09 23:54:54.260791,2020,5,9,23,54,54,evening
valentina,2020-05-14 23:58:56.754866,2020,5,14,23,58,56,evening


## calculate the minimum and maximum for the hours and the mode for the daytime categories

In [None]:
# Minimum and maximum of the hour column.
min_hour = views['hour'].min()
max_hour = views['hour'].max()
print(f"Hour: min = {min_hour}, max = {max_hour}")

# Most frequent daytime category; mode() returns a Series, so take its first entry.
mode_daytime = views['daytime'].mode().iloc[0]
print(f"Most frequent daytime category = {mode_daytime}")

# Latest hour among the rows labelled 'night' (computed once, reused below).
is_night = views['daytime'] == 'night'
max_night_hour = views.loc[is_night, 'hour'].max()
print(f"Max hour in 'night' = {max_night_hour}")

# Earliest hour among the rows labelled 'morning'.
min_morning_hour = views.loc[views['daytime'] == 'morning', 'hour'].min()
print(f"Min hour in 'morning' = {min_morning_hour}")

# Users active at that latest night hour; user names live in the index.
mask = is_night & (views['hour'] == max_night_hour)
users_at_max_night = views.loc[mask].index.unique()
print(f"Users at hour {max_night_hour} in 'night': {users_at_max_night.tolist()}")

# Mode of the hour column; the daytime mode was already computed above and is reused.
mode_hour = views['hour'].mode().iloc[0]
mode_daytime_again = mode_daytime
print(f"Mode of hour = {mode_hour}")
print(f"Mode of daytime = {mode_daytime_again}")

Hour: min = 0, max = 23
Most frequent daytime category = evening
Max hour in 'night' = 3
Min hour in 'morning' = 8
Users at hour 3 in 'night': ['konstantin']
Mode of hour = 22
Mode of daytime = evening


## show the 3 earliest hours in the morning and the corresponding usernames and the 3 latest hours and the usernames using nsmallest() and nlargest()

In [24]:
# Three smallest hour values among the 'morning' rows; the user names are the index.
is_morning = views['daytime'] == 'morning'
morning_early = views.loc[is_morning].nsmallest(n=3, columns='hour')
print("3 самых ранних часа утром:", morning_early[['hour']], sep="\n")

# Three largest hour values over the whole frame, again with users in the index.
latest_hours = views.nlargest(n=3, columns='hour')
print("\n3 самых поздних часа:", latest_hours[['hour']], sep="\n")

3 самых ранних часа утром:
           hour
user           
alexander     8
alexander     8
alexander     9

3 самых поздних часа:
           hour
user           
ekaterina    23
ekaterina    23
ekaterina    23


## use the method describe() to get the basic statistics for the columns

In [25]:
# Basic statistics for the columns, as produced by describe().
stats = views.describe()

# First and third quartile of `hour`, read from the describe() table.
hour_stats = stats['hour']
q1 = hour_stats['25%']
q3 = hour_stats['75%']

# Interquartile range of the hour column.
iqr = q3 - q1

# Show the full statistics table, then the IQR.
print("Базовая статистика по столбцам:", stats, sep="\n")

print("\nМежквартильный размах для 'hour':", iqr)

Базовая статистика по столбцам:
                            datetime    year        month          day  \
count                           1076  1076.0  1076.000000  1076.000000   
mean   2020-05-10 09:00:41.211420672  2020.0     4.870818    13.552974   
min       2020-04-17 12:01:08.463179  2020.0     4.000000     1.000000   
25%       2020-05-10 01:13:49.857472  2020.0     5.000000    11.000000   
50%    2020-05-11 22:48:35.302552832  2020.0     5.000000    13.000000   
75%    2020-05-14 14:44:34.749530624  2020.0     5.000000    15.000000   
max       2020-05-22 10:36:14.662600  2020.0     5.000000    30.000000   
std                              NaN     0.0     0.335557     4.906567   

              hour       minute       second  
count  1076.000000  1076.000000  1076.000000  
mean     16.249071    29.629182    29.500929  
min       0.000000     0.000000     0.000000  
25%      13.000000    14.000000    14.000000  
50%      19.000000    29.000000    30.000000  
75%      22.000000 