In [140]:
import pandas as pd
import numpy as np
import os

In [141]:
OUTPUT_DIR = "../output"
INPUT_DIR = "../input"
USER_ACTIVITY = "user-activity.csv"
USER_ACTIVITIES = "user-activities.csv"

# Load User Activity Dataset

In [142]:
# Load dataset
path_dataset = os.path.join(INPUT_DIR, USER_ACTIVITY)
user_activity_dataset = pd.read_csv(path_dataset)

In [143]:
# Head dataset
user_activity_dataset.head()

Unnamed: 0,user_id,timestamp
0,621.0,0:00:02
1,818.0,0:00:10
2,4.0,0:00:21
3,563.0,0:00:21
4,671.0,0:00:26


In [144]:
# Tail dataset
user_activity_dataset.tail()

Unnamed: 0,user_id,timestamp
71991,,
71992,,
71993,,
71994,,
71995,,


# Dataset Dimensions

In [145]:
user_activity_dataset.shape

(71996, 2)

# Dataset Columns

In [146]:
user_activity_dataset.columns.tolist()

['user_id', 'timestamp']

# Remove Missing Values

In [147]:
user_activity_dataset = user_activity_dataset.dropna()

In [148]:
user_activity_dataset.shape

(61582, 2)

In [149]:
user_activity_dataset

Unnamed: 0,user_id,timestamp
0,621.0,0:00:02
1,818.0,0:00:10
2,4.0,0:00:21
3,563.0,0:00:21
4,671.0,0:00:26
...,...,...
61577,12.0,23:59:18
61578,646.0,23:59:20
61579,458.0,23:59:34
61580,968.0,23:59:53


# Sort Values By Timestamp

In [150]:
user_activity_dataset.sort_values(by=['timestamp'])

Unnamed: 0,user_id,timestamp
51518,675.0,0:00:00
10370,381.0,0:00:01
51519,155.0,0:00:01
0,621.0,0:00:02
30545,972.0,0:00:05
...,...,...
14632,136.0,9:59:54
45574,662.0,9:59:54
45575,241.0,9:59:58
45576,127.0,9:59:59


In [151]:
# Count activities by user
group_user_activity = user_activity_dataset.groupby(pd.Grouper(key='user_id', axis=0)).groups
activities = []
for user_activity in group_user_activity:
    #print("user_activity: ", user_activity)
    group_user_activity[user_activity].values
    df_user_activity = pd.DataFrame({'index': group_user_activity[user_activity].values})
    df = df_user_activity.set_index('index').join(user_activity_dataset)
    activities.append({'user_id': user_activity, 'count': df['user_id'].count()})
user_activities_dataset = pd.DataFrame(activities)
user_activities_dataset = user_activities_dataset.sort_values(by=['count'])

In [152]:
user_activities_dataset.head()

Unnamed: 0,user_id,count
940,474.0,9
689,779.0,9
258,110.0,12
920,317.0,12
837,689.0,13


In [153]:
user_activities_dataset.tail()

Unnamed: 0,user_id,count
232,490.0,208
845,586.0,220
115,343.0,1336
6,136.0,1653
393,51.0,1824


In [154]:
# User with min activities
user_activities_dataset['count'].min()

9

In [155]:
# User with max activities
user_activities_dataset['count'].max()

1824

In [156]:
user_activities_dataset['count'].max() - user_activities_dataset['count'].min()

1815

# Save Dataset User Activities

In [157]:
path_dataset = os.path.join(OUTPUT_DIR, USER_ACTIVITIES)
user_activities_dataset.to_csv(path_dataset, index=False)