In [1]:
import numpy as np
import pandas as pd
import time
from datetime import timedelta
import datetime
import math

### Functions

In [2]:
# get list of users with volume unchanged for specified period. Also returns total volume of disk space used by all users in list
def get_volume_older_than(df, period):
    """period to include is in months"""
    datetime_delta = timedelta(weeks=4*period)
    oldest_date = datetime.datetime.now() - datetime_delta
    unixtime = time.mktime(oldest_date.timetuple())
    older = df[df['unix_ts'] < unixtime]
    older = older.sort_values(by='volume', ascending=False)
    total_volume_unchanged = older['volume'].sum()
    return older, total_volume_unchanged

# get the percentage use of each user
def get_top_users(df, total):
    df = df[['user', 'volume']]
    df['%'] = (df['volume']/total) * 100
    return df.round(2)

# to format volumes nicely
def convert_size(size_bytes):
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])

In [13]:
# get data from bash script output into dataframe
data = 'example.txt'
df = pd.read_csv(data, delim_whitespace=True, header=None, names=["user", "volume", "weekday", "month", "day", "time", "zone", "year", "unix_ts"])

In [14]:
older, volume = get_volume_older_than(df, 2)

In [15]:
older

Unnamed: 0,user,volume,weekday,month,day,time,zone,year,unix_ts
4,user5,7818503043769,Tue,Jun,15,15:42:54,SAST,2021,1623765000.0
16,user17,7229396370993,Mon,Oct,18,12:16:04,SAST,2021,1634552000.0
2,user3,6057724100588,Fri,May,14,19:24:15,SAST,2021,1621013000.0
17,user18,2724868657471,Tue,Oct,19,13:20:32,SAST,2021,1634642000.0
19,user20,2655842721154,Tue,Oct,26,13:59:05,SAST,2021,1635250000.0
14,user15,2072533075361,Sun,Oct,10,20:34:53,SAST,2021,1633891000.0
5,user6,1910996286933,Tue,Jun,22,17:02:07,SAST,2021,1624374000.0
1,user2,1896993787933,Fri,May,14,10:37:53,SAST,2021,1620981000.0
13,user14,313174940147,Fri,Oct,1,01:10:54,SAST,2021,1633043000.0
9,user10,237132364033,Tue,Jul,20,14:08:03,SAST,2021,1626783000.0


In [16]:
top_users = get_top_users(older, volume)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [17]:
top_users['volume'] = df['volume'].apply(lambda x: convert_size(x))

In [18]:
top_users

Unnamed: 0,user,volume,%
4,user5,7.11 TB,23.62
16,user17,6.58 TB,21.84
2,user3,5.51 TB,18.3
17,user18,2.48 TB,8.23
19,user20,2.42 TB,8.02
14,user15,1.88 TB,6.26
5,user6,1.74 TB,5.77
1,user2,1.73 TB,5.73
13,user14,291.67 GB,0.95
9,user10,220.85 GB,0.72


In [19]:
# users above one Tb
users_above_one = older[older['volume'] > 1*10**12]

In [20]:
users_above_one

Unnamed: 0,user,volume,weekday,month,day,time,zone,year,unix_ts
4,user5,7818503043769,Tue,Jun,15,15:42:54,SAST,2021,1623765000.0
16,user17,7229396370993,Mon,Oct,18,12:16:04,SAST,2021,1634552000.0
2,user3,6057724100588,Fri,May,14,19:24:15,SAST,2021,1621013000.0
17,user18,2724868657471,Tue,Oct,19,13:20:32,SAST,2021,1634642000.0
19,user20,2655842721154,Tue,Oct,26,13:59:05,SAST,2021,1635250000.0
14,user15,2072533075361,Sun,Oct,10,20:34:53,SAST,2021,1633891000.0
5,user6,1910996286933,Tue,Jun,22,17:02:07,SAST,2021,1624374000.0
1,user2,1896993787933,Fri,May,14,10:37:53,SAST,2021,1620981000.0


In [22]:
# in bytes
volume

33105939024500

In [23]:
convert_size(volume)

'30.11 TB'