## Pandas GroupBy

In [1]:
import dateutil
import pandas as pd

In [2]:
# Load data from csv file
data = pd.DataFrame.from_csv('C:/Users/acer/Desktop/Python Classnotes/GroupBy/phone_data.csv')

  


In [3]:
data.head()

Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15/10/14 06:58,34.429,data,2014-11,data,data
1,15/10/14 06:58,13.0,call,2014-11,Vodafone,mobile
2,15/10/14 14:46,23.0,call,2014-11,Meteor,mobile
3,15/10/14 14:48,4.0,call,2014-11,Tesco,mobile
4,15/10/14 17:27,4.0,call,2014-11,Tesco,mobile


In [4]:
# Convert date from string to date times
data['datetime']=data.date
data['date'] = data['date'].apply(dateutil.parser.parse, dayfirst=True)
data.head()

Unnamed: 0_level_0,date,duration,item,month,network,network_type,datetime
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2014-10-15 06:58:00,34.429,data,2014-11,data,data,15/10/14 06:58
1,2014-10-15 06:58:00,13.0,call,2014-11,Vodafone,mobile,15/10/14 06:58
2,2014-10-15 14:46:00,23.0,call,2014-11,Meteor,mobile,15/10/14 14:46
3,2014-10-15 14:48:00,4.0,call,2014-11,Tesco,mobile,15/10/14 14:48
4,2014-10-15 17:27:00,4.0,call,2014-11,Tesco,mobile,15/10/14 17:27


In [5]:
# How many rows the dataset
data['item'].count()

830

In [6]:
data.shape[0]

830

In [7]:
# How many entries are there for each month?
data['month'].value_counts()

2014-11    230
2015-01    205
2014-12    157
2015-02    137
2015-03    101
Name: month, dtype: int64

In [8]:
# Number of non-null unique network entries
data['network'].nunique()

9

In [9]:
data['network'].value_counts()

Vodafone     215
Three        215
data         150
Meteor        87
Tesco         84
landline      42
voicemail     27
world          7
special        3
Name: network, dtype: int64

In [10]:
data.groupby(['month']).groups.keys()

dict_keys(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'])

In [11]:
len(data.groupby(['month']).groups['2014-11'])

230

In [12]:
# Get the first entry for each month
data.groupby('month').first()

Unnamed: 0_level_0,date,duration,item,network,network_type,datetime
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-11,2014-10-15 06:58:00,34.429,data,data,data,15/10/14 06:58
2014-12,2014-11-13 06:58:00,34.429,data,data,data,13/11/14 06:58
2015-01,2014-12-13 06:58:00,34.429,data,data,data,13/12/14 06:58
2015-02,2015-01-13 06:58:00,34.429,data,data,data,13/01/15 06:58
2015-03,2015-02-12 20:15:00,69.0,call,landline,landline,12/02/15 20:15


In [13]:
# Get the sum of the durations per month
data.groupby('month')['duration'].sum()

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

In [15]:
# Get the number of dates / entries in each month
data.groupby('month').date.count()

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
Name: date, dtype: int64

In [16]:
# What is the sum of durations, for calls only, to each network
data[data['item'] == 'call'].groupby('network')['duration'].sum()

network
Meteor        7200.0
Tesco        13828.0
Three        36464.0
Vodafone     14621.0
landline     18433.0
voicemail     1775.0
Name: duration, dtype: float64

In [17]:
# How many calls, sms, and data entries are in each month?
data.groupby(['month', 'item'])['date'].count()

month    item
2014-11  call    107
         data     29
         sms      94
2014-12  call     79
         data     30
         sms      48
2015-01  call     88
         data     31
         sms      86
2015-02  call     67
         data     31
         sms      39
2015-03  call     47
         data     29
         sms      25
Name: date, dtype: int64

In [18]:
# produces Pandas Series
data.groupby('month')['duration'].sum() 

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

In [19]:
# Produces Pandas DataFrame
data.groupby('month')[['duration']].sum()

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,26639.441
2014-12,14641.87
2015-01,18223.299
2015-02,15522.299
2015-03,22750.441


In [20]:
#The groupby output will have an index or multi-index on rows corresponding to your 
#chosen grouping variables. To avoid setting this index, pass “as_index=False” to the groupby operation.
data.groupby('month', as_index=False).agg({"duration": "sum"})

Unnamed: 0,month,duration
0,2014-11,26639.441
1,2014-12,14641.87
2,2015-01,18223.299
3,2015-02,15522.299
4,2015-03,22750.441


In [21]:
data.groupby('month').agg({"duration": "sum"})#With Index

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,26639.441
2014-12,14641.87
2015-01,18223.299
2015-02,15522.299
2015-03,22750.441


In [22]:
# Group the data frame by month and item and extract a number of stats from each group
data.groupby(
   ['month', 'item']
).agg(
    {
         'duration':sum,    # Sum duration per group
         'network_type': "count",  # get the count of networks
         'date': 'first'  # get the first date per group
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,network_type,date
month,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11,call,25547.0,107,2014-10-15 06:58:00
2014-11,data,998.441,29,2014-10-15 06:58:00
2014-11,sms,94.0,94,2014-10-16 22:18:00
2014-12,call,13561.0,79,2014-11-14 17:24:00
2014-12,data,1032.87,30,2014-11-13 06:58:00
2014-12,sms,48.0,48,2014-11-14 17:28:00
2015-01,call,17070.0,88,2014-12-15 20:03:00
2015-01,data,1067.299,31,2014-12-13 06:58:00
2015-01,sms,86.0,86,2014-12-15 19:56:00
2015-02,call,14416.0,67,2015-01-15 10:36:00


In [23]:
gb=data.groupby(['month', 'item'], as_index=False)
gb.agg(
    {
         'duration':sum,    # Sum duration per group
         'network_type': "count",  # get the count of networks
         'date': 'first'  # get the first date per group
    }
)

Unnamed: 0,month,item,duration,network_type,date
0,2014-11,call,25547.0,107,2014-10-15 06:58:00
1,2014-11,data,998.441,29,2014-10-15 06:58:00
2,2014-11,sms,94.0,94,2014-10-16 22:18:00
3,2014-12,call,13561.0,79,2014-11-14 17:24:00
4,2014-12,data,1032.87,30,2014-11-13 06:58:00
5,2014-12,sms,48.0,48,2014-11-14 17:28:00
6,2015-01,call,17070.0,88,2014-12-15 20:03:00
7,2015-01,data,1067.299,31,2014-12-13 06:58:00
8,2015-01,sms,86.0,86,2014-12-15 19:56:00
9,2015-02,call,14416.0,67,2015-01-15 10:36:00


In [29]:
# Define the aggregation procedure outside of the groupby operation
aggregations = {
    'duration':'sum',
    'date': lambda x: max(x)
}
data.groupby('month').agg(aggregations)

Unnamed: 0_level_0,duration,date
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-11,26639.441,2014-11-13 22:31:00
2014-12,14641.87,2014-12-14 19:54:00
2015-01,18223.299,2015-01-14 23:36:00
2015-02,15522.299,2015-02-12 06:58:00
2015-03,22750.441,2015-03-14 00:16:00


In [30]:
# Group the data frame by month and item and extract a number of stats from each group
data.groupby(
    ['month', 'item']
).agg(
    {
        # Find the min, max, and sum of the duration column
        'duration': [min, max, sum],
        # find the number of network type entries
        'network_type': "count",
        # minimum, first, and number of unique dates
        'date': [min, 'first', 'nunique']
    }
)


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,duration,duration,network_type,date,date,date
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,sum,count,min,first,nunique
month,item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2014-11,call,1.0,1940.0,25547.0,107,2014-10-15 06:58:00,2014-10-15 06:58:00,104
2014-11,data,34.429,34.429,998.441,29,2014-10-15 06:58:00,2014-10-15 06:58:00,29
2014-11,sms,1.0,1.0,94.0,94,2014-10-16 22:18:00,2014-10-16 22:18:00,79
2014-12,call,2.0,2120.0,13561.0,79,2014-11-14 17:24:00,2014-11-14 17:24:00,76
2014-12,data,34.429,34.429,1032.87,30,2014-11-13 06:58:00,2014-11-13 06:58:00,30
2014-12,sms,1.0,1.0,48.0,48,2014-11-14 17:28:00,2014-11-14 17:28:00,41
2015-01,call,2.0,1859.0,17070.0,88,2014-12-15 20:03:00,2014-12-15 20:03:00,84
2015-01,data,34.429,34.429,1067.299,31,2014-12-13 06:58:00,2014-12-13 06:58:00,31
2015-01,sms,1.0,1.0,86.0,86,2014-12-15 19:56:00,2014-12-15 19:56:00,58
2015-02,call,1.0,1863.0,14416.0,67,2015-01-15 10:36:00,2015-01-15 10:36:00,67


In [31]:
# Group the data frame by month and item and extract a number of stats from each group
data.groupby(
    ['month', 'item'],as_index=False
).agg(
    {
        # Find the min, max, and sum of the duration column
        'duration': [min, max, sum],
        # find the number of network type entries
        'network_type': "count",
        # minimum, first, and number of unique dates
        'date': [min, 'first', 'nunique']
    }
)


Unnamed: 0_level_0,month,item,duration,duration,duration,network_type,date,date,date
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,sum,count,min,first,nunique
0,2014-11,call,1.0,1940.0,25547.0,107,2014-10-15 06:58:00,2014-10-15 06:58:00,104
1,2014-11,data,34.429,34.429,998.441,29,2014-10-15 06:58:00,2014-10-15 06:58:00,29
2,2014-11,sms,1.0,1.0,94.0,94,2014-10-16 22:18:00,2014-10-16 22:18:00,79
3,2014-12,call,2.0,2120.0,13561.0,79,2014-11-14 17:24:00,2014-11-14 17:24:00,76
4,2014-12,data,34.429,34.429,1032.87,30,2014-11-13 06:58:00,2014-11-13 06:58:00,30
5,2014-12,sms,1.0,1.0,48.0,48,2014-11-14 17:28:00,2014-11-14 17:28:00,41
6,2015-01,call,2.0,1859.0,17070.0,88,2014-12-15 20:03:00,2014-12-15 20:03:00,84
7,2015-01,data,34.429,34.429,1067.299,31,2014-12-13 06:58:00,2014-12-13 06:58:00,31
8,2015-01,sms,1.0,1.0,86.0,86,2014-12-15 19:56:00,2014-12-15 19:56:00,58
9,2015-02,call,1.0,1863.0,14416.0,67,2015-01-15 10:36:00,2015-01-15 10:36:00,67


In [61]:
#Give name to aggregate Columns
df=data[data['item'] == 'call'].groupby('month').agg(
    {
    # Get max of the duration column for each group
    'duration': [min,sum],
    # Get min of the duration column for each group
    # Get sum of the duration column for each group
    # Apply a lambda to date column
    "date": lambda x: (max(x) - min(x)).days
    }
)

df

Unnamed: 0_level_0,duration,duration,date
Unnamed: 0_level_1,min,sum,<lambda>
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2014-11,1.0,25547.0,28
2014-12,2.0,13561.0,30
2015-01,2.0,17070.0,30
2015-02,1.0,14416.0,25
2015-03,2.0,21727.0,19


In [62]:
df.columns = df.columns.droplevel(0)
df

Unnamed: 0_level_0,min,sum,<lambda>
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-11,1.0,25547.0,28
2014-12,2.0,13561.0,30
2015-01,2.0,17070.0,30
2015-02,1.0,14416.0,25
2015-03,2.0,21727.0,19


In [65]:
#Give name to aggregate Columns
df=data[data['item'] == 'call'].groupby('month').agg(
    {
    # Get max of the duration column for each group
    'duration': [min,sum],
    # Get min of the duration column for each group
    # Get sum of the duration column for each group
    # Apply a lambda to date column
    "date": lambda x: (max(x) - min(x)).days
    }
)

df.columns = ["_".join(x) for x in df.columns.ravel()]

In [66]:
df

Unnamed: 0_level_0,duration_min,duration_sum,date_<lambda>
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-11,1.0,25547.0,28
2014-12,2.0,13561.0,30
2015-01,2.0,17070.0,30
2015-02,1.0,14416.0,25
2015-03,2.0,21727.0,19


In [68]:
grouped = data.groupby('month').agg({"duration": [min, max, sum]})
grouped.columns = grouped.columns.droplevel(level=0)
grouped.rename(columns={
    "min": "min_duration", "max": "max_duration", "sum": "sum_duration"
})
grouped.head()

Unnamed: 0_level_0,min,max,sum
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-11,1.0,1940.0,26639.441
2014-12,1.0,2120.0,14641.87
2015-01,1.0,1859.0,18223.299
2015-02,1.0,1863.0,15522.299
2015-03,1.0,10528.0,22750.441
