## Grouping and Aggregation

In [1]:
import pandas as pd
import numpy as np

### Average amounts spent per product category, by marital status

In [2]:
# Read the tab-separated marketing campaign file into `df`, then print each
# column's name, non-null count and dtype.
df = pd.read_csv(
    "./data/marketing_campaign.csv",
    sep='\t',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,...,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,...,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,...,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


In [17]:
# Names of the six "Mnt*" columns used as the column selector in the grouped averages.
amount_bought = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
]

In [18]:
# Mean of each amount column for every marital status:
#   1. groupby(['Marital_Status']) splits df into one group per unique marital status.
#   2. [amount_bought] restricts the result to the columns listed in amount_bought.
#   3. .mean() computes the mean of each selected column within each group.
(
    df
    .groupby(['Marital_Status'])[amount_bought]
    .mean()
)


Unnamed: 0_level_0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Absurd,355.5,84.5,312.5,205.5,30.5,204.0
Alone,184.666667,4.0,26.333333,7.666667,7.0,27.0
Divorced,324.844828,27.426724,150.206897,35.043103,26.818966,46.288793
Married,299.480324,25.734954,160.681713,35.380787,26.701389,42.822917
Single,288.33125,26.835417,182.108333,38.216667,27.2625,43.729167
Together,306.825862,25.35,168.103448,38.991379,26.122414,42.994828
Widow,369.272727,33.090909,189.285714,51.38961,39.012987,56.766234
YOLO,322.0,3.0,50.0,4.0,3.0,42.0


### Grouping by multiple columns

In [20]:
# Median and mean income for every (Education, Marital_Status) combination:
#   1. groupby([...]) forms one group per unique combination of the two columns.
#   2. ['Income'] selects the column to summarise within each group.
#   3. .agg(['median', 'mean']) applies both aggregations, producing one output column for each.
(
    df
    .groupby(['Education', 'Marital_Status'])['Income']
    .agg(['median', 'mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,median,mean
Education,Marital_Status,Unnamed: 2_level_1,Unnamed: 3_level_1
2n Cycle,Divorced,49118.0,49395.130435
2n Cycle,Married,46462.5,46201.1
2n Cycle,Single,48668.5,53673.944444
2n Cycle,Together,45774.0,44736.410714
2n Cycle,Widow,47682.0,51392.2
Basic,Divorced,9548.0,9548.0
Basic,Married,22352.0,21960.5
Basic,Single,16383.0,18238.666667
Basic,Together,23179.0,21240.071429
Basic,Widow,22123.0,22123.0


### Applying a custom aggregate function

In [21]:
# Display the frame; Jupyter elides the middle rows and columns of a large frame with "...".
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,...,3,0,0,0,0,0,0,3,11,0


In [24]:
def top(df, n=5, column='NumWebPurchases'):
    """Return the ``n`` rows of ``df`` with the highest values in ``column``.

    Intended for use on a whole DataFrame or as a custom aggregate passed to
    ``groupby(...).apply``, where it is called once per group.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to select rows from.
    n : int, default 5
        How many rows to return. ``n=0`` returns an empty frame.
    column : str, default 'NumWebPurchases'
        The column whose values determine the ranking.

    Returns
    -------
    pandas.DataFrame
        The selected rows in ascending order of ``column`` (largest value last).
        When ``df`` has fewer than ``n`` rows, all of its rows are returned.
    """
    # Sort ascending so the rows with the highest values end up at the bottom.
    ranked = df.sort_values(by=column)
    # tail(n) takes the last n rows. The slice [-n:] does the same for n != 0,
    # but for n == 0 it returns the whole frame, because -0 == 0 makes it [0:].
    return ranked.tail(n)

In [25]:

# Columns to report for each selected row.
# (The list is named `visits`, but every column in it is a Num*Purchases column.)
visits = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']

# For each education level, apply top() to pick the rows ranked highest by its
# default column, 'NumWebPurchases'.
# The columns are selected BEFORE apply() so that top() only receives the data
# it needs and the grouping column is not part of the applied frame; applying
# on the full frame triggers a deprecation warning in pandas >= 2.2.
# The resulting table is the same as selecting the columns afterwards.
df.groupby(['Education'])[visits].apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2n Cycle,67,1,11,10,10
2n Cycle,1507,6,11,8,5
2n Cycle,2171,1,11,3,12
2n Cycle,1119,5,11,1,6
2n Cycle,797,3,11,4,8
Basic,1324,2,3,0,3
Basic,1220,3,3,1,3
Basic,1284,2,4,2,5
Basic,502,1,4,1,3
Basic,2013,6,11,2,8


### Filtering rows based on group statistics

In [37]:
# Plain load of the stock data file: no index column or date parsing is requested here.
# NOTE(review): the next cell re-reads the same file into `stocks` with a parsed date index,
# so this load looks redundant — confirm and consider removing it.
stocks = pd.read_csv("./data/stock_data.csv")

In [38]:
# Read the stock data file using the "Unnamed: 0" column as the index and
# parsing that index as dates, then preview the first five rows.
stocks = pd.read_csv(
    "./data/stock_data.csv",
    index_col="Unnamed: 0",
    parse_dates=True,
)
stocks.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,7.4,21.11,29.22,909.03
2003-01-03,7.45,21.14,29.24,908.59
2003-01-06,7.45,21.52,29.96,929.01
2003-01-07,7.43,21.93,28.95,922.93
2003-01-08,7.28,21.31,28.83,909.93


In [34]:
# Print the index type, column dtypes and non-null counts of the stocks frame.
stocks.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [39]:
# Returns: the fractional change of every column from one row to the next.
# The first row has no previous row to compare against, so it is all NaN;
# dropna() removes rows containing NaN, which includes that first row.
rets = (
    stocks
    .pct_change()
    .dropna()
)
rets

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.000000,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545
2003-01-08,-0.020188,-0.028272,-0.004145,-0.014086
2003-01-09,0.008242,0.029094,0.021159,0.019386
...,...,...,...,...
2011-10-10,0.051406,0.026286,0.036977,0.034125
2011-10-11,0.029526,0.002227,-0.000131,0.000544
2011-10-12,0.004747,-0.001481,0.011669,0.009795
2011-10-13,0.015515,0.008160,-0.010238,-0.002974


In [40]:
def get_year(x):
    """Return the ``year`` attribute of ``x`` (used here as a groupby key function)."""
    return x.year


# groupby(get_year) calls get_year on each index label of rets, so rows are
# grouped by the year of their index entry. Within each year every column is
# summed and the result is multiplied by 100.
# NOTE(review): this adds up the per-row returns rather than compounding them
# ((1 + r).prod() - 1) — confirm that a simple sum is what is intended.
by_year = rets.groupby(get_year).sum() * 100
by_year

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,43.584872,6.898664,18.771676,21.55207
2004,118.442218,10.185021,25.943358,9.226193
2005,87.945329,0.095162,13.837245,3.483644
2006,23.832396,16.859259,34.845801,13.26972
2007,91.965795,21.529358,24.568472,4.740316
2008,-66.939672,-47.032987,-0.880614,-40.145891
2009,96.233386,54.235841,-10.095848,24.783483
2010,46.161592,-4.332628,11.263742,13.656375
2011,29.584961,1.784256,10.90339,-0.663912
