In [2]:
# Load the sample sales data into a pandas DataFrame
import pandas as pd

# Location of the sample CSV, kept in one named place so it is easy to swap.
SALES_CSV_URL = 'https://raw.githubusercontent.com/subaash1112/Assured-Data-Science-FS/data/sales.csv'

# Parse the `date` column as datetimes while reading instead of leaving it as text.
df = pd.read_csv(SALES_CSV_URL, parse_dates=['date'])
print(df.head())

        date  gender      region  sales
0 2022-08-22    Male  North-West  20381
1 2022-03-05    Male  North-East  14495
2 2022-02-09    Male  North-East  13510
3 2022-06-22    Male  North-East  15983
4 2022-08-10  Female  North-West  15007


In [2]:
# Build a GroupBy object keyed on `region`.
# Printing it shows only the object's repr, not the grouped rows.
region_groups = df.groupby('region')
print(region_groups)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f4851305250>


In [3]:
# Count how many distinct groups the GroupBy object holds
region_groups = df.groupby('region')
print(region_groups.ngroups)

3


In [4]:
# Show the mapping from each group name to the row labels that belong to it
region_groups = df.groupby('region')
print(region_groups.groups)

{'North-East': [1, 2, 3, 12, 13, 14, 15, 18, 19, 23, 24, 25, 26, 28, 31, 35, 40, 41, 45, 50, 51, 58, 59, 60, 63, 64, 67, 68, 72, 75, 76, 77, 80, 82, 83, 85, 87, 92, 94, 100, 105, 108, 109, 112, 113, 114, 115, 120, 121, 125, 127, 130, 134, 138, 139, 140, 141, 142, 146, 149, 155, 157, 164, 165, 176, 177, 179, 180, 188, 191, 193, 195, 199, 201, 205, 207, 211, 216, 218, 222, 224, 228, 231, 236, 242, 243, 246, 249, 254, 256, 259, 264, 265, 271, 272, 280, 283, 287, 288, 289, ...], 'North-West': [0, 4, 7, 11, 16, 17, 21, 29, 30, 34, 36, 38, 39, 42, 43, 44, 46, 48, 49, 52, 55, 56, 62, 66, 70, 71, 74, 79, 88, 89, 90, 91, 95, 98, 99, 103, 106, 116, 117, 119, 122, 123, 128, 129, 132, 137, 143, 144, 147, 148, 151, 153, 154, 156, 160, 161, 169, 171, 175, 178, 184, 185, 186, 198, 204, 206, 209, 212, 217, 221, 223, 225, 230, 233, 234, 235, 237, 240, 244, 251, 252, 253, 255, 260, 261, 263, 266, 267, 277, 278, 281, 285, 290, 291, 296, 297, 300, 318, 320, 321, ...], 'South': [5, 6, 8, 9, 10, 20, 22, 27,

In [5]:
# Show only the group names (the keys of the `.groups` mapping)
group_index_map = df.groupby('region').groups
print(group_index_map.keys())

dict_keys(['North-East', 'North-West', 'South'])


In [6]:
# Pull out the rows of a single group ('South') as a DataFrame
region_groups = df.groupby('region')
south_rows = region_groups.get_group('South')
print(south_rows)


          date  gender region  sales
5   2022-09-06    Male  South  21792
6   2022-08-21    Male  South  20113
8   2022-11-22    Male  South  14594
9   2022-01-16  Female  South  24114
10  2022-12-21    Male  South  35154
..         ...     ...    ...    ...
972 2022-06-09    Male  South  22254
979 2022-11-24  Female  South  25591
981 2022-12-05    Male  South  34334
985 2022-12-01  Female  South  21282
994 2022-09-29    Male  South  21255

[331 rows x 4 columns]


In [7]:
# Replicating split-apply-combine Without GroupBy

# Split: select the rows of one region at a time with a boolean mask.
# Apply: take the mean of that region's sales.
# Each mean is wrapped in a one-element list so that it becomes a one-column
# row when the dictionary is turned into a DataFrame below.
averages = {
    region_name: [df.loc[df['region'] == region_name, 'sales'].mean()]
    for region_name in df['region'].unique()
}

# Combine: dictionary keys become the index, the single value fills 'Average Sales'.
aggregate_df = pd.DataFrame.from_dict(averages, orient='index', columns=['Average Sales'])
print(aggregate_df)

            Average Sales
North-West   15257.732919
North-East   17386.072046
South        24466.864048


In [8]:
# Aggregating Data with Pandas .groupby(): mean sales per region in one chain
sales_by_region = df.groupby('region')['sales']
averages = sales_by_region.mean()
print(averages)

region
North-East    17386.072046
North-West    15257.732919
South         24466.864048
Name: sales, dtype: float64


In [9]:
# Standard deviation of sales within each region
sales_by_region = df.groupby('region')['sales']
standard_deviations = sales_by_region.std()
print(standard_deviations)

region
North-East    2032.541552
North-West    3621.456493
South         5253.702513
Name: sales, dtype: float64


In [10]:
# Applying Multiple Aggregations with .agg()
import numpy as np  # no longer needed by this cell; kept in case other cells reference `np`

# Name the aggregations as strings rather than passing NumPy callables.
# pandas resolves 'mean' / 'std' / 'var' to its own GroupBy methods, and passing
# np.mean, np.std or np.var to .agg() is deprecated (FutureWarning) in pandas 2.1+.
# The resulting columns are still named mean, std and var.
aggs = df.groupby('region')['sales'].agg(['mean', 'std', 'var'])
print(aggs)

                    mean          std           var
region                                             
North-East  17386.072046  2032.541552  4.131225e+06
North-West  15257.732919  3621.456493  1.311495e+07
South       24466.864048  5253.702513  2.760139e+07


In [11]:
# Calculating each sale's share of its region's total sales
# transform('sum') returns the region total repeated on every row of that region,
# so it lines up row-for-row with df['sales'].
region_totals = df.groupby('region')['sales'].transform('sum')

# The stored value is a fraction of the region total; it is not multiplied by 100.
df['Percent Of Region Sales'] = df['sales'] / region_totals
print(df.head())

        date  gender      region  sales  Percent Of Region Sales
0 2022-08-22    Male  North-West  20381                 0.004148
1 2022-03-05    Male  North-East  14495                 0.002403
2 2022-02-09    Male  North-East  13510                 0.002239
3 2022-06-22    Male  North-East  15983                 0.002649
4 2022-08-10  Female  North-West  15007                 0.003055


In [13]:
# Transforming a DataFrame with GroupBy: rank every sale within its own region.
# ascending=False gives rank 1.0 to the largest sale of each region.
sales_by_region = df.groupby('region')['sales']
df['ranked'] = sales_by_region.rank(ascending=False)

# Show the five largest sales overall together with their in-region rank.
largest_first = df.sort_values(by='sales', ascending=False)
print(largest_first.head())

          date  gender region  sales  Percent Of Region Sales  ranked
61  2022-02-22  Female  South  43775                 0.005405     1.0
673 2022-04-19    Male  South  37878                 0.004677     2.0
111 2022-10-31  Female  South  36444                 0.004500     3.0
892 2022-09-05    Male  South  35723                 0.004411     4.0
136 2022-02-27    Male  South  35485                 0.004382     5.0


In [16]:
# Filtering Rows Where the Group's Average Sale Price is Less Than 20,000
# .filter() keeps or drops whole groups: a region's rows survive only when the
# callable returns True for that region.
# NOTE: this rebinds `df`, so every cell that runs afterwards sees only the
# rows of the regions that passed the filter.
region_groups = df.groupby('region')
df = region_groups.filter(lambda group: group['sales'].mean() < 20000)
print(df.head())

        date  gender      region  sales  Percent Of Region Sales  ranked
0 2022-08-22    Male  North-West  20381                 0.004148   297.0
1 2022-03-05    Male  North-East  14495                 0.002403    33.0
2 2022-02-09    Male  North-East  13510                 0.002239    10.0
3 2022-06-22    Male  North-East  15983                 0.002649    82.0
4 2022-08-10  Female  North-West  15007                 0.003055   148.5


In [17]:
# Grouping Data by Multiple Columns
# Passing a list of columns produces one group per (region, gender) pair and a
# MultiIndex on the result.
# numeric_only=True restricts the sum to numeric columns. `df` also holds the
# datetime `date` column, which cannot be summed: on pandas 2.x, where
# numeric_only defaults to False, a bare .sum() raises a TypeError for it.
sums = df.groupby(['region', 'gender']).sum(numeric_only=True)
print(sums.head())

                     sales  Percent Of Region Sales   ranked
region     gender                                           
North-East Female  3051132                 0.505743  29509.0
           Male    2981835                 0.494257  30869.0
North-West Female  2455899                 0.499879  25971.5
           Male    2457091                 0.500121  26031.5


In [18]:
# Ranking Sales by Region and by Gender
# Rank 1.0 is the largest sale within each (region, gender) pair.
sales_by_region_gender = df.groupby(['region', 'gender'])['sales']
df['rank'] = sales_by_region_gender.rank(ascending=False)
print(df.head())

        date  gender      region  sales  Percent Of Region Sales  ranked  \
0 2022-08-22    Male  North-West  20381                 0.004148   297.0   
1 2022-03-05    Male  North-East  14495                 0.002403    33.0   
2 2022-02-09    Male  North-East  13510                 0.002239    10.0   
3 2022-06-22    Male  North-East  15983                 0.002649    82.0   
4 2022-08-10  Female  North-West  15007                 0.003055   148.5   

    rank  
0   11.0  
1  154.0  
2  168.0  
3  138.0  
4   89.5  


In [19]:
# Using a User-Defined Function in a GroupBy Object
def group_range(x):
    """Return the spread of ``x``: its largest value minus its smallest value.

    ``x`` only needs to provide ``.max()`` and ``.min()``; in this notebook it
    is the Series of sales values belonging to one group.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# Run the user-defined function once per (region, gender) group of sales values.
sales_by_region_gender = df.groupby(['region', 'gender'])['sales']
ranges = sales_by_region_gender.apply(group_range)
print(ranges)

region      gender
North-East  Female    10881
            Male      10352
North-West  Female    20410
            Male      17469
Name: sales, dtype: int64


In [20]:
# Return the first two records of each (region, gender) group
region_gender_groups = df.groupby(['region', 'gender'])
print(region_gender_groups.head(2))

         date  gender      region  sales  Percent Of Region Sales  ranked  \
0  2022-08-22    Male  North-West  20381                 0.004148   297.0   
1  2022-03-05    Male  North-East  14495                 0.002403    33.0   
2  2022-02-09    Male  North-East  13510                 0.002239    10.0   
4  2022-08-10  Female  North-West  15007                 0.003055   148.5   
7  2022-07-08    Male  North-West  13650                 0.002778   104.0   
11 2022-04-30  Female  North-West  19631                 0.003996   292.0   
12 2022-11-25  Female  North-East  18262                 0.003027   229.0   
13 2022-08-14  Female  North-East  13733                 0.002276    16.0   

     rank  
0    11.0  
1   154.0  
2   168.0  
4    89.5  
7   112.0  
11   16.0  
12   59.0  
13  169.0  


In [21]:
# Getting the two largest sales values in each (region, gender) group
# nlargest(2) returns both the largest and the second-largest value per group.
sales_by_region_gender = df.groupby(['region', 'gender'])['sales']
print(sales_by_region_gender.nlargest(2))

region      gender     
North-East  Female  407    22545
                    561    21933
            Male    560    22361
                    442    21951
North-West  Female  758    26813
                    46     24573
            Male    844    23553
                    576    23485
Name: sales, dtype: int64


Exercise 1.3C2 - Solutions

Question 1

In [22]:
# Earliest date recorded in each region.
# `df` is whatever the earlier cells left bound to that name.
earliest_date_by_region = df.groupby('region')['date'].min()
print(earliest_date_by_region)

region
North-East   2022-01-02
North-West   2022-01-02
Name: date, dtype: datetime64[ns]


Question 2

In [23]:
# Smallest of the per-region standard deviations of sales
sales_std_by_region = df.groupby('region')['sales'].std()
print(sales_std_by_region.min())

2032.5415517362096


Question 3

In [24]:
# Last two records of each (region, gender) group
region_gender_groups = df.groupby(['region', 'gender'])
print(region_gender_groups.tail(2))

          date  gender      region  sales  Percent Of Region Sales  ranked  \
988 2022-07-10    Male  North-West  12500                 0.002544    81.0   
990 2022-07-07  Female  North-East  16468                 0.002730   113.0   
993 2022-06-11    Male  North-West  14942                 0.003041   145.0   
995 2022-06-02  Female  North-West  14015                 0.002853   116.0   
996 2022-05-20  Female  North-East  15503                 0.002570    61.0   
997 2022-04-02    Male  North-East  18714                 0.003102   262.0   
998 2022-12-07    Male  North-East  19910                 0.003300   310.0   
999 2022-12-19  Female  North-West  16589                 0.003377   198.5   

      rank  
988  129.0  
990  112.0  
993   87.0  
995  102.0  
996  145.0  
997   48.0  
998   22.0  
999   65.0  
