In [1]:
import pandas as pd

.groupby() will create categories based on common values across 1 or more columns in the original dataframe

In [5]:
#get the data
fortune = pd.read_csv("../data/pandas/fortune1000.csv", index_col="Rank")
fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [10]:
#clean up the data and get it into the desired types and shape
fortune.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 7 columns):
Company      1000 non-null object
Sector       1000 non-null object
Industry     1000 non-null object
Location     1000 non-null object
Revenue      1000 non-null int64
Profits      1000 non-null int64
Employees    1000 non-null int64
dtypes: int64(3), object(4)
memory usage: 62.5+ KB


In [21]:
#determine how many unique values there are for Industry, Location, and Sector
print ("Total rows: ",fortune.count())
print ("Unique values")
print ("Sector:",fortune.Sector.nunique())
print ("Industry:",fortune.Industry.nunique())
print ("Location:",fortune.Location.nunique())

Total rows:  Company      1000
Sector       1000
Industry     1000
Location     1000
Revenue      1000
Profits      1000
Employees    1000
dtype: int64
Unique values
Sector: 21
Industry: 73
Location: 416


In [24]:
#Sector and Industry appear to be good candidates for category types
fortune.Sector = fortune.Sector.astype('category')
fortune.Industry = fortune.Industry.astype('category')
fortune.info() #check our data savings

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 7 columns):
Company      1000 non-null object
Sector       1000 non-null category
Industry     1000 non-null category
Location     1000 non-null object
Revenue      1000 non-null int64
Profits      1000 non-null int64
Employees    1000 non-null int64
dtypes: category(2), int64(3), object(2)
memory usage: 52.7+ KB


In [25]:
# .groupby works best on columns with a common value that is oft repeated in that column
sectors = fortune.groupby("Sector")

In [26]:
len(sectors) #returns the # of groupings instead of the # of rows, like we'd normally expect with a dataframe

21

In [28]:
#find out how big each of these groupings are in rows.  This is similar to value_counts
sectors.size()

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
dtype: int64

In [30]:
# .first() method gets the first row from each grouping.  The .last() does the opposite
sectors.first()

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Boeing,Aerospace and Defense,"Chicago, IL",96114,5176,161400
Apparel,Nike,Apparel,"Beaverton, OR",30601,3273,62600
Business Services,ManpowerGroup,Temporary Help,"Milwaukee, WI",19330,419,27000
Chemicals,Dow Chemical,Chemicals,"Midland, MI",48778,7685,49495
Energy,Exxon Mobil,Petroleum Refining,"Irving, TX",246204,16150,75600
Engineering & Construction,Fluor,"Engineering, Construction","Irving, TX",18114,413,38758
Financials,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
Food and Drug Stores,CVS Health,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
"Food, Beverages & Tobacco",Archer Daniels Midland,Food Production,"Chicago, IL",67702,1849,32300
Health Care,McKesson,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [31]:
# the 'groups' attribute shows the values associated with the key for each grouping
sectors.groups

{'Aerospace & Defense': Int64Index([ 24,  45,  60,  88, 118, 120, 209, 245, 282, 378, 389, 490, 560,
             605, 785, 788, 836, 903, 958, 987],
            dtype='int64', name='Rank'),
 'Apparel': Int64Index([91, 231, 340, 354, 448, 547, 575, 597, 683, 695, 726, 794, 877,
             882, 917],
            dtype='int64', name='Rank'),
 'Business Services': Int64Index([144, 186, 199, 204, 221, 248, 249, 294, 307, 312, 355, 392, 404,
             440, 467, 468, 481, 485, 492, 503, 545, 626, 635, 652, 677, 694,
             714, 729, 734, 735, 737, 744, 767, 776, 777, 783, 791, 792, 796,
             801, 803, 816, 819, 820, 869, 870, 886, 939, 951, 952, 993],
            dtype='int64', name='Rank'),
 'Chemicals': Int64Index([ 56, 101, 182, 189, 206, 253, 262, 277, 288, 296, 316, 538, 549,
             555, 566, 580, 613, 624, 654, 668, 717, 720, 724, 758, 761, 829,
             865, 898, 934, 949],
            dtype='int64', name='Rank'),
 'Energy': Int64Index([  2,  14,  30,  32,

In [38]:
# find the mean of the values in a specific grouping
sectors.groups['Apparel'][0].mean()

91.0

# .get_group() method
used to extract a group from a dataframegroupby object

In [43]:
#using get_group() returns a dataframe with all the rows in that group
sectors.get_group('Transportation').head()

Unnamed: 0_level_0,Company,Employees,Industry,Location,Profits,Revenue
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
48,UPS,341240,"Mail, Package, and Freight Delivery","Atlanta, GA",4844,58363
58,FedEx,323035,"Mail, Package, and Freight Delivery","Memphis, TN",1050,47453
67,American Airlines Group,118500,Airlines,"Fort Worth, TX",7610,40990
68,Delta Air Lines,82949,Airlines,"Atlanta, GA",4526,40704
80,United Continental Holdings,84000,Airlines,"Chicago, IL",7340,37864


In [44]:
#you can perform normal methods on the dataframe now that you have it
sectors.get_group('Apparel').describe()

Unnamed: 0,Employees,Profits,Revenue
count,15.0,15.0,15.0
mean,23093.133333,549.066667,6397.866667
std,21833.228179,810.479441,7254.770616
min,5978.0,82.0,2204.0
25%,9259.5,191.5,2853.0
50%,13500.0,233.0,3963.0
75%,23100.0,500.5,6676.0
max,65300.0,3273.0,30601.0


In [53]:
#use pivot_table on a groupby group
retail = sectors.get_group('Retailing')
#let's get the median profit and revenue by industry
retail.pivot_table(values=["Profits","Revenue"], index="Industry", aggfunc = "median")

Unnamed: 0_level_0,Profits,Revenue
Industry,Unnamed: 1_level_1,Unnamed: 2_level_1
"Automotive Retailing, Services",228.0,10079.5
General Merchandisers,636.5,19786.5
Specialty Retailers: Apparel,126.5,3520.5
Specialty Retailers: Other,178.5,5521.0


In [58]:
#get the total employees by industry
retail.pivot_table(values='Employees', index='Industry',aggfunc='sum')

Unnamed: 0_level_0,Employees
Industry,Unnamed: 1_level_1
"Advertising, marketing",0
Aerospace and Defense,0
Airlines,0
Apparel,0
"Automotive Retailing, Services",174974
Beverages,0
"Building Materials, Glass",0
Chemicals,0
Commercial Banks,0
Computer Peripherals,0


In [63]:
# Aggregate functions for groupby: max(), min(), count(), median(), mean(), and even describe()!
sectors.mean()

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,17897.0,1437.1,48402.85
Apparel,6397.866667,549.066667,23093.133333
Business Services,5337.156863,553.470588,26687.254902
Chemicals,8129.9,754.266667,15455.033333
Energy,12441.057377,-602.02459,9745.303279
Engineering & Construction,5922.423077,204.0,15642.615385
Financials,15950.784173,1872.007194,24172.28777
Food and Drug Stores,32251.266667,1117.266667,93026.533333
"Food, Beverages & Tobacco",12929.465116,1195.744186,28177.488372
Health Care,21529.426667,1414.853333,35710.52


# groupby() with multiple column values

In [64]:
#groupby sector and industry
sectorXind = fortune.groupby(['Sector','Industry'])

In [67]:
#see the industries specific to each sector
sectorXind.size()

Sector                      Industry                                      
Aerospace & Defense         Aerospace and Defense                             20
Apparel                     Apparel                                           15
Business Services           Advertising, marketing                             2
                            Diversified Outsourcing Services                  14
                            Education                                          3
                            Financial Data Services                           19
                            Miscellaneous                                      3
                            Temporary Help                                     5
                            Waste Management                                   5
Chemicals                   Chemicals                                         30
Energy                      Energy                                            14
                            Mining

In [70]:
sectorXind.sum().dropna() #there's a lot of "NaN", so I use dropna() to get rid of these

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Profits,Employees
Sector,Industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aerospace & Defense,Aerospace and Defense,357940.0,28742.0,968057.0
Apparel,Apparel,95968.0,8236.0,346397.0
Business Services,"Advertising, marketing",22748.0,1549.0,124100.0
Business Services,Diversified Outsourcing Services,64829.0,4305.0,708330.0
Business Services,Education,7485.0,69.0,46755.0
Business Services,Financial Data Services,100778.0,17456.0,264926.0
Business Services,Miscellaneous,11185.0,2130.0,37720.0
Business Services,Temporary Help,34716.0,1000.0,60020.0
Business Services,Waste Management,30454.0,1718.0,119199.0
Chemicals,Chemicals,243897.0,22628.0,463651.0


# .agg() method to aggregate columns in a groupby object

In [73]:
# instead of calling a method collectively, be specific about what operation to use on each column. 
#use a dict {} for the arguments
sectors.agg({"Revenue":"sum",
              "Profits": "sum",
              "Employees": "mean"})

Unnamed: 0_level_0,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aerospace & Defense,357940,28742,48402.85
Apparel,95968,8236,23093.133333
Business Services,272195,28227,26687.254902
Chemicals,243897,22628,15455.033333
Energy,1517809,-73447,9745.303279
Engineering & Construction,153983,5304,15642.615385
Financials,2217159,260209,24172.28777
Food and Drug Stores,483769,16759,93026.533333
"Food, Beverages & Tobacco",555967,51417,28177.488372
Health Care,1614707,106114,35710.52


In [74]:
#you can also give a list to .agg() to apply multiple operations to multiple columns
sectors.agg(["size","sum","mean"])

Unnamed: 0_level_0,Revenue,Revenue,Revenue,Profits,Profits,Profits,Employees,Employees,Employees
Unnamed: 0_level_1,size,sum,mean,size,sum,mean,size,sum,mean
Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Aerospace & Defense,20,357940,17897.0,20,28742,1437.1,20,968057,48402.85
Apparel,15,95968,6397.866667,15,8236,549.066667,15,346397,23093.133333
Business Services,51,272195,5337.156863,51,28227,553.470588,51,1361050,26687.254902
Chemicals,30,243897,8129.9,30,22628,754.266667,30,463651,15455.033333
Energy,122,1517809,12441.057377,122,-73447,-602.02459,122,1188927,9745.303279
Engineering & Construction,26,153983,5922.423077,26,5304,204.0,26,406708,15642.615385
Financials,139,2217159,15950.784173,139,260209,1872.007194,139,3359948,24172.28777
Food and Drug Stores,15,483769,32251.266667,15,16759,1117.266667,15,1395398,93026.533333
"Food, Beverages & Tobacco",43,555967,12929.465116,43,51417,1195.744186,43,1211632,28177.488372
Health Care,75,1614707,21529.426667,75,106114,1414.853333,75,2678289,35710.52


# iterating over groups!

In [77]:
#create a blank dataframe but with the structure of the fortune df
df = pd.DataFrame(columns= fortune.columns)
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees


In [78]:
#iterate over the df
for sector, data in sectors:
    highest_revenue_co_in_group = data.nlargest(1,"Revenue") #get the largest value for this sector, only return highest value, choose the "Revenue" column
    #now add it to my new dataframe
    df = df.append(highest_revenue_co_in_group)

In [79]:
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees
24,Boeing,Aerospace & Defense,Aerospace and Defense,"Chicago, IL",96114,5176,161400
91,Nike,Apparel,Apparel,"Beaverton, OR",30601,3273,62600
144,ManpowerGroup,Business Services,Temporary Help,"Milwaukee, WI",19330,419,27000
56,Dow Chemical,Chemicals,Chemicals,"Midland, MI",48778,7685,49495
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
155,Fluor,Engineering & Construction,"Engineering, Construction","Irving, TX",18114,413,38758
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
7,CVS Health,Food and Drug Stores,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
41,Archer Daniels Midland,"Food, Beverages & Tobacco",Food Production,"Chicago, IL",67702,1849,32300
5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [80]:
#get the highest revenue by city
cities = fortune.groupby("Location")

#create a blank df to put this into
df = pd.DataFrame(columns=fortune.columns)
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees


In [85]:
#loop over each city
for city, data in cities:
    highest_revenue_in_city = data.nlargest(1,"Revenue") #get top Revenue for each column
    df = df.append(highest_revenue_in_city)
df

Unnamed: 0,Company,Sector,Industry,Location,Revenue,Profits,Employees
138,Abbott Laboratories,Health Care,Medical Products and Equipment,"Abbott Park, IL",20661,4423,74000
169,Goodyear Tire & Rubber,Motor Vehicles & Parts,Motor Vehicles and Parts,"Akron, OH",16443,307,66000
288,Air Products & Chemicals,Chemicals,Chemicals,"Allentown, PA",9895,1278,19550
830,Benchmark Electronics,Technology,Semiconductors and Other Electronic Components,"Angleton, TX",2541,95,10500
374,Casey’s General Stores,Retailing,Specialty Retailers: Other,"Ankeny, IA",7052,181,22408
915,Domino’s Pizza,"Hotels, Resturants & Leisure",Food Services,"Ann Arbor, MI",2217,193,11900
596,Colfax,Industrials,Industrial Machinery,"Annapolis Junction, MD",3967,168,17087
215,Land O’Lakes,"Food, Beverages & Tobacco",Food Consumer Products,"Arden Hills, MN",13161,308,10000
190,AES,Energy,Utilities: Gas and Electric,"Arlington, VA",14963,306,21000
31,IBM,Technology,Information Technology Services,"Armonk, NY",82461,13190,411798
