# GroupBy

In [29]:
import pandas as pd

In [30]:
# Load the CSV, using its 'Rank' column as the row index
fortune = pd.read_csv('data/fortune1000.csv', index_col='Rank')
# Group the rows by their 'Sector' value; `sectors` is reused by the cells below
sectors = fortune.groupby(by='Sector')

In [31]:
# Preview the last five rows of the DataFrame
fortune.tail()

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
996,New York Community Bancorp,Financials,Commercial Banks,"Westbury, NY",1902,-47,3448
997,Portland General Electric,Energy,Utilities: Gas and Electric,"Portland, OR",1898,172,2646
997,Portland General Electric,Energy,Utilities: Gas and Electric,"Portland, OR",1898,172,2646
999,Wendy’s,"Hotels, Resturants & Leisure",Food Services,"Dublin, OH",1896,161,21200
1000,Briggs & Stratton,Industrials,Industrial Machinery,"Wauwatosa, WI",1895,46,5480


In [33]:
# Notice that these are two different object types: DataFrame and DataFrameGroupBy
# NOTE(review): IPython normally echoes only a cell's last expression; seeing both results
# presumably relies on ast_node_interactivity being set to 'all' — confirm
type(fortune)
type(sectors)

pandas.core.frame.DataFrame

pandas.core.groupby.DataFrameGroupBy

In [36]:
# Evaluating the DataFrameGroupBy object (sectors) by itself, without calling a method, only shows its default repr
# Think of the DataFrameGroupBy object as a collection of DataFrames, one per group
sectors

<pandas.core.groupby.DataFrameGroupBy object at 0x1057807b8>

## The `.groupby()` Method

In [38]:
len(fortune) # number of rows in the full DataFrame

1000

In [37]:
len(sectors) # number of groups in the groupby object: 21 sectors

21

In [39]:
fortune['Sector'].nunique() # count of distinct values in the 'Sector' column; matches len(sectors)

21

In [41]:
sectors.size() # number of rows in each of the 21 sectors, ordered by sector name

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
dtype: int64

In [42]:
fortune['Sector'].value_counts() # the same per-sector counts, ordered from most to least frequent

Financials                      139
Energy                          122
Technology                      102
Retailing                        80
Health Care                      75
Business Services                51
Industrials                      46
Materials                        43
Food, Beverages & Tobacco        43
Wholesalers                      40
Transportation                   36
Chemicals                        30
Household Products               28
Engineering & Construction       26
Hotels, Resturants & Leisure     25
Media                            25
Motor Vehicles & Parts           24
Aerospace & Defense              20
Food and Drug Stores             15
Telecommunications               15
Apparel                          15
Name: Sector, dtype: int64

In [43]:
sectors.first() # first entry of each column within each sector

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Boeing,Aerospace and Defense,"Chicago, IL",96114,5176,161400
Apparel,Nike,Apparel,"Beaverton, OR",30601,3273,62600
Business Services,ManpowerGroup,Temporary Help,"Milwaukee, WI",19330,419,27000
Chemicals,Dow Chemical,Chemicals,"Midland, MI",48778,7685,49495
Energy,Exxon Mobil,Petroleum Refining,"Irving, TX",246204,16150,75600
Engineering & Construction,Fluor,"Engineering, Construction","Irving, TX",18114,413,38758
Financials,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
Food and Drug Stores,CVS Health,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
"Food, Beverages & Tobacco",Archer Daniels Midland,Food Production,"Chicago, IL",67702,1849,32300
Health Care,McKesson,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [44]:
sectors.last() # last entry of each column within each sector

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Delta Tucker Holdings,Aerospace and Defense,"McLean, VA",1923,-133,12000
Apparel,Guess,Apparel,"Los Angeles, CA",2204,82,13500
Business Services,DeVry Education Group,Education,"Downers Grove, IL",1910,140,11770
Chemicals,H.B. Fuller,Chemicals,"St. Paul, MN",2084,87,4425
Energy,Portland General Electric,Utilities: Gas and Electric,"Portland, OR",1898,172,2646
Engineering & Construction,MDC Holdings,Homebuilders,"Denver, CO",1909,66,1225
Financials,New York Community Bancorp,Commercial Banks,"Westbury, NY",1902,-47,3448
Food and Drug Stores,Fred’s,Food and Drug Stores,"Memphis, TN",2151,-7,7103
"Food, Beverages & Tobacco",Alliance One International,Tobacco,"Morrisville, NC",2066,-15,6835
Health Care,Providence Service,Health Care: Pharmacy and Other Services,"Tucson, AZ",1987,84,9072


In [45]:
sectors.groups # returns a dictionary mapping each sector name to the index labels (Rank) of the rows in that group

{'Aerospace & Defense': Int64Index([ 24,  45,  60,  88, 118, 120, 209, 245, 282, 378, 389, 490, 560,
             605, 785, 788, 836, 903, 958, 987],
            dtype='int64', name='Rank'),
 'Apparel': Int64Index([91, 231, 340, 354, 448, 547, 575, 597, 683, 695, 726, 794, 877,
             882, 917],
            dtype='int64', name='Rank'),
 'Business Services': Int64Index([144, 186, 199, 204, 221, 248, 249, 294, 307, 312, 355, 392, 404,
             440, 467, 468, 481, 485, 492, 503, 545, 626, 635, 652, 677, 694,
             714, 729, 734, 735, 737, 744, 767, 776, 777, 783, 791, 792, 796,
             801, 803, 816, 819, 820, 869, 870, 886, 939, 951, 952, 993],
            dtype='int64', name='Rank'),
 'Chemicals': Int64Index([ 56, 101, 182, 189, 206, 253, 262, 277, 288, 296, 316, 538, 549,
             555, 566, 580, 613, 624, 654, 668, 717, 720, 724, 758, 761, 829,
             865, 898, 934, 949],
            dtype='int64', name='Rank'),
 'Energy': Int64Index([  2,  14,  30,  32,

In [47]:
# Rank 24 is the first label listed under 'Aerospace & Defense' in sectors.groups
fortune.loc[24]

Company                     Boeing
Sector         Aerospace & Defense
Industry     Aerospace and Defense
Location               Chicago, IL
Revenue                      96114
Profits                       5176
Employees                   161400
Name: 24, dtype: object

In [48]:
# Rank 45 is the second label listed under 'Aerospace & Defense' in sectors.groups
fortune.loc[45]

Company        United Technologies
Sector         Aerospace & Defense
Industry     Aerospace and Defense
Location            Farmington, CT
Revenue                      61047
Profits                       7608
Employees                   197200
Name: 45, dtype: object

In [50]:
# Rank 91 is the first label listed under 'Apparel' in sectors.groups
fortune.loc[91]

Company               Nike
Sector             Apparel
Industry           Apparel
Location     Beaverton, OR
Revenue              30601
Profits               3273
Employees            62600
Name: 91, dtype: object

## Retrieve a Group with the `.get_group()` Method

In [52]:
# Pull out the rows of a single group ('Apparel') as a DataFrame
sectors.get_group('Apparel')

Unnamed: 0_level_0,Company,Employees,Industry,Location,Profits,Revenue
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
91,Nike,62600,Apparel,"Beaverton, OR",3273,30601
231,VF,64000,Apparel,"Greensboro, NC",1232,12377
340,PVH,26200,Apparel,"New York, NY",572,8020
354,Ralph Lauren,20000,Apparel,"New York, NY",702,7620
448,Hanesbrands,65300,Apparel,"Winston-Salem, NC",429,5732
547,Levi Strauss,12500,Apparel,"San Francisco, CA",209,4495
575,Coach,12950,Apparel,"New York, NY",402,4192
597,Under Armour,9600,Apparel,"Baltimore, MD",233,3963
683,Fossil Group,15100,Apparel,"Richardson, TX",221,3229
695,Skechers U.S.A.,6400,Apparel,"Manhattan Beach, CA",232,3159
