In [1]:
import pandas as pd
import numpy as np

In [2]:
# Use fully qualified option names: the short forms 'max_columns' / 'max_rows'
# are resolved by pattern matching and become ambiguous (OptionError) on pandas
# versions that also define 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 20)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Index Object

In [3]:
college = pd.read_csv('data/college.csv')

In [4]:
columns = college.columns
columns

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [5]:
columns.values   # .values exposes the labels as the underlying 1-dimensional NumPy ndarray

array(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY',
       'RELAFFIL', 'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS',
       'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF',
       'CURROPER', 'PCTPELL', 'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10',
       'GRAD_DEBT_MDN_SUPP'], dtype=object)

In [6]:
columns[5]

'WOMENONLY'

In [7]:
columns[[1, 8, 5]]

Index(['CITY', 'SATMTMID', 'WOMENONLY'], dtype='object')

In [8]:
columns[-7:-4]

Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')

In [9]:
columns.min(), columns.max()

('CITY', 'WOMENONLY')

In [10]:
columns.isnull().sum()

0

In [11]:
columns + '_COL'

Index(['INSTNM_COL', 'CITY_COL', 'STABBR_COL', 'HBCU_COL', 'MENONLY_COL',
       'WOMENONLY_COL', 'RELAFFIL_COL', 'SATVRMID_COL', 'SATMTMID_COL',
       'DISTANCEONLY_COL', 'UGDS_COL', 'UGDS_WHITE_COL', 'UGDS_BLACK_COL',
       'UGDS_HISP_COL', 'UGDS_ASIAN_COL', 'UGDS_AIAN_COL', 'UGDS_NHPI_COL',
       'UGDS_2MOR_COL', 'UGDS_NRA_COL', 'UGDS_UNKN_COL', 'PPTUG_EF_COL',
       'CURROPER_COL', 'PCTPELL_COL', 'PCTFLOAN_COL', 'UG25ABV_COL',
       'MD_EARN_WNE_P10_COL', 'GRAD_DEBT_MDN_SUPP_COL'],
      dtype='object')

#### Index objects are immutable

In [12]:
# Index objects do not support item assignment; uncommenting the line below
# raises a TypeError instead of renaming the column.
# columns[1] = 'city'

In [13]:
c1 = columns[:4]
c1

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')

In [14]:
c2 = columns[2:6]
c2

Index(['STABBR', 'HBCU', 'MENONLY', 'WOMENONLY'], dtype='object')

In [15]:
c1.union(c2)

Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR', 'WOMENONLY'], dtype='object')

In [16]:
c1.intersection(c2)

Index(['STABBR', 'HBCU'], dtype='object')

In [17]:
c1.symmetric_difference(c2)

Index(['CITY', 'INSTNM', 'MENONLY', 'WOMENONLY'], dtype='object')

In [18]:
# Two small integer Series whose indexes contain repeated labels and are
# ordered differently, used to show how arithmetic aligns on labels.
s1 = pd.Series(np.arange(4), index=list('aaab'))
s2 = pd.Series(np.arange(6), index=list('cababb'))

In [19]:
s1, s2

(a    0
 a    1
 a    2
 b    3
 dtype: int64,
 c    0
 a    1
 b    2
 a    3
 b    4
 b    5
 dtype: int64)

In [20]:
s1 + s2

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

In [21]:
### Adding Series with identical indexes

In [22]:
# Two independent Series that share exactly the same labels in the same order.
labels = list('aaabb')
s1 = pd.Series(np.arange(len(labels)), index=labels)
s2 = pd.Series(np.arange(len(labels)), index=labels)

In [23]:
s1 + s2

a    0
a    2
a    4
b    6
b    8
dtype: int64

In [24]:
employee = pd.read_csv('data/employee.csv', index_col='RACE')
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,...,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hispanic/Latino,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,...,Female,Active,2006-06-12,2012-10-13
Hispanic/Latino,1,LIBRARY ASSISTANT,Library,26125.0,...,Female,Active,2000-07-19,2010-09-18
White,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,...,Male,Active,2015-02-03,2015-02-03
White,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,...,Male,Active,1982-02-08,1991-05-25
White,4,ELECTRICIAN,General Services Department,56347.0,...,Male,Active,1989-06-19,1994-10-22


In [25]:
employee.index.size

2000

In [26]:
# Selecting the same column twice without .copy(): check whether both names
# end up bound to one and the same Series object.
sal_1 = employee['BASE_SALARY']
sal_2 = employee['BASE_SALARY']
sal_1 is sal_2    # True here: both names refer to the same object, no copy was made

True

In [27]:
sal_1 = employee['BASE_SALARY'].copy()
sal_2 = employee['BASE_SALARY'].copy()
sal_1 is sal_2

False

#### Changing the index alignment for one Series object

In [28]:
sal_1 = sal_1.sort_index()
sal_1.head()

RACE
American Indian or Alaskan Native    78355.0
American Indian or Alaskan Native    26125.0
American Indian or Alaskan Native    98536.0
American Indian or Alaskan Native        NaN
American Indian or Alaskan Native    55461.0
Name: BASE_SALARY, dtype: float64

In [29]:
sal_2.head()

RACE
Hispanic/Latino    121862.0
Hispanic/Latino     26125.0
White               45279.0
White               63166.0
White               56347.0
Name: BASE_SALARY, dtype: float64

In [30]:
# sal_1 was sorted by index while sal_2 was not, and the index has duplicate
# labels: the indexes are not identical, so every label is matched against
# every equal label on the other side (a Cartesian product per label).
sal_add = sal_1 + sal_2
# Identical indexes on both sides: no Cartesian product, the length is unchanged.
sal_add_same = sal_1 + sal_1

In [31]:
len(sal_1),len(sal_2),len(sal_add),len(sal_add_same)

(2000, 2000, 1175424, 2000)

In [32]:
index_vc = sal_1.index.value_counts(dropna=False)  # dropna=False also counts NaN labels; by default value_counts leaves them out

In [33]:
index_vc

Black or African American            700
White                                665
Hispanic/Latino                      480
Asian/Pacific Islander               107
NaN                                   35
American Indian or Alaskan Native     11
Others                                 2
Name: RACE, dtype: int64

In [34]:
index_vc.pow(2).sum()  # each label contributes count * count rows to the Cartesian product; the total should equal len(sal_add)

1175424

### Adding Series or DataFrames with different index values

In [35]:
baseball_14 = pd.read_csv('data/baseball14.csv', index_col='playerID')
baseball_15 = pd.read_csv('data/baseball15.csv', index_col='playerID')
baseball_16 = pd.read_csv('data/baseball16.csv', index_col='playerID')

In [36]:
baseball_14.head()

Unnamed: 0_level_0,yearID,stint,teamID,lgID,...,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
altuvjo01,2014,1,HOU,AL,...,5.0,1.0,5.0,20.0
cartech02,2014,1,HOU,AL,...,5.0,0.0,4.0,12.0
castrja01,2014,1,HOU,AL,...,9.0,1.0,3.0,11.0
corpoca01,2014,1,HOU,AL,...,3.0,1.0,2.0,3.0
dominma01,2014,1,HOU,AL,...,5.0,2.0,7.0,23.0


In [37]:
baseball_14.index.difference(baseball_15.index)

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')

In [38]:
baseball_14.index.difference(baseball_16.index)

Index(['cartech02', 'corpoca01', 'dominma01', 'fowlede01', 'grossro01',
       'guzmaje01', 'hoeslj01', 'krausma01', 'preslal01', 'singljo02',
       'villajo01'],
      dtype='object', name='playerID')

In [39]:
hits_14 = baseball_14['H']
hits_15 = baseball_15['H']
hits_16 = baseball_16['H']

In [40]:
hits_14.head()

playerID
altuvjo01    225
cartech02    115
castrja01    103
corpoca01     40
dominma01    121
Name: H, dtype: int64

In [41]:
(hits_14 + hits_15).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01      NaN
corpoca01      NaN
Name: H, dtype: float64

In [42]:
hits_14.add(hits_15, fill_value=0).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01     46.0
corpoca01     40.0
Name: H, dtype: float64

In [43]:
# Sum hits across the three seasons; fill_value=0 treats a player who is
# missing from one season as having 0 hits there instead of producing NaN.
hits_14_15 = hits_14.add(hits_15, fill_value=0)
hits_total = hits_14_15.add(hits_16, fill_value=0).astype(int)
hits_total.head()

playerID
altuvjo01    641
bregmal01     53
cartech02    193
castrja01    243
congeha01     46
Name: H, dtype: int64

In [44]:
hits_total.hasnans

False

### With DataFrames

In [45]:
df_14 = baseball_14[['G','AB', 'R', 'H']]
df_15 = baseball_15[['AB', 'R', 'H', 'HR']]

In [46]:
df_14.head()

Unnamed: 0_level_0,G,AB,R,H
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,158,660,85,225
cartech02,145,507,68,115
castrja01,126,465,43,103
corpoca01,55,170,22,40
dominma01,157,564,51,121


In [47]:
df_15.head()

Unnamed: 0_level_0,AB,R,H,HR
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,638,86,200,15
cartech02,391,50,78,24
castrja01,337,38,71,11
congeha01,201,25,46,11
correca01,387,52,108,22


In [48]:
(df_14 + df_15).head(10).style.highlight_null('lightgreen')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,,425.0,,171.0
cartech02,898.0,,193.0,,118.0
castrja01,802.0,,174.0,,81.0
congeha01,,,,,
corpoca01,,,,,
correca01,,,,,
dominma01,,,,,
fowlede01,,,,,
gattiev01,,,,,
gomezca01,,,,,


In [49]:
df_14.add(df_15, fill_value=0).head(10).style.highlight_null('yellow')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,158.0,425.0,15.0,171.0
cartech02,898.0,145.0,193.0,24.0,118.0
castrja01,802.0,126.0,174.0,11.0,81.0
congeha01,201.0,,46.0,11.0,25.0
corpoca01,170.0,55.0,40.0,,22.0
correca01,387.0,,108.0,22.0,52.0
dominma01,564.0,157.0,121.0,,51.0
fowlede01,434.0,116.0,120.0,,61.0
gattiev01,566.0,,139.0,27.0,66.0
gomezca01,149.0,,36.0,4.0,19.0


In [50]:
employee = pd.read_csv('data/employee.csv')

In [51]:
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]
dept_sal.head()

Unnamed: 0,DEPARTMENT,BASE_SALARY
0,Municipal Courts Department,121862.0
1,Library,26125.0
2,Houston Police Department-HPD,45279.0
3,Houston Fire Department (HFD),63166.0
4,General Services Department,56347.0


In [52]:
dept_sal = dept_sal.sort_values(['DEPARTMENT', 'BASE_SALARY'], ascending=[True, False])

In [53]:
dept_sal.head()

Unnamed: 0,DEPARTMENT,BASE_SALARY
1494,Admn. & Regulatory Affairs,140416.0
237,Admn. & Regulatory Affairs,130416.0
1679,Admn. & Regulatory Affairs,103776.0
988,Admn. & Regulatory Affairs,72741.0
693,Admn. & Regulatory Affairs,66825.0


In [54]:
max_dept_sal = dept_sal.drop_duplicates(subset=['DEPARTMENT'])
max_dept_sal.head(5)

Unnamed: 0,DEPARTMENT,BASE_SALARY
1494,Admn. & Regulatory Affairs,140416.0
149,City Controller's Office,64251.0
236,City Council,100000.0
647,Convention and Entertainment,38397.0
1500,Dept of Neighborhoods (DON),89221.0


In [55]:
max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')

In [56]:
employee.index

Index(['Municipal Courts Department', 'Library',
       'Houston Police Department-HPD', 'Houston Fire Department (HFD)',
       'General Services Department', 'Houston Police Department-HPD',
       'Public Works & Engineering-PWE', 'Houston Airport System (HAS)',
       'Public Works & Engineering-PWE', 'Houston Airport System (HAS)',
       ...
       'Parks & Recreation', 'Houston Police Department-HPD',
       'Houston Airport System (HAS)', 'Houston Police Department-HPD',
       'Houston Police Department-HPD', 'Houston Police Department-HPD',
       'Houston Fire Department (HFD)', 'Houston Police Department-HPD',
       'Houston Police Department-HPD', 'Houston Fire Department (HFD)'],
      dtype='object', name='DEPARTMENT', length=2000)

In [57]:
max_dept_sal.index

Index(['Admn. & Regulatory Affairs', 'City Controller's Office',
       'City Council', 'Convention and Entertainment',
       'Dept of Neighborhoods (DON)', 'Finance', 'Fleet Management Department',
       'General Services Department', 'Health & Human Services',
       'Housing and Community Devp.', 'Houston Airport System (HAS)',
       'Houston Emergency Center (HEC)', 'Houston Fire Department (HFD)',
       'Houston Information Tech Svcs', 'Houston Police Department-HPD',
       'Human Resources Dept.', 'Legal Department', 'Library',
       'Mayor's Office', 'Municipal Courts Department', 'Parks & Recreation',
       'Planning & Development', 'Public Works & Engineering-PWE',
       'Solid Waste Management'],
      dtype='object', name='DEPARTMENT')

In [58]:
employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,RACE,...,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Municipal Courts Department,0,ASSISTANT DIRECTOR (EX LVL),121862.0,Hispanic/Latino,...,Active,2006-06-12,2012-10-13,121862.0
Library,1,LIBRARY ASSISTANT,26125.0,Hispanic/Latino,...,Active,2000-07-19,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE OFFICER,45279.0,White,...,Active,2015-02-03,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER/OPERATOR,63166.0,White,...,Active,1982-02-08,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,56347.0,White,...,Active,1989-06-19,1994-10-22,89194.0


In [59]:
employee.query('BASE_SALARY > MAX_DEPT_SALARY')  # sanity check: expect zero rows, since no salary can exceed its department's maximum

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,RACE,...,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


### Getting max value from each column

In [60]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.dtypes

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
                       ...   
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
Length: 26, dtype: object

In [61]:
college.MD_EARN_WNE_P10.iloc[10]

'44200'

In [62]:
college.GRAD_DEBT_MDN_SUPP.iloc[40]

'4500'

In [63]:
college.MD_EARN_WNE_P10.sort_values(ascending=False).head(5)

INSTNM
Sharon Regional Health System School of Nursing          PrivacySuppressed
P&A Scholars Beauty School                               PrivacySuppressed
Fairview Beauty Academy                                  PrivacySuppressed
Rabbi Jacob Joseph School                                PrivacySuppressed
Acupuncture and Integrative Medicine College-Berkeley    PrivacySuppressed
Name: MD_EARN_WNE_P10, dtype: object

In [64]:
# These two columns were read as strings because they contain non-numeric
# markers; errors='coerce' turns anything unparseable into NaN.
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
college[cols] = college[cols].apply(pd.to_numeric, errors='coerce')

In [65]:
college.dtypes.loc[cols]

MD_EARN_WNE_P10       float64
GRAD_DEBT_MDN_SUPP    float64
dtype: object

In [66]:
college_n = college.select_dtypes(include=[np.number])
college_n.head(3)

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,1.0,0.0,0.0,0,...,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,0.0,0.0,0.0,0,...,0.5214,0.2422,39700.0,21941.5
Amridge University,0.0,0.0,0.0,1,...,0.7795,0.854,40100.0,23370.0


In [67]:
criteria = college_n.nunique() == 2
criteria

HBCU                   True
MENONLY                True
WOMENONLY              True
RELAFFIL               True
SATVRMID              False
                      ...  
PCTPELL               False
PCTFLOAN              False
UG25ABV               False
MD_EARN_WNE_P10       False
GRAD_DEBT_MDN_SUPP    False
Length: 24, dtype: bool

In [68]:
binary_cols = college_n.columns[criteria].tolist()  # names of the columns with exactly two distinct non-null values, treated here as binary flags
binary_cols

['HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'DISTANCEONLY', 'CURROPER']

In [69]:
college_n2 = college_n.drop(labels=binary_cols, axis='columns')

In [70]:
college_n2.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,424.0,420.0,4206.0,0.0333,...,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,570.0,565.0,11383.0,0.5922,...,0.5214,0.2422,39700.0,21941.5
Amridge University,,,291.0,0.299,...,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,595.0,590.0,5451.0,0.6988,...,0.4596,0.264,45500.0,24097.0
Alabama State University,425.0,430.0,4811.0,0.0158,...,0.7554,0.127,26600.0,33118.5


### Getting the maximum of each column (`idxmax` returns the index label of the maximum, not the value)

In [71]:
max_cols = college_n2.idxmax()

In [72]:
max_cols

SATVRMID                             California Institute of Technology
SATMTMID                             California Institute of Technology
UGDS                                      University of Phoenix-Arizona
UGDS_WHITE                       Mr Leon's School of Hair Design-Moscow
UGDS_BLACK                           Velvatex College of Beauty Culture
UGDS_HISP                       Thunderbird School of Global Management
UGDS_ASIAN                          Cosmopolitan Beauty and Tech School
UGDS_AIAN                             Haskell Indian Nations University
UGDS_NHPI                                       Palau Community College
UGDS_2MOR                                                 LIU Brentwood
UGDS_NRA               California University of Management and Sciences
UGDS_UNKN             Le Cordon Bleu College of Culinary Arts-San Fr...
PPTUG_EF                        Thunderbird School of Global Management
PCTPELL                                        MTI Business Coll

In [73]:
unique_max_cols = max_cols.unique()
unique_max_cols[:5]

array(['California Institute of Technology',
       'University of Phoenix-Arizona',
       "Mr Leon's School of Hair Design-Moscow",
       'Velvatex College of Beauty Culture',
       'Thunderbird School of Global Management'], dtype=object)

In [74]:
college_n2.loc[unique_max_cols].style.highlight_max()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Cosmopolitan Beauty and Tech School,,,110.0,0.0091,0.0,0.0182,0.9727,0.0,0.0,0.0,0.0,0.0,0.3182,0.7761,0.1244,0.9545,,
Haskell Indian Nations University,430.0,440.0,805.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0224,0.8396,0.0,0.2089,22800.0,
Palau Community College,,,602.0,0.0,0.0017,0.0,0.0,0.0,0.9983,0.0,0.0,0.0,0.3887,0.856,0.0,0.2616,24700.0,
LIU Brentwood,,,15.0,0.0,0.1333,0.2667,0.0,0.0,0.0,0.5333,0.0,0.0667,0.4,0.5652,0.7826,0.7826,44600.0,25499.0
California University of Management and Sciences,,,98.0,0.0102,0.0204,0.0,0.0408,0.0,0.0,0.0,0.9286,0.0,0.0,0.0926,0.0556,0.6852,,


In [75]:
### Highlighting the maximum value of each row

In [76]:
college_ugds = college.filter(like='UGDS_').head()

In [77]:
college_ugds.head().style.highlight_max(axis='columns')

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [78]:
college_n.head()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,1.0,0.0,0.0,0,...,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,0.0,0.0,0.0,0,...,0.5214,0.2422,39700.0,21941.5
Amridge University,0.0,0.0,0.0,1,...,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,0.0,0.0,0.0,0,...,0.4596,0.264,45500.0,24097.0
Alabama State University,1.0,0.0,0.0,0,...,0.7554,0.127,26600.0,33118.5


In [79]:
college_n.max().head()

HBCU           1.0
MENONLY        1.0
WOMENONLY      1.0
RELAFFIL       1.0
SATVRMID     765.0
dtype: float64

In [80]:
college_n.eq(college_n.max()).head()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,False,False,False,...,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,...,False,False,False,False
Amridge University,False,False,False,True,...,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,...,False,False,False,False
Alabama State University,True,False,False,False,...,False,False,False,False


In [81]:
has_row_max = college_n.eq(college_n.max()).any(axis='columns')
has_row_max

INSTNM
Alabama A & M University                                  True
University of Alabama at Birmingham                       True
Amridge University                                        True
University of Alabama in Huntsville                       True
Alabama State University                                  True
                                                          ... 
SAE Institute of Technology  San Francisco                True
Rasmussen College - Overland Park                         True
National Personal Training Institute of Cleveland         True
Bay Area Medical Academy - San Jose Satellite Location    True
Excel Learning Center-San Antonio South                   True
Length: 7535, dtype: bool

In [82]:
college_n.shape

(7535, 24)

In [83]:
has_row_max.sum()

7210

In [84]:
college_n.eq(college_n.max()).cumsum()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,1,0,0,0,...,0,0,0,0
University of Alabama at Birmingham,1,0,0,0,...,0,0,0,0
Amridge University,1,0,0,1,...,0,0,0,0
University of Alabama in Huntsville,1,0,0,1,...,0,0,0,0
Alabama State University,2,0,0,1,...,0,0,0,0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,102,66,38,1435,...,55,12,1,2
Rasmussen College - Overland Park,102,66,38,1436,...,55,12,1,2
National Personal Training Institute of Cleveland,102,66,38,1437,...,55,12,1,2
Bay Area Medical Academy - San Jose Satellite Location,102,66,38,1438,...,55,12,1,2


In [85]:
college_n.eq(college_n.max()).cumsum().cumsum()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,1,0,0,0,...,0,0,0,0
University of Alabama at Birmingham,2,0,0,0,...,0,0,0,0
Amridge University,3,0,0,1,...,0,0,0,0
University of Alabama in Huntsville,4,0,0,2,...,0,0,0,0
Alabama State University,6,0,0,3,...,0,0,0,0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,555503,310361,208208,5141571,...,170273,36183,3445,10266
Rasmussen College - Overland Park,555605,310427,208246,5143007,...,170328,36195,3446,10268
National Personal Training Institute of Cleveland,555707,310493,208284,5144444,...,170383,36207,3447,10270
Bay Area Medical Academy - San Jose Satellite Location,555809,310559,208322,5145882,...,170438,36219,3448,10272


In [86]:
# Mark, for every column, only the FIRST row that reaches the column maximum.
# The first cumsum counts how many maxima have been seen so far in a column;
# the second cumsum of that running count equals 1 only on the row where the
# first maximum appears (it is 0 before and at least 2 on every later row).
is_col_max = college_n.eq(college_n.max())
first_max_only = is_col_max.cumsum().cumsum().eq(1)
has_row_max2 = first_max_only.any(axis='columns')
has_row_max2

INSTNM
Alabama A & M University                                   True
University of Alabama at Birmingham                       False
Amridge University                                         True
University of Alabama in Huntsville                       False
Alabama State University                                  False
                                                          ...  
SAE Institute of Technology  San Francisco                False
Rasmussen College - Overland Park                         False
National Personal Training Institute of Cleveland         False
Bay Area Medical Academy - San Jose Satellite Location    False
Excel Learning Center-San Antonio South                   False
Length: 7535, dtype: bool

In [87]:
has_row_max2.sum()

20

In [88]:
college_n2.shape

(7535, 18)

In [89]:
idxmax_cols = has_row_max2[has_row_max2].index

In [90]:
idxmax_cols

Index(['Alabama A & M University', 'Amridge University', 'Judson College',
       'Thunderbird School of Global Management',
       'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
       'Velvatex College of Beauty Culture',
       'California Institute of Technology',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'Dongguk University-Los Angeles',
       'Yeshiva Ohr Elchonon Chabad West Coast Talmudical Seminary',
       'Mr Leon's School of Hair Design-Moscow',
       'Haskell Indian Nations University', 'LIU Brentwood',
       'Medical College of Wisconsin', 'Palau Community College',
       'California University of Management and Sciences',
       'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
      dtype='object', name='INSTNM')

In [91]:
set(college_n.idxmax().unique()) == set(idxmax_cols)

True

In [92]:
### Finding the most common maximum

In [93]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,...,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,...,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,...,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,...,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,...,0.0006,0.0098,0.0243,0.0137


In [94]:
college_ugds.shape

(7535, 9)

In [95]:
college_ugds.idxmax()

UGDS_WHITE               Mr Leon's School of Hair Design-Moscow
UGDS_BLACK                   Velvatex College of Beauty Culture
UGDS_HISP               Thunderbird School of Global Management
UGDS_ASIAN                  Cosmopolitan Beauty and Tech School
UGDS_AIAN                     Haskell Indian Nations University
UGDS_NHPI                               Palau Community College
UGDS_2MOR                                         LIU Brentwood
UGDS_NRA       California University of Management and Sciences
UGDS_UNKN     Le Cordon Bleu College of Culinary Arts-San Fr...
dtype: object

In [96]:
highest_percent_races = college_ugds.idxmax(axis='columns')
highest_percent_races.head()

INSTNM
Alabama A & M University               UGDS_BLACK
University of Alabama at Birmingham    UGDS_WHITE
Amridge University                     UGDS_BLACK
University of Alabama in Huntsville    UGDS_WHITE
Alabama State University               UGDS_BLACK
dtype: object

In [97]:
highest_percent_races.value_counts(normalize=True)

UGDS_WHITE    0.670352
UGDS_BLACK    0.151586
UGDS_HISP     0.129473
UGDS_UNKN     0.023422
UGDS_ASIAN    0.012074
UGDS_AIAN     0.006110
UGDS_NRA      0.004073
UGDS_NHPI     0.001746
UGDS_2MOR     0.001164
dtype: float64

In [98]:
college_black = college_ugds[highest_percent_races == 'UGDS_BLACK']

In [99]:
college_black.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,...,0.0019,0.0,0.0059,0.0138
Amridge University,0.299,0.4192,0.0069,0.0034,...,0.0,0.0,0.0,0.2715
Alabama State University,0.0158,0.9208,0.0121,0.0019,...,0.0006,0.0098,0.0243,0.0137
Concordia College Alabama,0.028,0.8758,0.0373,0.0093,...,0.0,0.0031,0.0466,0.0
South University-Montgomery,0.3046,0.6054,0.0153,0.0153,...,0.0096,0.0,0.0019,0.0326


In [100]:
college_black = college_black.drop('UGDS_BLACK', axis='columns')

In [101]:
college_black.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama A & M University,0.0333,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
Amridge University,0.299,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
Alabama State University,0.0158,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137
Concordia College Alabama,0.028,0.0373,0.0093,0.0,0.0,0.0031,0.0466,0.0
South University-Montgomery,0.3046,0.0153,0.0153,0.0153,0.0096,0.0,0.0019,0.0326


In [102]:
college_black.idxmax(axis='columns')

INSTNM
Alabama A & M University                                 UGDS_WHITE
Amridge University                                       UGDS_WHITE
Alabama State University                                   UGDS_NRA
Concordia College Alabama                                  UGDS_NRA
South University-Montgomery                              UGDS_WHITE
                                                            ...    
New Horizons Medical Institute-Winder                    UGDS_WHITE
SAE Institute of Technology-Chicago                      UGDS_WHITE
Fayette Beauty Academy-Ritz Beauty Academy               UGDS_WHITE
National American University-Houston                     UGDS_WHITE
Hollywood Institute of Beauty Careers-West Palm Beach     UGDS_HISP
Length: 1042, dtype: object

In [103]:
college_black.idxmax(axis='columns').value_counts(normalize=True)

UGDS_WHITE    0.661228
UGDS_HISP     0.230326
UGDS_UNKN     0.071977
UGDS_NRA      0.018234
UGDS_ASIAN    0.009597
UGDS_2MOR     0.006718
UGDS_AIAN     0.000960
UGDS_NHPI     0.000960
dtype: float64