## CH 02: Essential DataFrame Operations

In [1]:
import pandas as pd
import numpy as np
# pd.set_option('display.max_columns', 10, 'display.max_rows', 10)

## Introduction

## Selecting Multiple DataFrame Columns

### How to do it\...

In [None]:
movies = pd.read_csv('../data/movie.csv')
movies.head()

In [None]:
# Read in the movie dataset, and pass in a list of the desired columns to the indexing operator

movie_actor_director = movies[['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']]
movie_actor_director.head()

### Selecting columns as a Series or as a DataFrame

In [None]:
# Double brackets [[...]] pass a list of column names to the indexing operator, so we get a DataFrame back.

type(movies[['director_name']])

In [None]:
# A single pair of brackets [] with just a string (the column name)
# returns a Series instead of a DataFrame:

type(movies['director_name'])

In [None]:
# Using .loc method to get a data frame.

# We can also use .loc to pull out a column by name. Because this index operation
# requires that we pass in a row selector first, we will use a colon ( :) to indicate a slice
# that selects all of the rows. This can also return either a DataFrame or a Series:


type(movies.loc[:, ['director_name']])

In [None]:
# Using .loc method to get a series.

type(movies.loc[:, 'director_name'])

### How it works\...

### There\'s more\...

In [None]:
# Passing a long list inside the indexing operator might cause readability issues. 
# To help with this, you may save all your column names to a list variable first.

cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']

movie_actor_director = movies[cols]
movie_actor_director.head()

## Selecting Columns with Methods

### How it works\...

In [None]:
# Although column selection is usually done with the indexing operator, there are some
# DataFrame methods that facilitate their selection in an alternative manner. 

# The .select_dtypes and .filter methods are two useful methods to do this.

In [None]:
# Using methods like .select_dtypes and .filter

In [None]:
# Use the .select_dtypes method to select only the integer columns:

movies.select_dtypes(include='int').head()

In [None]:
# If you would like to select all the numeric columns, you may pass the string 'number'
# to the include parameter:

movies.select_dtypes(include='number').head()

In [None]:
# If we wanted integer and string columns we could do the following:

movies.select_dtypes(include=['int', 'object']).head()

In [None]:
#  To exclude only floating-point columns, do the following:

movies.select_dtypes(exclude='float').head()

In [None]:
#  An alternative method to select columns is with the .filter method. This method
# is flexible and searches column names (or index labels) based on which parameter
# is used. Here, we use the like parameter to search for all the Facebook columns
# or the names that contain the exact string, fb. The like parameter is checking for
# substrings in column names:

movies.filter(like="fb").head()

In [None]:
# The .filter method has more tricks (or parameters) up its sleeve. If you use the
# items parameter, you can pass in a list of column names:


cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movies.filter(items=cols).head()

In [None]:
# The .filter method allows columns to be searched with regular expressions using
# the regex parameter. Here, we search for all columns that have a digit somewhere
# in their name:

movies.filter(regex=r'\d').head()

## Ordering Column Names

### How to do it\...

In [None]:
# One of the first tasks to consider after initially importing a dataset as a DataFrame is to
# analyze the order of the columns.

movies = pd.read_csv('../data/movie.csv')

In [None]:
movies.head()

In [None]:
# The purpose of this function is to shorten column names by replacing specific substrings 
# with shorter alternatives. For example, if you call shorten('facebook_likes_for_reviews'), 
# it would return 'fb' because it replaces 'facebook_likes' with 'fb' and then removes '_for_reviews'.

def shorten(col):
    """Return a shortened version of the column name ``col``.

    Every 'facebook_likes' substring becomes 'fb', and every
    '_for_reviews' substring is then removed.
    """
    abbreviated = col.replace('facebook_likes', 'fb')
    return abbreviated.replace('_for_reviews', '')

In [None]:
# movies.rename(columns=shorten) renames the columns of the DataFrame movies using the shorten function. 
# The columns parameter of the rename method is set to the shorten function, which means that each 
# column name will be passed through the shorten function.

# The shorten function performs string manipulations on each column name, replacing specific substrings.
# This operation effectively shortens the column names in the DataFrame movies according to the rules 
# specified in the shorten function. 

In [None]:
movies = movies.rename(columns=shorten)

In [None]:
movies.head()

In [None]:
movies.columns

In [None]:
# The columns don't appear to have any logical ordering to them. Organize the names
# sensibly into lists of related columns (the cat_* and cont_* groups below) so that
# the column-ordering guideline from the previous section is followed:

In [None]:
# Lists of related column names; they are concatenated later to define the
# new column order.
# NOTE(review): the cat_/cont_ prefixes presumably mean categorical and
# continuous columns -- TODO confirm.
cat_core = [
    'movie_title', 'title_year', 'content_rating', 'genres',
]
cat_people = [
    'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
]
cat_other = [
    'color', 'country', 'language', 'plot_keywords', 'movie_imdb_link',
]
cont_fb = [
    'director_fb', 'actor_1_fb', 'actor_2_fb', 'actor_3_fb',
    'cast_total_fb', 'movie_fb',
]
cont_finance = ['budget', 'gross']
cont_num_reviews = [
    'num_voted_users', 'num_user', 'num_critic',
]
cont_other = [
    'imdb_score', 'duration', 'aspect_ratio', 'facenumber_in_poster',
]

In [None]:
# Concatenate the column groups in the desired display order, then check that
# the result holds exactly the same set of names as the DataFrame's columns
# (True means no column was left out or added).
new_col_order = (
    cat_core + cat_people + cat_other
    + cont_fb + cont_finance + cont_num_reviews
    + cont_other
)
set(movies.columns) == set(new_col_order)

In [None]:
movies.head()

In [None]:
# Reorder the columns with the combined list. Keep every row here: the
# following cell calls .head() to preview the result, so truncating at
# assignment would leave movies_df holding only five rows.
movies_df = movies[new_col_order]

In [None]:
movies_df.head()

## Summarizing a DataFrame

### How to do it\...

In [None]:
# Read in the movie dataset, and examine the basic descriptive properties, .shape,
# .size, and .ndim, along with running the len function:

In [None]:
movies = pd.read_csv('../data/movie.csv')
movies.shape

In [None]:
movies.size

In [None]:
movies.ndim

In [None]:
len(movies)

In [None]:
# The .count method shows the number of non-missing values for each column. It is
# an aggregation method as it summarizes every column in a single value. The output
# is a Series that has the original column names as its index:

movies.count()

In [None]:
#  The other methods that compute summary statistics, .min, .max, .mean, .median,
# and .std, return Series that have the column names of the numeric columns in the
# index and their aggregations as the values:

In [None]:
movies.describe().T

In [None]:
#  It is possible to specify exact quantiles in the .describe method using the percentiles parameter:

movies.describe(percentiles=[.01, .3, .99]).T

## Chaining DataFrame Methods

In [None]:
movies = pd.read_csv('../data/movie.csv')

In [None]:
def shorten(col):
    """Return a shortened version of the column name ``col``.

    Every 'facebook_likes' substring becomes 'fb', and every
    '_for_reviews' substring is then removed.
    """
    abbreviated = col.replace('facebook_likes', 'fb')
    return abbreviated.replace('_for_reviews', '')

In [None]:
movies = movies.rename(columns=shorten)

In [None]:
movies.isnull().head()

In [None]:
#  We will chain the .sum method that interprets True and False as 1 and 0, 
#  respectively. Because this is a reduction method, it aggregates the results
#  into a Series:


(movies
   .isnull()
   .sum()
   .head()
)

In [None]:
# We can go one step further and take the sum of this Series and return the count
# of the total number of missing values in the entire DataFrame as a scalar value:

movies.isnull().sum().sum()

In [None]:
# A way to determine whether there are any missing values in the DataFrame is to use
# the .any method twice in succession:

movies.isnull().any().any()

## DataFrame Operations

In [None]:
colleges = pd.read_csv('../data/college.csv')

In [None]:
colleges.head()

In [None]:
# This will not work: adding a number to the whole DataFrame fails when it
# is not homogeneous numeric data (presumably because of the string columns
# in the college dataset -- TODO confirm). The error is the point of the
# demonstration, so catch it and display it instead of letting it stop a
# "Restart & Run All".

try:
    colleges + 5
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

In [None]:
# To successfully use an operator with a DataFrame, first select homogeneous data. For this
# recipe, we will select all the columns that begin with 'UGDS_'. These columns represent the
# fraction of undergraduate students by race. 

# To get started, we import the data and use the institution name as the label for our index, and 
# then select the columns we desire with the .filter method

colleges = pd.read_csv('../data/college.csv', index_col='INSTNM')
college_ugds = colleges.filter(like='UGDS_')
college_ugds.head()

In [None]:
# pandas uses banker's rounding: numbers that are exactly halfway between two
# values are rounded to the even side. Look at what happens to the UGDS_BLACK row of
# this Series when we round it to two decimal places:

In [None]:
name = 'Northwest-Shoals Community College'
college_ugds.loc[name]

In [None]:
college_ugds.loc[name].round(2)

In [None]:
# If we add .0001 before rounding, it changes to rounding up:

(college_ugds.loc[name] + .0001).round(2)

In [None]:
# Let's do this to the DataFrame. To begin our rounding adventure with operators,
# we will first add .00501 to each value of college_ugds:

college_ugds + .00501

In [None]:
# Use the floor division operator, //, to round down to the nearest whole number percentage:

(college_ugds + .00501) // .01

In [None]:
# To complete the rounding exercise, divide by 100:

college_ugds_op_round = (college_ugds + .00501) // .01 / 100
college_ugds_op_round.head()

In [None]:
college_ugds_round = (college_ugds + .00001).round(2)
college_ugds_round

In [None]:
college_ugds_op_round.equals(college_ugds_round)

## Comparing Missing Values

In [None]:
# pandas uses the NumPy NaN (np.nan) object to represent a missing value. This is an
# unusual object and has interesting mathematical properties. For instance, it is not equal to
# itself. Even Python's None object evaluates as True when compared to itself:

In [None]:
np.nan == np.nan

In [None]:
None == None

In [None]:
np.nan > 5

In [None]:
5 > np.nan

In [None]:
np.nan != 5

### Getting ready

In [None]:
college = pd.read_csv('../data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [None]:
college_ugds == .0019

In [None]:
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

In [None]:
college_self_compare.all()

In [None]:
(college_ugds == np.nan).sum()

In [None]:
college_ugds.isnull().sum()

In [None]:
college_ugds.equals(college_ugds)

### How it works\...

### There\'s more\...

In [None]:
college_ugds.eq(.0019)    # same as college_ugds == .0019

In [None]:
from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds, college_ugds) is None

## Transposing the direction of a DataFrame operation

### How to do it\...

In [None]:
# Many DataFrame methods have an axis parameter. This parameter controls the direction
# in which the operation takes place. Axis parameters can be 'index' (or 0) or 'columns'
# (or 1). 

In [None]:
# Read in the college dataset; the columns that begin with UGDS represent the
# percentage of the undergraduate students of a particular race. 
# Use the filter method to select these columns:

In [2]:
college = pd.read_csv('../data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [None]:
# Now that the DataFrame contains homogeneous column data, operations can be
# sensibly done both vertically and horizontally. 

# The .count method returns the number of non-missing values. 
# By default, its axis parameter is set to 0:

In [3]:
college_ugds.count()

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [None]:
# The axis parameter almost always defaults to 0. So, the above step is equivalent to both
# college_ugds.count(axis=0) and college_ugds.count(axis='index').

In [4]:
# Changing the axis parameter to 'columns' changes the direction of the operation
# so that we get back a count of non-missing items in each row:

college_ugds.count(axis='columns').head()

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [7]:
# Instead of counting non-missing values, we can sum all the values in each row. Each
# row of percentages should add up to 1. 

# The .sum method may be used to verify this:

college_ugds.sum(axis='columns').head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [6]:
# To get an idea of the distribution of each column, the .median method can be used:

college_ugds.median(axis='index')

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

In [8]:
# The .cumsum method with axis=1 accumulates the race percentages across each row.
# It gives a slightly different view of the data. For example, it is very easy to see the exact
# percentage of white and black students for each school:

college_ugds_cumsum = college_ugds.cumsum(axis=1)
college_ugds_cumsum.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


## Determining college campus diversity

In [9]:
pd.read_csv('../data/college_diversity.csv', index_col='School')

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74
"University of San Francisco San Francisco, CA",0.74
"San Francisco State University San Francisco, CA",0.73
"University of Illinois--Chicago Chicago, IL",0.73
"New Jersey Institute of Technology Newark, NJ",0.72
"Texas Woman's University Denton, TX",0.72


### How to do it\...

In [10]:
college = pd.read_csv('../data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [11]:
# Many of these colleges have missing values for all their race columns. We can count
# all the missing values for each row and sort the resulting Series from the highest
# to lowest. This will reveal the colleges that have missing values:

(college_ugds.isnull()
   .sum(axis='columns')
   .sort_values(ascending=False)
   .head()
)

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [12]:
# Now that we have seen the colleges that are missing all their race columns, we
# can use the .dropna method to drop all rows that have all nine race percentages
# missing. We can then count the remaining missing values:

college_ugds = college_ugds.dropna(how='all')
college_ugds.isnull().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [13]:
# There are no missing values left in the dataset. We can now calculate our diversity
# metric. To get started, we will use the greater than or equal DataFrame method,
# .ge, to return a DataFrame with a Boolean value for each cell:

college_ugds.ge(.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,False,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,False,False,False,False,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,False,False,False,False,False
Alabama State University,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,False,False,False,False,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,True,False,False,False,False,False,False
Coachella Valley Beauty College-Beaumont,True,False,True,False,False,False,False,False,False
Dewey University-Mayaguez,False,False,True,False,False,False,False,False,False


In [14]:
# From here, we can use the .sum method to count the True values for each college.
# Notice that a Series is returned:

diversity_metric = college_ugds.ge(.15).sum(axis='columns')
diversity_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [15]:
# To get an idea of the distribution, we will use the .value_counts method on this Series:

diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
Name: count, dtype: int64

In [16]:
# Amazingly, two schools have at least 15% in five different race categories. Let's
# sort the diversity_metric Series to find out which ones they are:

diversity_metric.sort_values(ascending=False).head()

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
dtype: int64

In [17]:
#  It seems a little suspicious that schools can be that diverse. Let's look at the raw
# percentages from these top two schools. We will use .loc to select rows based
# on the index label:

college_ugds.loc[['Regency Beauty Institute-Austin',
                   'Central Texas Beauty College-Temple']]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [18]:
# It appears that several categories were aggregated into the unknown and two or more
# races column. Regardless of this, they both appear to be quite diverse. We can see
# how the top five US News schools fared with this basic diversity metric:

us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]
diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

In [19]:
# Alternatively, we can find the schools that are least diverse by ordering them by their
# maximum race percentage:


(college_ugds
   .max(axis=1)
   .sort_values(ascending=False)
   .head(10)
)

INSTNM
Caribbean University-Ponce                                        1.0
Brighton Institute of Cosmetology                                 1.0
Mesivta Torah Vodaath Rabbinical Seminary                         1.0
Rabbinical College Telshe                                         1.0
University of Puerto Rico-Mayaguez                                1.0
Haskell Indian Nations University                                 1.0
Lake Career and Technical Center                                  1.0
Leon Studio One School of Hair Design & Career Training Center    1.0
Dewey University-Hato Rey                                         1.0
Columbia Central University-Caguas                                1.0
dtype: float64

In [20]:
# We can also determine if any school has all nine race categories exceeding 1%:

college_ugds.gt(.01).all(axis='columns').any()

True

### THE END