# Chapter 2: Essential DataFrame Operations

In [2]:
import pandas as pd
import numpy as np
# pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)

In [3]:
import os


In [4]:
os.chdir(r'D:\a00nutstore\Pandas-Cookbook-Second-Edition-master')

## Introduction

## Selecting Multiple DataFrame Columns

### How to do it\...

In [5]:
movies = pd.read_csv('data/movie.csv')
movie_actor_director = movies[['actor_1_name', 'actor_2_name',
    'actor_3_name', 'director_name']]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [6]:
type(movies[['director_name']])

pandas.core.frame.DataFrame

In [7]:
type(movies['director_name'])

pandas.core.series.Series

In [8]:
type(movies.loc[:, ['director_name']])

pandas.core.frame.DataFrame

In [9]:
type(movies.loc[:, 'director_name'])

pandas.core.series.Series

### How it works\...

### There\'s more\...

In [11]:
cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movie_actor_director = movies[cols]

In [13]:
movies.filter(items = cols)

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
4911,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
4912,Natalie Zea,Valorie Curry,Sam Underwood,
4913,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
4914,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


In [12]:
movies['actor_1_name', 'actor_2_name',
      'actor_3_name', 'director_name']

KeyError: ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name')

## Selecting Columns with Methods

### How it works\...

In [14]:
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    return (col.replace('facebook_likes', 'fb')
               .replace('_for_reviews', '')
    )
movies = movies.rename(columns=shorten)
movies.get_dtype_counts()

AttributeError: 'DataFrame' object has no attribute 'get_dtype_counts'

In [None]:
movies.select_dtypes(include='int').head()

In [None]:
movies.select_dtypes(include='number').head()

In [None]:
movies.select_dtypes(include=['int', 'object']).head()

In [None]:
movies.select_dtypes(exclude='float').head()

In [None]:
movies.filter(like='fb').head()

In [None]:
cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movies.filter(items=cols).head()

In [None]:
movies.filter(regex=r'\d').head()

### How it works\...

### There\'s more\...

### See also

## Ordering Column Names

### How to do it\...

In [None]:
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    return (col.replace('facebook_likes', 'fb')
               .replace('_for_reviews', '')
    )
movies = movies.rename(columns=shorten)

In [15]:
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [16]:
cat_core = ['movie_title', 'title_year',
            'content_rating', 'genres']
cat_people = ['director_name', 'actor_1_name',
              'actor_2_name', 'actor_3_name']
cat_other = ['color', 'country', 'language',
             'plot_keywords', 'movie_imdb_link']
cont_fb = ['director_fb', 'actor_1_fb',
           'actor_2_fb', 'actor_3_fb',
           'cast_total_fb', 'movie_fb']
cont_finance = ['budget', 'gross']
cont_num_reviews = ['num_voted_users', 'num_user',
                    'num_critic']
cont_other = ['imdb_score', 'duration',
               'aspect_ratio', 'facenumber_in_poster']

In [17]:
new_col_order = cat_core + cat_people + \
                cat_other + cont_fb + \
                cont_finance + cont_num_reviews + \
                cont_other
set(movies.columns) == set(new_col_order)

True

In [18]:
movies[new_col_order].head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_fb,budget,gross,num_voted_users,num_user,num_critic,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,8,,,7.1,,,0.0


### How it works\...

### There\'s more\...

### See also

## Summarizing a DataFrame

### How to do it\...

In [19]:
movies = pd.read_csv('data/movie.csv')
movies.shape

(4916, 28)

In [20]:
movies.size

137648

In [21]:
movies.ndim

2

In [22]:
len(movies)

4916

In [23]:
movies.count()

color                        4897
director_name                4814
num_critic_for_reviews       4867
duration                     4901
director_facebook_likes      4814
actor_3_facebook_likes       4893
actor_2_name                 4903
actor_1_facebook_likes       4909
gross                        4054
genres                       4916
actor_1_name                 4909
movie_title                  4916
num_voted_users              4916
cast_total_facebook_likes    4916
actor_3_name                 4893
facenumber_in_poster         4903
plot_keywords                4764
movie_imdb_link              4916
num_user_for_reviews         4895
language                     4904
country                      4911
content_rating               4616
budget                       4432
title_year                   4810
actor_2_facebook_likes       4903
imdb_score                   4916
aspect_ratio                 4590
movie_facebook_likes         4916
dtype: int64

In [24]:
movies.min()

  movies.min()


num_critic_for_reviews                                                     1.0
duration                                                                   7.0
director_facebook_likes                                                    0.0
actor_3_facebook_likes                                                     0.0
actor_1_facebook_likes                                                     0.0
gross                                                                    162.0
genres                                                                  Action
movie_title                                                            #Horror
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                       0.0
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

In [25]:
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28602,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_facebook_likes,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_facebook_likes,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_facebook_likes,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


In [26]:
movies.describe(percentiles=[.01, .3, .99]).T

Unnamed: 0,count,mean,std,min,1%,30%,50%,99%,max
num_critic_for_reviews,4867.0,137.9889,120.2394,1.0,2.0,60.0,108.0,546.68,813.0
duration,4901.0,107.0908,25.28602,7.0,43.0,95.0,103.0,189.0,511.0
director_facebook_likes,4814.0,691.0145,2832.954,0.0,0.0,11.0,48.0,16000.0,23000.0
actor_3_facebook_likes,4893.0,631.2763,1625.875,0.0,0.0,176.0,366.0,11000.0,23000.0
actor_1_facebook_likes,4909.0,6494.488,15106.99,0.0,6.08,694.0,982.0,44920.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,8474.8,7914068.6,25043962.0,326412800.0,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,53.0,11864.5,33132.5,681584.6,1689764.0
cast_total_facebook_likes,4916.0,9579.816,18164.32,0.0,6.0,1684.5,3049.0,62413.9,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,0.0,1.0,8.0,43.0
num_user_for_reviews,4895.0,267.6688,372.9348,1.0,1.94,80.0,153.0,1999.24,5060.0


### How it works\...

### There\'s more\...

In [27]:
movies.min(skipna=False)

  movies.min(skipna=False)


num_critic_for_reviews                                                     NaN
duration                                                                   NaN
director_facebook_likes                                                    NaN
actor_3_facebook_likes                                                     NaN
actor_1_facebook_likes                                                     NaN
gross                                                                      NaN
genres                                                                  Action
movie_title                                                            #Horror
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                       NaN
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

## Chaining DataFrame Methods

### How to do it\...

In [28]:
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    return (col.replace('facebook_likes', 'fb')
               .replace('_for_reviews', '')
    )
movies = movies.rename(columns=shorten)
movies.isnull().head()

Unnamed: 0,color,director_name,num_critic,duration,director_fb,actor_3_fb,actor_2_name,actor_1_fb,gross,genres,...,num_user,language,country,content_rating,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,True,False


In [29]:
(movies
   .isnull()
   .sum()
   .head()
)

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
dtype: int64

In [30]:
movies.isnull().sum().sum()

2654

In [31]:
movies.isnull().any().any()

True

### How it works\...

In [32]:
movies.isnull().get_dtype_counts()

AttributeError: 'DataFrame' object has no attribute 'get_dtype_counts'

### There\'s more\...

In [33]:
movies[['color', 'movie_title', 'color']].max()

  movies[['color', 'movie_title', 'color']].max()


movie_title    Æon Flux
dtype: object

In [34]:
with pd.option_context('max_colwidth', 20):
    movies.select_dtypes(['object']).fillna('').max()

In [35]:
with pd.option_context('max_colwidth', 20):
    (movies
        .select_dtypes(['object'])
        .fillna('')
        .max()
    )

### See also

## DataFrame Operations

In [36]:
colleges = pd.read_csv('data/college.csv')
colleges + 5

TypeError: can only concatenate str (not "int") to str

In [None]:
colleges = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = colleges.filter(like='UGDS_')
college_ugds.head()

In [None]:
name = 'Northwest-Shoals Community College'
college_ugds.loc[name]

In [None]:
college_ugds.loc[name].round(2)

In [None]:
(college_ugds.loc[name] + .0001).round(2)

In [None]:
college_ugds + .00501

In [None]:
(college_ugds + .00501) // .01

In [None]:
college_ugds_op_round = (college_ugds + .00501) // .01 / 100
college_ugds_op_round.head()

In [None]:
college_ugds_round = (college_ugds + .00001).round(2)
college_ugds_round

In [None]:
college_ugds_op_round.equals(college_ugds_round)

### How it works\...

In [None]:
.045 + .005

### There\'s more\...

In [None]:
college2 = (college_ugds
    .add(.00501) 
    .floordiv(.01) 
    .div(100)
)
college2.equals(college_ugds_op_round)

### See also

## Comparing Missing Values

In [None]:
np.nan == np.nan

In [None]:
None == None

In [None]:
np.nan > 5

In [None]:
5 > np.nan

In [None]:
np.nan != 5

### Getting ready

In [None]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [None]:
college_ugds == .0019

In [None]:
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

In [None]:
college_self_compare.all()

In [None]:
(college_ugds == np.nan).sum()

In [None]:
college_ugds.isnull().sum()

In [None]:
college_ugds.equals(college_ugds)

### How it works\...

### There\'s more\...

In [None]:
college_ugds.eq(.0019)    # same as college_ugds == .0019

In [None]:
from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds, college_ugds) is None

## Transposing the direction of a DataFrame operation

### How to do it\...

In [None]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

In [None]:
college_ugds.count()

In [None]:
college_ugds.count(axis='columns').head()

In [None]:
college_ugds.sum(axis='columns').head()

In [None]:
college_ugds.median(axis='index')

### How it works\...

### There\'s more\...

In [None]:
college_ugds_cumsum = college_ugds.cumsum(axis=1)
college_ugds_cumsum.head()

### See also

## Determining college campus diversity

In [None]:
pd.read_csv('data/college_diversity.csv', index_col='School')

### How to do it\...

In [None]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [None]:
(college_ugds.isnull()
   .sum(axis='columns')
   .sort_values(ascending=False)
   .head()
)

In [None]:
college_ugds = college_ugds.dropna(how='all')
college_ugds.isnull().sum()

In [None]:
college_ugds.ge(.15)

In [None]:
diversity_metric = college_ugds.ge(.15).sum(axis='columns')
diversity_metric.head()

In [None]:
diversity_metric.value_counts()

In [None]:
diversity_metric.sort_values(ascending=False).head()

In [None]:
college_ugds.loc[['Regency Beauty Institute-Austin',
                   'Central Texas Beauty College-Temple']]

In [None]:
us_news_top = ['Rutgers University-Newark',
                  'Andrews University',
                  'Stanford University',
                  'University of Houston',
                  'University of Nevada-Las Vegas']
diversity_metric.loc[us_news_top]

### How it works\...

### There\'s more\...

In [None]:
(college_ugds
   .max(axis=1)
   .sort_values(ascending=False)
   .head(10)
)

In [None]:
(college_ugds > .01).all(axis=1).any()

### See also