# Chapter 1: Pandas Foundations

## Recipes
* [Dissecting the anatomy of a DataFrame](#Dissecting-the-anatomy-of-a-DataFrame)
* [Accessing the main DataFrame components](#Accessing-the-main-DataFrame-components)
* [Understanding data types](#Understanding-data-types)
* [Selecting a single column of data as a Series](#Selecting-a-single-column-of-data-as-a-Series)
* [Calling Series methods](#Calling-Series-methods)
* [Working with operators on a Series](#Working-with-operators-on-a-Series)
* [Chaining Series methods together](#Chaining-Series-methods-together)
* [Making the index meaningful](#Making-the-index-meaningful)
* [Renaming row and column names](#Renaming-row-and-column-names)
* [Creating and deleting columns](#Creating-and-deleting-columns)

In [360]:
import pandas as pd
import numpy as np

# Dissecting the anatomy of a DataFrame

#### Change options to get specific output for book

In [235]:
# pd.set_option('max_columns', 8, 'max_rows', 10)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 28)

In [236]:
movie = pd.read_csv('data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


![dataframe anatomy](../images/ch01_dataframe_anatomy.png)

# Accessing the main DataFrame components

In [237]:
columns = movie.columns
index = movie.index
data = movie.values

In [238]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [239]:
index

RangeIndex(start=0, stop=4916, step=1)

In [240]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [241]:
type(index)

pandas.core.indexes.range.RangeIndex

In [242]:
type(columns)

pandas.core.indexes.base.Index

In [243]:
type(data)

numpy.ndarray

In [244]:
issubclass(pd.RangeIndex, pd.Index)

True

## There's more

In [245]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [246]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

# Understanding data types

In [247]:
movie = pd.read_csv('data/movie.csv')

In [248]:
movie_cp = movie.copy()
dir_names = movie_cp.director_name

In [249]:
temp = dir_names.to_frame()
temp.head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


In [250]:
type(temp)

pandas.core.frame.DataFrame

In [251]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [252]:
movie.value_counts()

color            director_name     num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name      actor_1_facebook_likes  gross        genres                           actor_1_name       movie_title       num_voted_users  cast_total_facebook_likes  actor_3_name       facenumber_in_poster  plot_keywords                                                              movie_imdb_link                                       num_user_for_reviews  language  country  content_rating  budget      title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
Black and White  Akira Kurosawa    153.0                   202.0     0.0                      4.0                     Minoru Chiaki     304.0                   269061.0     Action|Adventure|Drama           Takashi Shimura    Seven Samurai     229012           338                        Kamatari Fujiwara  6.0                   16th century|battle|japan|practice|samurai                     

# Selecting a single column of data as a Series

![dataframe anatomy](../images/ch01_series_anatomy.png)

In [253]:
movie = pd.read_csv('data/movie.csv')

In [254]:
movie['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [255]:
movie.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [256]:
type(movie['director_name'])

pandas.core.series.Series

In [357]:
movie.director_name.value_counts()

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: count, Length: 2397, dtype: int64

## There's more

In [257]:
director = movie['director_name'] # save Series to variable
director.name

'director_name'

In [258]:
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# Calling Series methods

## Getting ready...

In [259]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

411

In [260]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

427

In [261]:
len(s_attr_methods & df_attr_methods)

357

## How to do it...

In [262]:
movie = pd.read_csv('data/movie.csv')
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [359]:
director.value_counts()

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: count, Length: 2397, dtype: int64

In [263]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [264]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [265]:
# pd.set_option('max_rows', 8)
pd.set_option('display.max_rows', 10)
director.value_counts()

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: count, Length: 2397, dtype: int64

In [266]:
actor_1_fb_likes.value_counts()

actor_1_facebook_likes
1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: count, Length: 877, dtype: int64

In [267]:
director.size

4916

In [268]:
director.shape

(4916,)

In [269]:
len(director)

4916

In [270]:
director.count()

4814

In [271]:
actor_1_fb_likes.count()

4909

In [272]:
actor_1_fb_likes.quantile() # 50th percentile by default

982.0

In [273]:
actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
actor_1_fb_likes.std(), actor_1_fb_likes.sum()

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

In [274]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [275]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [276]:
actor_1_fb_likes.quantile(.2)

510.0

In [277]:
temp = actor_1_fb_likes
temp.dropna(inplace=True)
temp.sort_values(ascending=True)

4542         0.0
4883         0.0
4602         0.0
4866         0.0
4273         0.0
          ...   
1212    164000.0
4312    260000.0
4485    260000.0
4592    260000.0
1885    640000.0
Name: actor_1_facebook_likes, Length: 4909, dtype: float64

In [278]:
# actor_1_fb_likes.quantile()
# actor_1_fb_likes.quantile(.2)
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, ])

0.1    240.0
0.2    510.0
0.3    694.0
0.4    854.0
0.5    982.0
Name: actor_1_facebook_likes, dtype: float64

In [279]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [280]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [281]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled.count()

4909

In [282]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped.size

4909

## There's more...

In [283]:
director.value_counts(normalize=True)

director_name
Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
Ridley Scott        0.003324
                      ...   
John Putch          0.000208
Luca Guadagnino     0.000208
Sam Fell            0.000208
Dan Fogelman        0.000208
Daniel Hsia         0.000208
Name: proportion, Length: 2397, dtype: float64

In [284]:
director.hasnans

True

In [285]:
director.notnull()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

# Working with operators on a Series

In [286]:
pd.options.display.max_rows = 30

In [287]:
5 + 9    # plus operator example. Adds 5 and 9

14

In [288]:
4 ** 2   # exponentiation operator. Raises 4 to the second power

16

In [289]:
a = 10   # assignment operator.

In [290]:
5 <= 9   # less than or equal to operator

True

In [291]:
'abcde' + 'fg'    # plus operator for strings. C

'abcdefg'

In [292]:
not (5 <= 9)      # not is an operator that is a reserved keyword and reverse a boolean

False

In [293]:
7 in [1, 2, 6]    # in operator checks for membership of a list

False

In [294]:
set([1,2,3]) & set([2,3,4])

{2, 3}

In [295]:
# [1, 2, 3] - 3   # TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [296]:
# a = set([1,2,3])     
# a[0]                 # the indexing operator does not work with sets | TypeError: 'set' object does not support indexing

## Getting ready...

In [297]:
movie = pd.read_csv('data/movie.csv')
type(movie.dtypes)

pandas.core.series.Series

In [298]:
# movie.dtypes
numeric_dtypes = movie.dtypes[movie.dtypes.apply(lambda x: np.issubdtype(x, np.number))]
numeric_dtypes

num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_1_facebook_likes       float64
gross                        float64
num_voted_users                int64
cast_total_facebook_likes      int64
facenumber_in_poster         float64
num_user_for_reviews         float64
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

In [299]:
imdb_score = movie['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [300]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [301]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [302]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [303]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [304]:
director = movie['director_name']

In [305]:
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## There's more...

In [306]:
imdb_score.add(1)              # imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [307]:
imdb_score.mul(2.5)            # imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [308]:
imdb_score.floordiv(7)         # imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [309]:
imdb_score.gt(7)               # imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [310]:
director.eq('James Cameron')   # director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [311]:
imdb_score.astype(int).mod(5)

0       2
1       2
2       1
3       3
4       2
       ..
4911    2
4912    2
4913    1
4914    1
4915    1
Name: imdb_score, Length: 4916, dtype: int64

In [312]:
a = type(1)

In [313]:
type(a)

type

In [314]:
a = type(imdb_score)

In [315]:
a([1,2,3])

0    1
1    2
2    3
dtype: int64

# Chaining Series methods together

In [316]:
movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [317]:
director.value_counts().head(3)

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: count, dtype: int64

In [318]:
actor_1_fb_likes.isnull().sum()

7

In [319]:
actor_1_fb_likes.dtype

dtype('float64')

In [320]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

## There's more...

In [321]:
actor_1_fb_likes.isnull().mean()

0.0014239218877135883

In [322]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

# Making the index meaningful

In [323]:
movie = pd.read_csv('data/movie.csv')

In [324]:
movie.shape

(4916, 28)

In [325]:
movie2 = movie.set_index('movie_title')
movie2

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...
Signed Sealed Delivered,Color,Scott Smith,1.0,87.0,...,470.0,7.7,,84
The Following,Color,,43.0,43.0,...,593.0,7.5,16.00,32000
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660


In [326]:
pd.read_csv('data/movie.csv', index_col='movie_title')

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...
Signed Sealed Delivered,Color,Scott Smith,1.0,87.0,...,470.0,7.7,,84
The Following,Color,,43.0,43.0,...,593.0,7.5,16.00,32000
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660


# There's more...

In [327]:
movie2.reset_index()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,...,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,...,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,...,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,...,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,...,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...
4911,Signed Sealed Delivered,Color,Scott Smith,1.0,...,470.0,7.7,,84
4912,The Following,Color,,43.0,...,593.0,7.5,16.00,32000
4913,A Plague So Pleasant,Color,Benjamin Roberds,13.0,...,0.0,6.3,,16
4914,Shanghai Calling,Color,Daniel Hsia,14.0,...,719.0,6.3,2.35,660


# Renaming row and column names

In [328]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')

In [329]:
idx_rename = {'Avatar':'Ratava', 'Spectre': 'Ertceps'} 
col_rename = {'director_name':'Director Name', 
              'num_critic_for_reviews': 'Critical Reviews'} 

In [330]:
movie.rename(index=idx_rename, columns=col_rename).head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


# There's more

In [331]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
index = movie.index
columns = movie.columns

# using tolist function
index_list = index.tolist()
column_list = columns.tolist()

index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

In [332]:
print(index_list[:5])

['Ratava', "Pirates of the Caribbean: At World's End", 'Ertceps', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens']


In [333]:
print(column_list)

['color', 'Director Name', 'Critical Reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [334]:
movie.index = index_list
movie.columns = column_list

In [335]:
movie.head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


# Creating and deleting columns

In [336]:
movie = pd.read_csv('data/movie.csv')

In [337]:
movie['has_seen'] = 0

In [338]:
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'has_seen'],
      dtype='object')

In [339]:
# create new actor_director_facebook_likes
movie['actor_director_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                              movie['actor_2_facebook_likes'] + 
                                              movie['actor_3_facebook_likes'] + 
                                              movie['director_facebook_likes'])

In [340]:
movie['actor_director_facebook_likes'].isnull()

0       False
1       False
2       False
3       False
4        True
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: actor_director_facebook_likes, Length: 4916, dtype: bool

In [341]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [342]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [343]:
movie['actor_director_facebook_likes'].isnull().sum()

0

In [344]:
# create new "is_cast_likes_more"
movie['is_cast_likes_more'] = (movie['cast_total_facebook_likes'] >= movie['actor_director_facebook_likes'])

In [345]:
movie['is_cast_likes_more'].value_counts()

is_cast_likes_more
True     3973
False     943
Name: count, dtype: int64

In [346]:
movie['is_cast_likes_more'].all()

False

In [347]:
# drop actor_director_facebook_likes column
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [348]:
movie['actor_total_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                       movie['actor_2_facebook_likes'] + 
                                       movie['actor_3_facebook_likes'])

movie['actor_total_facebook_likes'] = movie['actor_total_facebook_likes'].fillna(0)

In [349]:
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'] >= movie['actor_total_facebook_likes']
    
movie['is_cast_likes_more'].all()

True

In [350]:
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / movie['cast_total_facebook_likes'])

In [351]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

In [352]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

movie_title
Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    0.000000
Name: pct_actor_cast_like, dtype: float64

## There's more...

In [353]:
movie.columns.get_loc('gross')

8

In [354]:
profit_index = movie.columns.get_loc('gross') + 1
profit_index

9

In [355]:
movie.insert(loc=profit_index, column='profit', value=movie['gross'] - movie['budget'])

In [356]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
4,,Doug Walker,,,...,0,True,0.0,0.0
