In [1]:
import pandas as pd
import os

#### Configure pandas DataFrame display options.

In [2]:
# Limit DataFrame reprs to 8 columns and 20 rows; anything beyond is elided with "...".
# Fully qualified "display.*" keys are used because the bare patterns 'max_columns' and
# 'max_rows' also match the "styler.render.*" options in newer pandas releases, which
# makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 20)

#### Configure IPython to display the result of every expression in a cell, not just the last one.

In [3]:
# Setting ast_node_interactivity to "all" makes IPython display the value of every
# top-level expression in a cell rather than only the final one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# The data files are stored with a '.txt' extension on POSIX systems
# (the original note mentions macOS) and with '.csv' everywhere else.
file_suffix = '.txt' if os.name == 'posix' else '.csv'

## DataFrame

In [5]:
# Load the movie dataset with the platform-specific extension and preview the first rows.
movie_path = 'data/movie' + file_suffix
movie = pd.read_csv(movie_path)
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


##### Missing values are shown as NaN in Pandas

In [6]:
# Row labels of the frame (a RangeIndex, since no index column was specified when loading).
movie.index

RangeIndex(start=0, stop=4916, step=1)

In [7]:
# Column labels of the frame.
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [8]:
# Same preview as movie.head(), with the number of rows passed explicitly.
movie.head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


In [9]:
# Last three rows of the frame.
movie.tail(3)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
4913,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
4915,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


## DataFrame Components  (Index, Column, Data)

In [10]:
# Pull out the three building blocks of the DataFrame:
# row labels, column labels, and the underlying values.
index, columns, data = movie.index, movie.columns, movie.values

In [11]:
# Inspect the concrete type of each component extracted above.
type(index)
type(columns)
type(data)

# RangeIndex is a subclass of Index.
issubclass(pd.RangeIndex, pd.Index)

pandas.core.indexes.range.RangeIndex

pandas.core.indexes.base.Index

numpy.ndarray

True

In [12]:
# The row labels are displayed as a compact RangeIndex (start/stop/step).
movie.index

RangeIndex(start=0, stop=4916, step=1)

In [13]:
# .values exposes the labels held by each Index as a NumPy array.
index.values
columns.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

In [14]:
# Data type of every column.
movie.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

## Series

In [15]:
# Select a single column using attribute (dot) access.
movie.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [16]:
# Select the same column using bracket notation and keep it for later cells.
directors = movie['director_name']

In [17]:
# A single selected column is a Series.
type(directors)

pandas.core.series.Series

In [18]:
# The Series carries the label of the column it was selected from.
directors.name

'director_name'

In [19]:
# Convert the Series into a one-column DataFrame and display it.
new_df = directors.to_frame()
new_df

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker
...,...
4911,Scott Smith
4912,
4913,Benjamin Roberds
4914,Daniel Hsia


In [20]:
# Confirm that to_frame() produced a DataFrame.
type(new_df)

pandas.core.frame.DataFrame

In [21]:
# Count the attribute and method names that Series and DataFrame have in common.
series_attrs_methods, df_attrs_methods = (
    set(dir(cls)) for cls in (pd.Series, pd.DataFrame)
)
len(series_attrs_methods.intersection(df_attrs_methods))

384

In [22]:
# Two columns used throughout the Series examples: one of strings, one numeric.
director, actor_1_fb_likes = (
    movie['director_name'],
    movie['actor_1_facebook_likes'],
)

In [23]:
# Preview the first rows of the string column.
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [24]:
# Preview the first rows of the numeric column.
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [25]:
# Number of occurrences of each distinct director, most frequent first.
director.value_counts()

Steven Spielberg       26
Woody Allen            22
Clint Eastwood         20
Martin Scorsese        20
Ridley Scott           16
                       ..
Stuart Hazeldine        1
Matthew O'Callaghan     1
Mark Tonderai           1
Gabe Ibáñez             1
David LaChapelle        1
Name: director_name, Length: 2397, dtype: int64

In [26]:
# Relative frequencies instead of raw counts.
director.value_counts(normalize=True)

Steven Spielberg       0.005401
Woody Allen            0.004570
Clint Eastwood         0.004155
Martin Scorsese        0.004155
Ridley Scott           0.003324
                         ...   
Stuart Hazeldine       0.000208
Matthew O'Callaghan    0.000208
Mark Tonderai          0.000208
Gabe Ibáñez            0.000208
David LaChapelle       0.000208
Name: director_name, Length: 2397, dtype: float64

In [27]:
# Three ways to obtain the number of elements; all of them include missing values.
director.size, len(director), director.shape

(4916, 4916, (4916,))

In [28]:
# Summary of an object (string) column: count, unique, top, freq.
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [29]:
# Summary of a numeric column: count, mean, std, min, quartiles, max.
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [30]:
# Quick check for the presence of any missing value in the Series.
director.hasnans

True

In [31]:
# Boolean mask that is True wherever the director is missing.
director.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [32]:
# len() counts every row, whereas count() counts only the non-missing values.
len(actor_1_fb_likes), actor_1_fb_likes.count()

(4916, 4909)

In [33]:
# Replace missing like counts with 0; the result is a new Series.
fb_likes_filled = actor_1_fb_likes.fillna(0)

In [34]:
# After filling, length and non-missing count agree.
len(fb_likes_filled), fb_likes_filled.count()

(4916, 4916)

In [35]:
# Alternative treatment: discard the entries whose like count is missing.
fb_no_likes_dropped = actor_1_fb_likes.dropna()

In [36]:
# After dropping, the Series is shorter and contains no missing values.
len(fb_no_likes_dropped), fb_no_likes_dropped.count()

(4909, 4909)

### Series Operations

In [37]:
# Numeric Series used for the operator examples that follow.
imdb_score = movie['imdb_score']
type(imdb_score)
imdb_score

pandas.core.series.Series

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [38]:
# Arithmetic with a scalar is applied to every element.
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [39]:
# Multiplication by a scalar is element-wise as well.
imdb_score * 10

0       79.0
1       71.0
2       68.0
3       85.0
4       71.0
        ... 
4911    77.0
4912    75.0
4913    63.0
4914    63.0
4915    66.0
Name: imdb_score, Length: 4916, dtype: float64

In [40]:
# Comparison against a scalar yields a boolean Series.
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [41]:
# Method form of the + operator.
imdb_score.add(10)

0       17.9
1       17.1
2       16.8
3       18.5
4       17.1
        ... 
4911    17.7
4912    17.5
4913    16.3
4914    16.3
4915    16.6
Name: imdb_score, Length: 4916, dtype: float64

#### Series method chaining

In [42]:
# Chain two methods: count the directors, then keep the three most frequent.
directors.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: director_name, dtype: int64

In [43]:
# Number of missing values: summing the boolean mask counts the True entries.
actor_1_fb_likes.isnull().sum()

7

In [44]:
# Data type of the Series (float64 here, because the column contains missing values).
actor_1_fb_likes.dtype

dtype('float64')

In [45]:
# Fill the missing values with 0, cast to integers, and preview the result.
actor_1_fb_likes.fillna(0).astype(int).head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

### Index object

In [46]:
# Starting point: the frame still uses its default integer row labels.
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


In [47]:
# Build a new frame whose row labels are the movie titles; `movie` is not reassigned here.
movie2 = movie.set_index('movie_title')
movie2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


In [48]:
# Same result in one step: let read_csv use 'movie_title' as the index while loading.
movie2 = pd.read_csv('data/movie' + file_suffix, index_col='movie_title')
movie2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


In [49]:
# Move 'movie_title' back into a regular column and restore integer row labels.
movie2.reset_index()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,...,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,...,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,...,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,...,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,...,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...
4911,Signed Sealed Delivered,Color,Scott Smith,1.0,...,470.0,7.7,,84
4912,The Following,Color,,43.0,...,593.0,7.5,16.00,32000
4913,A Plague So Pleasant,Color,Benjamin Roberds,13.0,...,0.0,6.3,,16
4914,Shanghai Calling,Color,Daniel Hsia,14.0,...,719.0,6.3,2.35,660


#### renaming columns and rows

In [50]:
# Promote 'movie_title' from a column to the row index.
# set_index removes the column from the frame, so running this cell a second time
# would raise KeyError; the guard keeps the cell safe to re-run.
if movie.index.name != 'movie_title':
    movie = movie.set_index('movie_title')

In [51]:
# Old-label -> new-label mappings for the row index and for the columns.
idx_rename = {
    'Avatar': 'Ratava',
    'Spectre': 'Ertceps',
}
cols_rename = {
    'director_name': 'Director Name',
    'num_critic_for_reviews': 'Critics Review',
}

In [52]:
# Rename row labels and column labels in one call; the result is stored under a new name.
movie_renamed = movie.rename(index = idx_rename, columns=cols_rename)

In [53]:
# The mapped row labels and column labels now appear under their new names.
movie_renamed.head()

Unnamed: 0_level_0,color,Director Name,Critics Review,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


#### creating new columns and deleting columns

In [54]:
# The frame the new columns will be added to, shown before any modification.
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


In [55]:
# Assigning a scalar creates the column and fills every row with that value.
movie['has_seen'] = 0
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,7.9,1.78,33000,0
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,7.1,2.35,0,0
Spectre,Color,Sam Mendes,602.0,148.0,...,6.8,2.35,85000,0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,8.5,2.35,164000,0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,7.1,,0,0


In [56]:
# Combined Facebook likes of actors 1-3 and the director.
# Series.add is the method form of "+", so a missing operand still yields NaN for that row.
movie['actor_director_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .add(movie['director_facebook_likes'])
)
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,aspect_ratio,movie_facebook_likes,has_seen,actor_director_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,1.78,33000,0,2791.0
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,2.35,0,0,46563.0
Spectre,Color,Sam Mendes,602.0,148.0,...,2.35,85000,0,11554.0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,2.35,164000,0,95000.0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,,0,0,


In [57]:
# Number of rows whose combined total is missing.
movie['actor_director_facebook_likes'].isnull().sum()

122

In [58]:
# Treat missing totals as 0, then confirm that no missing values remain.
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)
movie['actor_director_facebook_likes'].isnull().sum()

0

In [59]:
# Existing cast-total column that the computed totals are compared against.
movie['cast_total_facebook_likes']

movie_title
Avatar                                          4834
Pirates of the Caribbean: At World's End       48350
Spectre                                        11700
The Dark Knight Rises                         106759
Star Wars: Episode VII - The Force Awakens       143
                                               ...  
Signed Sealed Delivered                         2283
The Following                                   1753
A Plague So Pleasant                               0
Shanghai Calling                                2386
My Date with Drew                                163
Name: cast_total_facebook_likes, Length: 4916, dtype: int64

In [60]:
# Flag rows where the cast total is at least the actor+director total (.ge is the method form of >=).
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_director_facebook_likes']
)

In [61]:
# True only if the comparison holds for every row.
movie['is_cast_likes_more'].all()

False

In [62]:
# Remove both helper columns in a single call.
movie = movie.drop(
    columns=['actor_director_facebook_likes', 'is_cast_likes_more']
)

In [63]:
# The two helper columns are gone; 'has_seen' is still present.
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,7.9,1.78,33000,0
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,7.1,2.35,0,0
Spectre,Color,Sam Mendes,602.0,148.0,...,6.8,2.35,85000,0
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,8.5,2.35,164000,0
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,7.1,,0,0


In [64]:
# Combined Facebook likes of actors 1-3 only (director excluded).
# A missing value in any of the three operands leaves NaN in the total.
movie['actor_total_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
)

In [65]:
# Treat missing actor totals as 0.
actor_total_filled = movie['actor_total_facebook_likes'].fillna(0)
movie['actor_total_facebook_likes'] = actor_total_filled

In [66]:
# Flag rows where the cast total is at least the actor-only total (.ge is the method form of >=).
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_total_facebook_likes']
)

In [67]:
# With the director excluded, the comparison holds for every row.
movie['is_cast_likes_more'].all()

True

In [68]:
# Fraction of the cast's total likes that comes from actors 1-3 (.div is the method form of /).
movie['pct_actor_cast_like'] = movie['actor_total_facebook_likes'].div(
    movie['cast_total_facebook_likes']
)

In [69]:
# Sanity check: the smallest and largest ratio should lie between 0 and 1.
(movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max())

(0.0, 1.0)

In [70]:
movie['pct_actor_cast_like'].head() * 100   # share of the cast's total facebook likes contributed by actors 1-3, in percent

movie_title
Avatar                                        57.736864
Pirates of the Caribbean: At World's End      95.139607
Spectre                                       98.752137
The Dark Knight Rises                         68.378310
Star Wars: Episode VII - The Force Awakens     0.000000
Name: pct_actor_cast_like, dtype: float64

In [71]:
# Integer position immediately after the 'gross' column.
profit_index = movie.columns.get_loc('gross') + 1
profit_index

9

In [72]:
# Place a 'profit' column (gross minus budget) at position `profit_index`.
# DataFrame.insert modifies the frame in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'profit' not in movie.columns:
    movie.insert(loc=profit_index,
                 column='profit',
                 value=movie['gross'] - movie['budget'])

In [73]:
# Preview the new column; rows with a missing gross or budget have a missing profit.
movie['profit'].head()

movie_title
Avatar                                        523505847.0
Pirates of the Caribbean: At World's End        9404152.0
Spectre                                       -44925825.0
The Dark Knight Rises                         198130642.0
Star Wars: Episode VII - The Force Awakens            NaN
Name: profit, dtype: float64