# Chapter 1: Pandas Foundations

## Recipes
* [Dissecting the anatomy of a DataFrame](#Dissecting-the-anatomy-of-a-DataFrame)
* [Accessing the main DataFrame components](#Accessing-the-main-DataFrame-components)
* [Understanding data types](#Understanding-data-types)
* [Selecting a single column of data as a Series](#Selecting-a-single-column-of-data-as-a-Series)
* [Calling Series methods](#Calling-Series-methods)
* [Working with operators on a Series](#Working-with-operators-on-a-Series)
* [Chaining Series methods together](#Chaining-Series-methods-together)
* [Making the index meaningful](#Making-the-index-meaningful)
* [Renaming row and column names](#Renaming-row-and-column-names)
* [Creating and deleting columns](#Creating-and-deleting-columns)

In [142]:
import pandas as pd
import polars as pl
import numpy as np
import time

# Dissecting the anatomy of a DataFrame

#### Change options to get specific output for book

In [143]:
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)

# pl.config.set_option('max_columns', 8)
# pl.config.set_option('max_rows', 30)

In [144]:
start_time_pandas = time.time()
movie_pandas = pd.read_csv('data/movie.csv')
end_time_pandas = time.time()
execution_time_pandas = end_time_pandas - start_time_pandas
print("Execution time for Pandas:", execution_time_pandas)

Execution time for Pandas: 0.04725837707519531


In [145]:
start_time_polars = time.time()
movie_polars = pl.read_csv('data/movie.csv')
end_time_polars = time.time()
execution_time_polars = end_time_polars - start_time_polars
print("Execution time for Polars:", execution_time_polars)

Execution time for Polars: 0.004992961883544922


In [146]:
movie_pl = pl.read_csv('data/movie.csv')
movie_pl.head()

color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
str,str,f64,f64,f64,f64,str,f64,f64,str,str,str,i64,i64,str,f64,str,str,f64,str,str,str,f64,f64,f64,f64,f64,i64
"""Color""","""James Cameron""",723.0,178.0,0.0,855.0,"""Joel David Moo…",1000.0,760505847.0,"""Action|Adventu…","""CCH Pounder""","""Avatar""",886204,4834,"""Wes Studi""",0.0,"""avatar|future|…","""http://www.imd…",3054.0,"""English""","""USA""","""PG-13""",237000000.0,2009.0,936.0,7.9,1.78,33000
"""Color""","""Gore Verbinski…",302.0,169.0,563.0,1000.0,"""Orlando Bloom""",40000.0,309404152.0,"""Action|Adventu…","""Johnny Depp""","""Pirates of the…",471220,48350,"""Jack Davenport…",0.0,"""goddess|marria…","""http://www.imd…",1238.0,"""English""","""USA""","""PG-13""",300000000.0,2007.0,5000.0,7.1,2.35,0
"""Color""","""Sam Mendes""",602.0,148.0,0.0,161.0,"""Rory Kinnear""",11000.0,200074175.0,"""Action|Adventu…","""Christoph Walt…","""Spectre""",275868,11700,"""Stephanie Sigm…",1.0,"""bomb|espionage…","""http://www.imd…",994.0,"""English""","""UK""","""PG-13""",245000000.0,2015.0,393.0,6.8,2.35,85000
"""Color""","""Christopher No…",813.0,164.0,22000.0,23000.0,"""Christian Bale…",27000.0,448130642.0,"""Action|Thrille…","""Tom Hardy""","""The Dark Knigh…",1144337,106759,"""Joseph Gordon-…",0.0,"""deception|impr…","""http://www.imd…",2701.0,"""English""","""USA""","""PG-13""",250000000.0,2012.0,23000.0,8.5,2.35,164000
,"""Doug Walker""",,,131.0,,"""Rob Walker""",131.0,,"""Documentary""","""Doug Walker""","""Star Wars: Epi…",8,143,,0.0,,"""http://www.imd…",,,,,,,12.0,7.1,,0


![dataframe anatomy](../images/ch01_dataframe_anatomy.png)

# Accessing the main DataFrame components

In [147]:
columns = movie_pl.columns
columns

['color',
 'director_name',
 'num_critic_for_reviews',
 'duration',
 'director_facebook_likes',
 'actor_3_facebook_likes',
 'actor_2_name',
 'actor_1_facebook_likes',
 'gross',
 'genres',
 'actor_1_name',
 'movie_title',
 'num_voted_users',
 'cast_total_facebook_likes',
 'actor_3_name',
 'facenumber_in_poster',
 'plot_keywords',
 'movie_imdb_link',
 'num_user_for_reviews',
 'language',
 'country',
 'content_rating',
 'budget',
 'title_year',
 'actor_2_facebook_likes',
 'imdb_score',
 'aspect_ratio',
 'movie_facebook_likes']

In [148]:
type(movie_pl.rows()), type(columns)

(list, list)

In [149]:
for row in movie_pl.rows()[:2]:
    print(row)
    print(type(row))

('Color', 'James Cameron', 723.0, 178.0, 0.0, 855.0, 'Joel David Moore', 1000.0, 760505847.0, 'Action|Adventure|Fantasy|Sci-Fi', 'CCH Pounder', 'Avatar', 886204, 4834, 'Wes Studi', 0.0, 'avatar|future|marine|native|paraplegic', 'http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1', 3054.0, 'English', 'USA', 'PG-13', 237000000.0, 2009.0, 936.0, 7.9, 1.78, 33000)
<class 'tuple'>
('Color', 'Gore Verbinski', 302.0, 169.0, 563.0, 1000.0, 'Orlando Bloom', 40000.0, 309404152.0, 'Action|Adventure|Fantasy', 'Johnny Depp', "Pirates of the Caribbean: At World's End", 471220, 48350, 'Jack Davenport', 0.0, 'goddess|marriage ceremony|marriage proposal|pirate|singapore', 'http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1', 1238.0, 'English', 'USA', 'PG-13', 300000000.0, 2007.0, 5000.0, 7.1, 2.35, 0)
<class 'tuple'>


# Understanding data types

In [150]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')

In [151]:
# movie_pl.dtypes

In [152]:
movie_pandas.value_counts()

color            director_name     num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name      actor_1_facebook_likes  gross        genres                           actor_1_name       movie_title       num_voted_users  cast_total_facebook_likes  actor_3_name       facenumber_in_poster  plot_keywords                                                              movie_imdb_link                                       num_user_for_reviews  language  country  content_rating  budget      title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
Black and White  Akira Kurosawa    153.0                   202.0     0.0                      4.0                     Minoru Chiaki     304.0                   269061.0     Action|Adventure|Drama           Takashi Shimura    Seven Samurai     229012           338                        Kamatari Fujiwara  6.0                   16th century|battle|japan|practice|samurai                     

In [153]:
movie_pl['actor_1_facebook_likes'].value_counts()

actor_1_facebook_likes,count
f64,u32
11.0,3
797.0,1
380.0,4
311.0,2
165.0,1
…,…
319.0,3
846.0,1
905.0,2
308.0,2


In [154]:
# value_counts = movie_pl.group_by(columns).len()
value_counts = movie_pl.group_by(['director_name', 'genres']).len()
value_counts

director_name,genres,len
str,str,u32
"""Norman Jewison…","""Drama|Thriller…",1
"""Josh Gordon""","""Comedy|Sport""",1
"""Mark Waters""","""Comedy""",1
"""Gregory Poirie…","""Comedy""",1
"""Nicholas Hytne…","""Drama|Music|Ro…",1
…,…,…
"""Frank Coraci""","""Comedy|Family|…",1
"""Spike Jonze""","""Comedy|Drama|F…",1
"""Clint Eastwood…","""Western""",1
"""Colin Trevorro…","""Comedy|Drama|R…",1


# Selecting a single column of data as a Series

In [155]:
# movie = pd.read_csv('data/movie.csv')
movie_pl = pl.read_csv('data/movie.csv')


In [156]:
movie_pl['director_name']

director_name
str
"""James Cameron"""
"""Gore Verbinski…"
"""Sam Mendes"""
"""Christopher No…"
"""Doug Walker"""
…
"""Scott Smith"""
""
"""Benjamin Rober…"
"""Daniel Hsia"""


In [157]:
# movie_pl.director_name  # AttributeError: 'DataFrame' object has no attribute 'director_name'

In [158]:
type(movie_pl['director_name'])

polars.series.series.Series

## There's more

In [159]:
director = movie_pl['director_name'] # save Series to variable
director.name

'director_name'

In [160]:
director.to_frame().head()

director_name
str
"""James Cameron"""
"""Gore Verbinski…"
"""Sam Mendes"""
"""Christopher No…"
"""Doug Walker"""


# Calling Series methods

## Getting ready...

In [161]:
s_attr_methods = set(dir(pl.Series))
len(s_attr_methods)

285

In [162]:
df_attr_methods = set(dir(pl.DataFrame))
len(df_attr_methods)

197

In [163]:
len(s_attr_methods & df_attr_methods)

108

## How to do it...

In [256]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')
director = movie_pl['director_name']
actor_1_fb_likes = movie_pl['actor_1_facebook_likes']

In [257]:
director.head()

director_name
str
"""James Cameron"""
"""Gore Verbinski…"
"""Sam Mendes"""
"""Christopher No…"
"""Doug Walker"""
"""Andrew Stanton…"
"""Sam Raimi"""
"""Nathan Greno"""
"""Joss Whedon"""
"""David Yates"""


In [258]:
actor_1_fb_likes.head()

actor_1_facebook_likes
f64
1000.0
40000.0
11000.0
27000.0
131.0
640.0
24000.0
799.0
26000.0
25000.0


In [259]:
# pd.set_option('max_rows', 8)
# pd.set_option('display.max_rows', 10)
director.value_counts()

director_name,count
str,u32
"""Corbin Bernsen…",1
"""Peyton Reed""",4
"""Shana Feste""",3
"""Ronny Yu""",4
"""Patrick Meaney…",1
…,…
"""Douglas Aarnio…",2
"""Mora Stephens""",1
"""Jared Hess""",3
"""Youssef Delara…",2


In [260]:
director.value_counts().limit(10)  

director_name,count
str,u32
"""Nicholas Hytne…",3
"""Paul McGuigan""",3
"""Ian Sharp""",1
"""Dror Moreh""",1
"""Nancy Walker""",1
"""Billy Wilder""",4
"""Ron Shelton""",4
"""Elia Kazan""",3
"""Rob Marshall""",5
"""Michael Polish…",5


In [261]:
actor_1_fb_likes.value_counts()

actor_1_facebook_likes,count
f64,u32
260000.0,3
14.0,2
612.0,1
466.0,2
735.0,2
…,…
39.0,2
123.0,1
736.0,2
334.0,1


In [263]:
# director.size
len(director)

4916

In [264]:
director.shape

(4916,)

In [265]:
director.count()

4814

In [266]:
actor_1_fb_likes.count()

4909

In [269]:
# actor_1_fb_likes.quantile() # 50th percentile by default
actor_1_fb_likes.quantile(0.5) # need define


982.0

In [270]:
actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
actor_1_fb_likes.std(), actor_1_fb_likes.sum()

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848185, 31881444.0)

In [271]:
actor_1_fb_likes.describe()

statistic,value
str,f64
"""count""",4909.0
"""null_count""",7.0
"""mean""",6494.488491
"""std""",15106.986884
"""min""",0.0
"""25%""",607.0
"""50%""",982.0
"""75%""",11000.0
"""max""",640000.0


In [272]:
director.describe()

statistic,value
str,str
"""count""","""4814"""
"""null_count""","""102"""
"""min""","""A. Raven Cruz"""
"""max""","""Étienne Faure"""


In [273]:
actor_1_fb_likes.quantile(.2)

510.0

In [280]:
temp = actor_1_fb_likes

# temp.dropna(inplace=True)
# temp.sort_values(ascending=True)

# temp.drop_nulls_mut(inplace=True)   # for df
temp = temp.drop_nulls()
sorted_series = temp.sort()
sorted_series

actor_1_facebook_likes
f64
0.0
0.0
0.0
0.0
0.0
…
164000.0
260000.0
260000.0
260000.0


In [282]:
# actor_1_fb_likes.quantile()
# actor_1_fb_likes.quantile(.2)
# actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, ])

quantiles = [0.25, 0.5, 0.75]  # Quantiles to calculate
quantile_values = [actor_1_fb_likes.quantile(q) for q in quantiles]
quantile_values

[607.0, 982.0, 11000.0]

In [285]:
# actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

quantiles = [.1, .2, .3, .4, .5, .6, .7, .8, .9]  # Quantiles to calculate
quantile_values = [actor_1_fb_likes.quantile(q) for q in quantiles]
quantile_values

[240.0, 510.0, 694.0, 854.0, 982.0, 1000.0, 8000.0, 13000.0, 18000.0]

In [287]:
director.is_not_null()

director_name
bool
true
true
true
true
true
…
true
false
true
true


In [290]:
actor_1_fb_likes.count()

4909

In [289]:
# actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled = actor_1_fb_likes.fill_null(0)
actor_1_fb_likes_filled.count()

4916

In [292]:
# actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped = actor_1_fb_likes.drop_nulls()
actor_1_fb_likes_dropped.shape

(4909,)

## There's more...

In [296]:
# director.value_counts(normalize=True)


counts = director.value_counts()
total_count = counts.sum()
normalized_counts = counts / total_count
normalized_counts

thread 'polars-0' panicked at crates/polars-core/src/series/arithmetic/borrowed.rs:492:42:
data types don't match: InvalidOperation(ErrString("div operation not supported for dtypes `str` and `str`"))
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: data types don't match: InvalidOperation(ErrString("div operation not supported for dtypes `str` and `str`"))

In [186]:
director.hasnans

True

In [187]:
director.notnull()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

# Working with operators on a Series

In [188]:
pd.options.display.max_rows = 6

In [189]:
5 + 9    # plus operator example. Adds 5 and 9

14

In [190]:
4 ** 2   # exponentiation operator. Raises 4 to the second power

16

In [191]:
a = 10   # assignment operator.

In [192]:
5 <= 9   # less than or equal to operator

True

In [193]:
'abcde' + 'fg'    # plus operator for strings. C

'abcdefg'

In [194]:
not (5 <= 9)      # not is an operator that is a reserved keyword and reverse a boolean

False

In [195]:
7 in [1, 2, 6]    # in operator checks for membership of a list

False

In [196]:
set([1,2,3]) & set([2,3,4])

{2, 3}

In [197]:
# [1, 2, 3] - 3   # TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [198]:
# a = set([1,2,3])     
# a[0]                 # the indexing operator does not work with sets | TypeError: 'set' object does not support indexing

## Getting ready...

In [199]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')
imdb_score = movie['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
       ... 
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [200]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [201]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
        ...  
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [202]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [203]:
imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [204]:
director = movie['director_name']

In [205]:
director == 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## There's more...

In [206]:
imdb_score.add(1)              # imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [207]:
imdb_score.mul(2.5)            # imdb_score * 2.5

0       19.75
1       17.75
2       17.00
        ...  
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [208]:
imdb_score.floordiv(7)         # imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [209]:
imdb_score.gt(7)               # imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [210]:
director.eq('James Cameron')   # director == 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [211]:
imdb_score.astype(int).mod(5)

0       2
1       2
2       1
       ..
4913    1
4914    1
4915    1
Name: imdb_score, Length: 4916, dtype: int64

In [212]:
a = type(1)

In [213]:
type(a)

type

In [214]:
a = type(imdb_score)

In [215]:
a([1,2,3])

0    1
1    2
2    3
dtype: int64

# Chaining Series methods together

In [216]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [217]:
director.value_counts().head(3)

director_name
Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: count, dtype: int64

In [218]:
actor_1_fb_likes.isnull().sum()

7

In [219]:
actor_1_fb_likes.dtype

dtype('float64')

In [220]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

## There's more...

In [221]:
actor_1_fb_likes.isnull().mean()

0.0014239218877135883

In [222]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

# Making the index meaningful

In [223]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')

In [224]:
movie.shape

(4916, 33)

In [225]:
movie2 = movie.set_index('movie_title')
movie2

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
Spectre,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
...,...,...,...,...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0,True,0.0,
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,0,True,2154.0,0.902766
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,0,True,125.0,0.766871


In [226]:
movie_pl = pl.read_csv('data/movie.csv')
# pd.read_csv('data/movie.csv', index_col='movie_title')

# There's more...

In [227]:
movie2.reset_index()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Avatar,Color,James Cameron,723.0,...,0,True,2791.0,0.577369
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,...,0,True,46000.0,0.951396
2,Spectre,Color,Sam Mendes,602.0,...,0,True,11554.0,0.987521
...,...,...,...,...,...,...,...,...,...
4913,A Plague So Pleasant,Color,Benjamin Roberds,13.0,...,0,True,0.0,
4914,Shanghai Calling,Color,Daniel Hsia,14.0,...,0,True,2154.0,0.902766
4915,My Date with Drew,Color,Jon Gunn,43.0,...,0,True,125.0,0.766871


# Renaming row and column names

In [228]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv', index_col='movie_title')

In [229]:
idx_rename = {'Avatar':'Ratava', 'Spectre': 'Ertceps'} 
col_rename = {'director_name':'Director Name', 
              'num_critic_for_reviews': 'Critical Reviews'} 

In [230]:
movie.rename(index=idx_rename, columns=col_rename).head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
4,,Doug Walker,,,...,0,True,0.0,0.0


# There's more

In [231]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv', index_col='movie_title')
index = movie.index
columns = movie.columns

# using tolist function
index_list = index.tolist()
column_list = columns.tolist()

index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

In [232]:
print(index_list[:5])

['Ratava', 1, 'Ertceps', 3, 4]


In [233]:
print(column_list)

['color', 'Director Name', 'Critical Reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'profit', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'has_seen', 'is_cast_likes_more', 'actor_total_facebook_likes', 'pct_actor_cast_like']


In [234]:
movie.index = index_list
movie.columns = column_list

In [235]:
movie.head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
Ratava,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
Ertceps,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
4,,Doug Walker,,,...,0,True,0.0,0.0


# Creating and deleting columns

In [236]:
movie_pl = pl.read_csv('data/movie.csv')
# movie = pd.read_csv('data/movie.csv')

In [237]:
movie['has_seen'] = 0

In [238]:
movie.columns

Index(['color', 'Director Name', 'Critical Reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'profit', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'has_seen',
       'is_cast_likes_more', 'actor_total_facebook_likes',
       'pct_actor_cast_like'],
      dtype='object')

In [239]:
# create new actor_director_facebook_likes
movie['actor_director_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                              movie['actor_2_facebook_likes'] + 
                                              movie['actor_3_facebook_likes'] + 
                                              movie['director_facebook_likes'])

In [240]:
movie['actor_director_facebook_likes'].isnull()

Ratava     False
1          False
Ertceps    False
           ...  
4913       False
4914       False
4915       False
Name: actor_director_facebook_likes, Length: 4916, dtype: bool

In [241]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [242]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [243]:
movie['actor_director_facebook_likes'].isnull().sum()

0

In [244]:
# create new "is_cast_likes_more"
movie['is_cast_likes_more'] = (movie['cast_total_facebook_likes'] >= movie['actor_director_facebook_likes'])

In [245]:
movie['is_cast_likes_more'].value_counts()

is_cast_likes_more
True     3973
False     943
Name: count, dtype: int64

In [246]:
movie['is_cast_likes_more'].all()

False

In [247]:
# drop actor_director_facebook_likes column
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [248]:
movie['actor_total_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                       movie['actor_2_facebook_likes'] + 
                                       movie['actor_3_facebook_likes'])

movie['actor_total_facebook_likes'] = movie['actor_total_facebook_likes'].fillna(0)

In [249]:
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'] >= movie['actor_total_facebook_likes']
    
movie['is_cast_likes_more'].all()

True

In [250]:
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / movie['cast_total_facebook_likes'])

In [251]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

In [252]:
movie.set_index('movie_title')['pct_actor_cast_like'].head()

movie_title
Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    0.000000
Name: pct_actor_cast_like, dtype: float64

## There's more...

In [253]:
movie.columns.get_loc('gross')

8

In [254]:
profit_index = movie.columns.get_loc('gross') + 1
profit_index

9

In [255]:
movie.insert(loc=profit_index, column='profit', value=movie['gross'] - movie['budget'])

ValueError: cannot insert profit, already exists

In [None]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
3,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
4,,Doug Walker,,,...,0,True,0.0,0.0
