# Chapter 1: Pandas Foundations

In [1]:
import pandas as pd
import numpy as np

In [None]:
# More specifically, the index is axis 0, and the columns are axis 1.
# Axis 0 --> Rows/Index. In pandas, the row labels are called the index.
# Axis 1 --> Columns

## Dissecting the anatomy of a DataFrame

In [None]:
pd.set_option('display.max_columns', 4, 'display.max_rows', 10)

In [None]:
movies = pd.read_csv('../data/movie.csv')
movies.head()

## DataFrame Attributes

In [None]:
columns = movies.columns
index = movies.index
data = movies.values

In [None]:
columns

In [None]:
index

In [None]:
data

In [None]:
type(index)

In [None]:
type(columns)

In [None]:
type(data)

In [None]:
# The index and the columns are closely related. Both of them are subclasses of
# Index. This allows you to perform similar operations on both the index and the
# columns:

issubclass(pd.RangeIndex, pd.Index)

In [None]:
index.values

In [None]:
columns.values

## Understanding data types

In [None]:
movies.dtypes

In [None]:
movies.info()

## Selecting a Column

In [None]:
movies['director_name']

In [None]:
movies.director_name

In [None]:
# The usage of .loc specifies a selector for both rows and columns separated by
# a comma. The row selector is a slice with no start or end name (:) which means
# select all of the rows. The column selector will just pull out the column named
# director_name.

movies.loc[:, 'director_name']

In [None]:
# The .iloc index operation also specifies both row and column selectors. The row
# selector is the slice with no start or end index (:) that selects all of the rows. The
# column selector, 1, pulls off the second column (remember that Python is zero-based):

movies.iloc[:, 1]

In [None]:
movies['director_name'].index

In [None]:
movies['director_name'].dtype

In [None]:
movies['director_name'].size

In [None]:
movies['director_name'].name

In [None]:
# In pandas, a Series is a one-dimensional labeled array capable of holding any data type.

type(movies['director_name'])

In [None]:
# In summary, the code is used to determine the unique data types present in the 
# 'director_name' column of the movies DataFrame by applying the type function to 
# each element in that column and then extracting the unique types.

movies['director_name'].apply(type).unique()

In [None]:

director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [None]:
director.dtype

In [None]:
fb_likes.dtype

In [None]:
director.head()

In [None]:
director.sample(n=5, random_state=42)

In [None]:
fb_likes.head()

In [None]:
# One of the most useful methods for the object data type Series is .value_counts, 
# which calculates the frequencies:

director.value_counts()

In [None]:
fb_likes.value_counts()

In [None]:
director.size

In [None]:
director.shape

In [None]:
len(director)

In [None]:
director.unique()

In [None]:
director.count()

In [None]:
fb_likes.count()

In [None]:
fb_likes.min()

In [None]:
fb_likes.max()

In [None]:
fb_likes.mean()

In [None]:
fb_likes.median()

In [None]:
fb_likes.std()

In [None]:
fb_likes.describe()

In [None]:
director.describe()

In [None]:
fb_likes.quantile(0.2)

In [None]:
fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

In [None]:
# The .isna method can be used to determine whether each individual value is
# missing or not. The result is a Series. You may see this referred to as a 
# Boolean array (a Series with Boolean values that has the same index and 
# length as the original Series):

director.isna()

In [None]:
# It is possible to replace all missing values within a Series with the .fillna method

fb_likes_filled = fb_likes.fillna(0)

In [None]:
# This will output the number of non-null values in the fb_likes_filled Series. 

fb_likes_filled.count()

In [None]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

In [None]:
# This will output a new Series where the unique directors are listed as the index, 
# and the values represent the proportion of movies directed by each director relative 
# to the total number of movies in the original director Series. 

# This can be useful for understanding the distribution of directors in the dataset 
# in terms of their contribution to the movies.

director.value_counts(normalize=True)

In [None]:
# The director.hasnans expression is used with a pandas Series named director to 
# check whether there are any missing values (NaN, Not a Number) in the Series.

director.hasnans

In [None]:
director.notna()

## Series Operations

In [None]:
movies = pd.read_csv('../data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

In [None]:
imdb_score + 1

In [None]:
imdb_score * 2.5

In [None]:
imdb_score // 7

In [None]:
imdb_score > 7

In [None]:
director = movies['director_name']
director == 'James Cameron'

In [None]:
imdb_score.add(1)   # imdb_score + 1

In [None]:
imdb_score.gt(7)   # imdb_score > 7

## Chaining Series Methods

In [None]:
movies = pd.read_csv('../data/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [None]:
director.value_counts().head(3)

In [None]:
fb_likes.isna().sum()

In [None]:
fb_likes.dtype

In [None]:
# Putting it all together, the code takes the fb_likes Series, fills any 
# missing values with 0, converts the elements to integers, and then displays 
# the first few rows of the resulting Series.

(fb_likes.fillna(0)
         .astype(int)
         .head()
)

In [None]:
(fb_likes.fillna(0)
         #.astype(int)
         #.head()
)

In [None]:
(fb_likes.fillna(0)
         .astype(int)
         #.head()
)

In [None]:
fb_likes.isna().mean()

In [None]:
fb_likes.fillna(0).astype(int).head()

In [None]:
x = fb_likes.fillna(0).astype(int).head()

In [None]:
x

In [None]:
fb_likes.fillna(0) \
        .astype(int) \
        .head()

In [None]:
def debug_df(df):
    """Print ``df`` between "BEFORE" and "AFTER" marker lines and return it.

    Intended for use with ``.pipe`` inside a method chain: the object is
    handed back unchanged, so the chain continues as if this step were not
    there.
    """
    # One print call with a newline separator emits the same three lines
    # as three separate print calls.
    print("BEFORE", df, "AFTER", sep="\n")
    return df

In [None]:
(fb_likes.fillna(0)
         .pipe(debug_df)
         .astype(int) 
         .head()
)

# In summary, this code snippet performs a sequence of operations on the
# fb_likes Series. It fills missing values with 0, then uses the debug_df function
# to print that intermediate Series once (between the "BEFORE" and "AFTER" marker
# lines) ahead of the type conversion, converts the Series to integers, and finally
# displays the first few rows of the transformed Series for inspection.

# This kind of pipeline is common in data preprocessing and allows for a clear and 
# organized sequence of data transformation steps. The debug_df function aids in 
# understanding the intermediate states of the data during these transformations.

In [None]:
# Module-level slot that get_intermediate() overwrites on every call.
intermediate = None


def get_intermediate(df):
    """Store ``df`` in the module-level ``intermediate`` name and return it.

    Meant to be dropped into a ``.pipe`` chain to capture the object at that
    point of the chain without interrupting it.
    """
    # Rebind the module global through the namespace dict; this has the same
    # effect as a ``global`` declaration followed by an assignment.
    globals()["intermediate"] = df
    return df

In [None]:
res = (fb_likes.fillna(0)
         .pipe(get_intermediate)
         .astype(int) 
         .head()
)


# The key addition here is the use of the get_intermediate function via the pipe method. 
# This means that the intermediate state of the DataFrame or Series, after filling missing 
# values, is stored in the intermediate variable. 

# This can be useful for debugging or understanding the data at various stages of the 
# processing pipeline. Keep in mind that using global variables, as seen in the 
# get_intermediate function, should be done cautiously to avoid potential issues 
# with code maintainability.

In [None]:
intermediate

## Renaming Column Names

In [2]:
movies = pd.read_csv('../data/movie.csv')

In [3]:
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [4]:
# The .rename DataFrame method accepts dictionaries that map the old value to the
# new value. Let's create one for the columns:

col_map = {'director_name':'Director Name', 
             'num_critic_for_reviews': 'Critical Reviews'} 

In [5]:
# Pass the dictionary to the .rename method and display the first rows of the result:

movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [7]:
# In this recipe, we changed the names of the columns. You can also rename the 
# index using the .rename method if you want to. This makes more sense if the 
# index labels are string values.

# So we will set the index to the movie_title column and then map those values 
# to new ones:


idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}

In [8]:
(movies
   .set_index('movie_title')
   .rename(index=idx_map, columns=col_map)
   .head(3)
)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [10]:
# There are multiple ways to rename row and column labels. It is possible to 
# reassign the index and column attributes to a Python list. 

# This assignment works when the list has the same number of elements as the 
# row and column labels.


# The following code shows an example. We will read the data from the CSV file, 
# and use the index_col parameter to tell pandas to use the movie_title column 
# as the index. 

# Then we use the .tolist method on each Index object to create a Python list of labels. 

# We then modify three values in each of the lists and reassign them to the .index 
# and .columns attributes:


movies = pd.read_csv('../data/movie.csv', index_col='movie_title')
ids = movies.index.tolist()
columns = movies.columns.tolist()

## Rename the row and column labels with list assignments

In [11]:
ids[0] = 'Ratava'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[1] = 'director'
columns[-2] = 'aspect'
columns[-1] = 'fblikes'
movies.index = ids
movies.columns = columns

In [12]:
movies.head(3)

Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [None]:
# The combined effect of these operations is to create a cleaned and formatted 
# version of the input string. The string is stripped of leading and trailing 
# whitespaces, converted to lowercase, and spaces are replaced with underscores.

def to_clean(val):
    """Return ``val`` trimmed, lower-cased, and with spaces turned into underscores."""
    trimmed = val.strip()
    lowered = trimmed.lower()
    # Splitting on a single space and re-joining with "_" swaps every space
    # for an underscore, one for one.
    return '_'.join(lowered.split(' '))

In [None]:
movies.rename(columns=to_clean).head(3)

In [None]:
cols = [col.strip().lower().replace(' ', '_')
        for col in movies.columns]
movies.columns = cols
movies.head(3)

## Creating and Deleting columns

In [None]:
# One way to create a new column is to do an index assignment. Note that this will not
# return a new DataFrame but mutate the existing DataFrame. If you assign the column
# to a scalar value, it will use that value for every cell in the column. Let's create the
# has_seen column in the movie dataset to indicate whether or not we have seen the
# movie. We will assign zero for every value. By default, new columns are appended to
# the end:

In [15]:
movies = pd.read_csv('../data/movie.csv')
movies['has_seen'] = 0

In [17]:
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0


In [None]:
# You can also use the .assign method instead. This will return a new DataFrame
# with the new column. Because it uses the parameter name as the column name, the
# column name must be a valid parameter name:

In [18]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .rename(index=idx_map, columns=col_map)
   .assign(has_seen=0)
)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,English,Canada,,,2013.0,470.0,7.7,,84,0
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,0
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,0
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,0


In [None]:
# There are several columns that contain data on the number of Facebook likes. Let's
# add up all actor and director Facebook like columns and assign them to the
# total_likes column.

In [19]:
total = (movies['actor_1_facebook_likes'] +
         movies['actor_2_facebook_likes'] + 
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])

In [20]:
total.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [None]:
# My preference is to use methods that we can chain, so I prefer calling .sum here.
# I will pass a list of column names to the indexing operator to pull out just those
# columns that I want to sum:

In [21]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
    'actor_3_facebook_likes','director_facebook_likes']
sum_col = movies[cols].sum(axis='columns')
sum_col.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4      274.0
dtype: float64

In [None]:
# Then we can assign this Series to the new column. Note that when we called the
# + operator, the result had missing numbers (NaN), but the .sum method ignores
# missing numbers by default, so we get a different result:

In [22]:
movies.assign(total_likes=sum_col).head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,total_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,274.0


-----------------------------------------------------------------------------------------------------------------

In [None]:
def sum_likes(df):
    """Return the row-wise sum of every column whose name contains 'like'.

    The result comes from ``DataFrame.sum`` with its default arguments apart
    from the axis. Written to be passed as a callable to ``DataFrame.assign``.
    """
    like_cols = [name for name in df.columns if 'like' in name]
    return df[like_cols].sum(axis='columns')

In [None]:
movies.assign(total_likes=sum_likes).head(5)

In [None]:
(movies
   .assign(total_likes=sum_col)
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
(movies
   .assign(total_likes=total)
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
(movies
   .assign(total_likes=total.fillna(0))
   ['total_likes']
   .isna()
   .sum()
)

In [None]:
def cast_like_gt_actor_director(df):
    """Return an element-wise comparison of two like-count columns.

    Each element is True where ``cast_total_facebook_likes`` is greater than
    or equal to ``total_likes``. Written to be passed as a callable to
    ``DataFrame.assign``.
    """
    cast_likes = df['cast_total_facebook_likes']
    combined_likes = df['total_likes']
    return cast_likes.ge(combined_likes)

In [None]:
df2 = (movies
   .assign(total_likes=total,
           is_cast_likes_more = cast_like_gt_actor_director)
)

In [None]:
df2['is_cast_likes_more'].all()

In [None]:
df2 = df2.drop(columns='total_likes')

In [None]:
actor_sum = (movies
   [[c for c in movies.columns if 'actor_' in c and '_likes' in c]]
   .sum(axis='columns')
)

In [None]:
actor_sum.head(5)

In [None]:
movies['cast_total_facebook_likes'] >= actor_sum

In [None]:
movies['cast_total_facebook_likes'].ge(actor_sum)

In [None]:
movies['cast_total_facebook_likes'].ge(actor_sum).all()

In [None]:
pct_like = (actor_sum
    .div(movies['cast_total_facebook_likes'])
)

In [None]:
pct_like.describe()

In [None]:
pd.Series(pct_like.values,
    index=movies['movie_title'].values).head()

In [None]:
profit_index = movies.columns.get_loc('gross') + 1
profit_index

In [None]:
movies.insert(loc=profit_index,
              column='profit',
              value=movies['gross'] - movies['budget'])

In [None]:
del movies['director_name']

### THE END