# Pandas Tutorial

In [1]:
import pandas as pd

## Series

In [2]:
#Pandas series
ser = pd.Series([10, 20, 30, 40, 50])
ser2 = pd.Series([15, 93, 5, 27, 9])
ser

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
#Series to list
ls = ser.tolist()
print(ls)

#List to series
ser = pd.Series(ls)
print(ser)

[10, 20, 30, 40, 50]
0    10
1    20
2    30
3    40
4    50
dtype: int64


In [4]:
#Add series
ser3 = ser + ser2
ser3

0     25
1    113
2     35
3     67
4     59
dtype: int64

In [5]:
#Compare series
ser2 > ser

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [12]:
#Dict to series
d = {'a': 100, 'b': 200, 'c':300, 'd':400, 'e':800}
pd.Series(d)

a    100
b    200
c    300
d    400
e    800
dtype: int64

In [13]:
#Custom indices and name
sr = pd.Series([30000, 35000, 40000], 
          index=['year 1', 'year 2', 'year 3'], 
          name='Product A')
sr

year 1    30000
year 2    35000
year 3    40000
Name: Product A, dtype: int64

In [14]:
# Setting indices
sr.index = ['2015 sales', '2016 sales', '2017 sales']
print(sr)

# Getting indices
print([i for i in sr.index])

2015 sales    30000
2016 sales    35000
2017 sales    40000
Name: Product A, dtype: int64
['2015 sales', '2016 sales', '2017 sales']


## Dataframe

In [15]:
#Pandas Dataframe
tempdf = pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']})
#keys: column names, values: entries for column
tempdf

Unnamed: 0,Bob,Sue
0,I liked it.,Pretty good.
1,It was awful.,Bland.


In [16]:
#Custom indices
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']},
             index=['Product A', 'Product B'])

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


In [17]:
#Read dataframe to csv 
#tempdf.to_csv('tempdf.csv')

In [19]:
#Read csv to dataframe
df = pd.read_csv("datasets/tv_shows.csv")
df.head()

Unnamed: 0,Title,Genre,Premiere,No_of_Seasons,No_of_Episodes
0,Stranger Things,Science Fiction Horror,"July 15, 2016",3,25
1,The Crown,Historical Drama,"November 4, 2016",4,40
2,Ozark,Crime Drama,"July 21, 2017",3,30
3,Lost in Space,Science Fiction,"April 13, 2018",2,20
4,Narcos: Mexico,Crime Drama,"November 16, 2018",2,20


In [20]:
# Can also set a column as index
df_indexed = pd.read_csv("datasets/tv_shows.csv", index_col = 0)
df_indexed.head()

Unnamed: 0_level_0,Genre,Premiere,No_of_Seasons,No_of_Episodes
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Stranger Things,Science Fiction Horror,"July 15, 2016",3,25
The Crown,Historical Drama,"November 4, 2016",4,40
Ozark,Crime Drama,"July 21, 2017",3,30
Lost in Space,Science Fiction,"April 13, 2018",2,20
Narcos: Mexico,Crime Drama,"November 16, 2018",2,20


In [21]:
# Number of rows and cols
df.shape

(641, 5)

In [22]:
# Get column names
print(df.columns)

#Renaming columns
columns = ['title', 'genre', 'premiere', 'seasons', 'episodes']
df.columns = columns
print(df.columns)

Index(['Title', 'Genre', 'Premiere', 'No_of_Seasons', 'No_of_Episodes'], dtype='object')
Index(['title', 'genre', 'premiere', 'seasons', 'episodes'], dtype='object')


In [24]:
#To set a column as index
df1 = pd.read_csv("datasets/tv_shows.csv", index_col=0) 

#or,
df1 = df1.set_index('Genre')
df1.head()

Unnamed: 0_level_0,Premiere,No_of_Seasons,No_of_Episodes
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Science Fiction Horror,"July 15, 2016",3,25
Historical Drama,"November 4, 2016",4,40
Crime Drama,"July 21, 2017",3,30
Science Fiction,"April 13, 2018",2,20
Crime Drama,"November 16, 2018",2,20


In [25]:
#Column
col = df.title
#or
col = df["title"]

print(col)
print(type(col)) #Series

0                        Stranger Things
1                              The Crown
2                                  Ozark
3                          Lost in Space
4                         Narcos: Mexico
                     ...                
636                        The Last Narc
637    All or Nothing: Tottenham Hotspur
638                             Fernando
639                      El Desafío: ETA
640                  James May: Oh Cook!
Name: title, Length: 641, dtype: object
<class 'pandas.core.series.Series'>


### Indexing

In [26]:
#Index based selection: iloc
#Selecting rows and columns

df.iloc[:] # All rows
df.iloc[0] # Row 0
df.iloc[0:10] # Rows 0 to 9
df.iloc[[1, 2, 3]] # Rows 1, 2, 3
df.iloc[:, 1] # Column 1
df.iloc[0:10, 1] # Rows 0 to 9, Column 1
df.iloc[[1, 2, 3], [0]] # Rows 1, 2, 3 and Column 0
df.iloc[:5, :3] # Rows 0 to 4, Columns 0 to 2
df.iloc[-5] # Row 5 from the bottom

#Selecting single entry
df.iloc[0][0]

'Stranger Things'

In [27]:
#Label based selection for df: loc

# Single row
df.loc[0] #Pass row label

# Multiple rows
df.loc[[0,1,2]]

# Slicing rows
df.loc[6:9] # Inclusive
#or
df[6:9] # Slicing without loc is only for row selection

# Single column
df.loc[:, 'genre']

# Multiple columns
df.loc[:, ['title', 'episodes']]

# Slicing columns
df.loc[:, 'title':'premiere']

# Specific rows and columns
df.loc[[5, 7, 10], ['title', 'genre']] 

# Slicing rows and columns
df.loc[4:8, 'title':'premiere']

Unnamed: 0,title,genre,premiere
4,Narcos: Mexico,Crime Drama,"November 16, 2018"
5,The Umbrella Academy,Superhero Action,"February 15, 2019"
6,Black Summer,Zombie Drama,"April 11, 2019"
7,Another Life,Science Fiction Drama,"July 25, 2019"
8,Criminal: UK,Police Procedural Anthology Series,"September 20, 2019"


In [29]:
# Label based indexing for series
s = df['genre']

# Single item
s.loc[0]
s[0]

# Multiple items
s.loc[[2, 6, 9]]
s[[2, 6, 9]]

# Slice of items
s.loc[3:8]
s[3:8]

3          Science Fiction
4              Crime Drama
5         Superhero Action
6             Zombie Drama
7    Science Fiction Drama
Name: genre, dtype: object

- With loc[], the ending slice is included.
- With iloc[], the ending slice is not included.

value_counts(): This method for a series(column) displays each unique non-null value and their counts in order.

In [22]:
s.value_counts()

Docu Series                                  78
Comedy                                       60
Drama                                        35
Comedy Drama                                 30
Crime Drama                                  25
                                             ..
Superhero Fiction Crime Drama Legal Drama     1
Horror Thriller                               1
Coming Of Age Documentary                     1
Making Of                                     1
Cooking Show                                  1
Name: genre, Length: 171, dtype: int64

In [32]:
# Including null values in count
# True by default
s.value_counts(dropna = False)

Docu Series                                  78
Comedy                                       60
Drama                                        35
Comedy Drama                                 30
Crime Drama                                  25
                                             ..
Superhero Fiction Crime Drama Legal Drama     1
Horror Thriller                               1
Coming Of Age Documentary                     1
Making Of                                     1
Cooking Show                                  1
Name: genre, Length: 171, dtype: int64

#### Applying a boolean condition for selection

In [37]:
# Printing all historical dramas titles
 
# condition = df['genre'] == 'Historical Drama' #type is dataframe
# condition = condition.squeeze()  #converting to series

condition = df['genre'] == 'Historical Drama'
hist_dram = df.loc[condition, ['title']] # condition applied to rows
hist_dram

Unnamed: 0,title
1,The Crown
93,Barbarians
103,Luna Nera
212,Marco Polo
310,The English Game
403,Roman Empire
407,Five Came Back
559,The Forgotten Army
593,El Cid


In [44]:
# Finding columns with some missing values

condition = df['title'] == 'Title'
temp = df.loc[condition, ['title']] #indexing
temp

Unnamed: 0,title


In [45]:
# Changing those rows with missing values

for index, row in df.iterrows():
    row = ['title', 'genre', 'date', 0, 0]
    if (row[0] == 'Title'):
        df.loc[index] = row

In [46]:
# Or, we could just drop those rows

drop_indices = [i for i in temp.index]
df.drop(drop_indices, inplace = True) # inplace means the changes happen in df itself, no copy is returned

df.shape

(639, 5)

In [48]:
# Converting seasons and episodes cols to numeric type

df['seasons']= pd.to_numeric(df['seasons'], errors = 'coerce')
df['episodes']= pd.to_numeric(df['episodes'], errors = 'coerce')

In [49]:
# Getting shows with more than 10 seasons

df.loc[df.seasons >= 10]

Unnamed: 0,title,genre,premiere,seasons,episodes
163,ARASHI's Diary -Voyage-,Docu Series,"December 31, 2019",23.0,
364,My Holo Love,Science Fiction Romance,"February 7, 2020",12.0,
507,Too Old to Die Young,Crime Drama,"June 14, 2019",10.0,


### Vectorized operations

In [50]:
df2 = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50], 'C':[100, 200, 300, 400, 500]})
df2

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [51]:
df2['A'] + df2['B']

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [52]:
df2['A'] * df2['B'] * df2['C']

0      1000
1      8000
2     27000
3     64000
4    125000
dtype: int64

In [53]:
# Series methods
s = df2['A'] 
s.max()
s.min()
s.mean()
s.median()
s.mode()
s.sum()
s.count()

5

### Summary functions:

In [54]:
# Summary of all numeric columns
df.describe()

Unnamed: 0,seasons,episodes
count,638.0,558.0
mean,2.059561,13.551971
std,1.919796,11.869578
min,1.0,2.0
25%,1.0,7.0
50%,1.0,10.0
75%,2.0,16.0
max,23.0,91.0


In [35]:
# Summary of all non-numeric columns
df.describe(include=['O'])

Unnamed: 0,title,genre,premiere
count,639,639,639
unique,639,170,388
top,Stranger Things,Docu Series,"November 16, 2018"
freq,1,78,7


In [55]:
# Summary of single numeric column
df.seasons.describe()

count    638.000000
mean       2.059561
std        1.919796
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       23.000000
Name: seasons, dtype: float64

In [56]:
# Summary of single non-numeric column
df.title.describe()

count                 639
unique                639
top       Stranger Things
freq                    1
Name: title, dtype: object

- count: Number of non-null values
- unique: Number of unique values in the series
- top: Most common value in the series
- freq: Frequency of the most common value

In [57]:
# Get a stats from a series

df.seasons.max()
df.seasons.mean()
df.episodes.std()
df.genre.unique()
df.title.count()

639

In [58]:
# Unique values in a column

unique_genres = df.genre.unique()  # List
len(unique_genres)

170

In [60]:
# Find number of movies with 5 episodes
# loc can be used to get single value from a series

df['episodes'].value_counts().loc[5]

17

To get df stats, we need to specify axis (to calculate along rows or columns?)
- axis = 0 or "index" means along the row axis (for each column)
- axis = 1 or "columns" means along the columns axis (for each row)
<br> Default is axis=0

In [61]:
# Get stats from a dataframe
# Along row (vertical) axis
df.count(axis=0)

title       639
genre       639
premiere    639
seasons     638
episodes    558
dtype: int64

In [62]:
# Along column (horizontal) axis
df2.mean(axis=1)

0     37.0
1     74.0
2    111.0
3    148.0
4    185.0
dtype: float64

In [65]:
# Stats for multiple columns
df[['seasons', 'episodes']].mean(axis=0)

seasons      2.059561
episodes    13.551971
dtype: float64

In [66]:
# Same, but for selected rows
df.loc[0:10, ['seasons', 'episodes']].mean()

seasons      2.090909
episodes    19.000000
dtype: float64

In [67]:
# Get names of all the numeric columns in a list
import numpy as np
list(df.select_dtypes(include=[np.number]).columns)

['seasons', 'episodes']

In [46]:
# Stats of only numeric columns
df.select_dtypes(include=[np.number]).mean()

seasons      2.059561
episodes    13.551971
dtype: float64

In [47]:
# Select the titles where genre is Horror
horror_bool = df['genre'] == 'Horror'
df.loc[horror_bool, 'title']

22      The Haunting of Bly Manor
108                JU-ON: Origins
115                    Sweet Home
247               October Faction
302    The Haunting of Hill House
349                         Ghoul
352                    Typewriter
Name: title, dtype: object

In [48]:
# Set the value of all 10s in column B to nan
df2.loc[df2['B'] == 10, 'B'] = np.nan
df2

Unnamed: 0,A,B,C
0,1,,100
1,2,20.0,200
2,3,30.0,300
3,4,40.0,400
4,5,50.0,500


In [49]:
# Add a column to df2 that id the difference between A and B
df2['D'] = np.abs(df2['A'] - df2['B'])
df2

Unnamed: 0,A,B,C,D
0,1,,100,
1,2,20.0,200,18.0
2,3,30.0,300,27.0
3,4,40.0,400,36.0
4,5,50.0,500,45.0


In [50]:
# Get top 3 most frequent values of episodes for Crime Dramas
# Return series of episodes and counts

df.loc[df["genre"] == 'Crime Drama', "episodes"].value_counts().head(3)

10.0    4
30.0    3
20.0    3
Name: episodes, dtype: int64

Some functions that return boolean masks:
- Series.isnull()
- Series.notnull()

In [51]:
# Get the rows where episodes is null
df.loc[df['episodes'].isnull()].head()

Unnamed: 0,title,genre,premiere,seasons,episodes
22,The Haunting of Bly Manor,Horror,"October 9, 2020",9.0,
25,The Queen's Gambit,Drama,"October 23, 2020",7.0,
70,The Liberator,War Drama,"November 11, 2020",4.0,
80,Transformers: War for Cybertron Trilogy: Siege,Science Fiction,"July 30, 2020",6.0,
83,Transformers: War for Cybertron Trilogy: Earth...,Science Fiction,"December 30, 2020",6.0,


In [52]:
# Select the first row from the above selection
# Use index 0 for iloc
df.loc[df['episodes'].isnull()].iloc[0]

#or, use index label 22 for loc
df.loc[df['episodes'].isnull()].loc[22]

title       The Haunting of Bly Manor
genre                          Horror
premiere              October 9, 2020
seasons                           9.0
episodes                          NaN
Name: 22, dtype: object

Every operation will align on the index labels. Pandas will also:
- Discard any items that have an index that doesn't match the dataframe (like arugula).
- Fill any remaining rows with NaN.
<br>
<br>
If we choose only not null episodes and multiply with seasons, the length of the new series will be smaller than original columns. If we try to append this to df, the rows will align on index labels, the missing indices will be NaN.

In [53]:
# create a new column where you multiply non-null episodes with seasons
# new_df has the rows with non-null episodes
new_df = df[df["episodes"].notnull()]
df["new_col"] = new_df["seasons"] * new_df["episodes"]

In [54]:
df

Unnamed: 0,title,genre,premiere,seasons,episodes,new_col
0,Stranger Things,Science Fiction Horror,"July 15, 2016",3.0,25.0,75.0
1,The Crown,Historical Drama,"November 4, 2016",4.0,40.0,160.0
2,Ozark,Crime Drama,"July 21, 2017",3.0,30.0,90.0
3,Lost in Space,Science Fiction,"April 13, 2018",2.0,20.0,40.0
4,Narcos: Mexico,Crime Drama,"November 16, 2018",2.0,20.0,40.0
...,...,...,...,...,...,...
636,The Last Narc,Drug Documentary,"July 31, 2020",1.0,4.0,4.0
637,All or Nothing: Tottenham Hotspur,Sports Documentary,"August 31, 2020",1.0,9.0,9.0
638,Fernando,Sports Documentary,"September 25, 2020",1.0,5.0,5.0
639,El Desafío: ETA,Docuseries,"October 30, 2020",1.0,8.0,8.0


In [62]:
df.drop(['new_col'], axis=1, inplace = True)

#### Boolean operators (&, |, ~) for selection

In [90]:
# Select the rows with more than 5 seasons AND of the Crime Drama genre
cond_1 = df["seasons"] > 5
cond_2 = df["genre"] == "Crime Drama"
combine = cond_1 & cond_2
df.loc[combine]

Unnamed: 0,title,genre,premiere,seasons,episodes
140,The Great Heist,Crime Drama,"August 14, 2020",6.0,
409,The Keepers,Crime Drama,"May 19, 2017",7.0,
507,Too Old to Die Young,Crime Drama,"June 14, 2019",10.0,


In [101]:
# Select cooking shows where episodes is not Nan
df.loc[(df["genre"] == "Cooking Show") & (df["episodes"].notnull())]

Unnamed: 0,title,genre,premiere,seasons,episodes
640,James May: Oh Cook!,Cooking Show,"November 13, 2020",1.0,7.0


#### sort_values() method

In [112]:
# Find the Horror show title with the most seasons
df.loc[df["genre"] == 'Horror'].sort_values(by = "seasons", ascending = False).iloc[0]['title']

'The Haunting of Bly Manor'

#### For loops in pandas

In [119]:
# Find the highest number of seasons for each genre
result = {} 
genres = df["genre"].unique()
for g in genres:
    # select rows of that genre only
    rows = df[df["genre"] == g]
    max_seasons = rows["seasons"].max()
    result[g] = max_seasons
result

{'Science Fiction Horror': 3.0,
 'Historical Drama': 5.0,
 'Crime Drama': 10.0,
 'Science Fiction': 6.0,
 'Superhero Action': 2.0,
 'Zombie Drama': 1.0,
 'Science Fiction Drama': 1.0,
 'Police Procedural Anthology Series': 2.0,
 'Superhero Science Fiction Drama': 1.0,
 'Romantic Drama': 2.0,
 'Fantasy Drama': 1.0,
 'Horror Teen Drama': 1.0,
 'Fantasy': 3.0,
 'Coming Of Age Drama': 2.0,
 'Romance Drama': 1.0,
 'Teen Drama': 3.0,
 'Anthology Series': 1.0,
 'Detective Drama': 1.0,
 'Drama': 8.0,
 'Horror': 9.0,
 'Romance': 1.0,
 'Biopic': 1.0,
 'Teen Drama Fantasy': 1.0,
 'Comedy Drama': 7.0,
 'Comedy': 6.0,
 'Satire Drama': 3.0,
 'Coming Of Age Comedy': 4.0,
 'Sketch Comedy': 1.0,
 'Dark Comedy': 2.0,
 'Black Comedy Drama': 6.0,
 'Sitcom': 8.0,
 'Dramedy': 1.0,
 'Musical Comedy': 1.0,
 'Medieval Fantasy Comedy': 3.0,
 'Anthology': 1.0,
 'Mockumentary': 2.0,
 'Science Fantasy Dark Comedy': 1.0,
 'War Drama': 4.0,
 'Dark Fantasy': 3.0,
 'Suspense': 1.0,
 'Workplace Comedy': 3.0,
 "Children

In [120]:
# Find the title name with highest number of seasons for each genre
result = {}
genres = df["genre"].unique()
for g in genres:
    # select rows of that genre only
    rows = df[df["genre"] == g]
    sorted_rows = rows.sort_values("seasons", ascending = False)
    title = sorted_rows.iloc[0]['title']
    result[g] = title
result

{'Science Fiction Horror': 'Stranger Things',
 'Historical Drama': 'The Forgotten Army',
 'Crime Drama': 'Too Old to Die Young',
 'Science Fiction': 'Transformers: War for Cybertron Trilogy: Siege',
 'Superhero Action': 'The Umbrella Academy',
 'Zombie Drama': 'Black Summer',
 'Science Fiction Drama': 'Another Life',
 'Police Procedural Anthology Series': 'Criminal: UK',
 'Superhero Science Fiction Drama': 'Raising Dion',
 'Romantic Drama': 'Virgin River',
 'Fantasy Drama': 'The Witcher',
 'Horror Teen Drama': 'Locke & Key',
 'Fantasy': 'Just Add Magic',
 'Coming Of Age Drama': 'Trinkets',
 'Romance Drama': 'Sweet Magnolias',
 'Teen Drama': 'Free Rein',
 'Anthology Series': 'Homemade',
 'Detective Drama': 'Young Wallander',
 'Drama': 'The Mess You Leave Behind',
 'Horror': 'The Haunting of Bly Manor',
 'Romance': 'Dash & Lily',
 'Biopic': 'Selena: The Series',
 'Teen Drama Fantasy': 'Fate: The Winx Saga',
 'Comedy Drama': 'Orange Is the New Black',
 'Comedy': 'Project Mc2',
 'Satire Dr

### Groupby

In [55]:
df3 = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
                              'Parrot', 'Parrot'],
                    'Type': ['Captive', 'Wild', 
                             'Captive', 'Wild'],
                   'Max Speed': [380., 370., 24., 26.]})
df3

Unnamed: 0,Animal,Type,Max Speed
0,Falcon,Captive,380.0
1,Falcon,Wild,370.0
2,Parrot,Captive,24.0
3,Parrot,Wild,26.0


In [56]:
# Groupby

grouped = df3.groupby(['Type']).mean()
grouped.columns = ['Mean']
grouped

Unnamed: 0_level_0,Mean
Type,Unnamed: 1_level_1
Captive,202.0
Wild,198.0


In [57]:
# Mean seasons and episodes for each genre
# Shows mean for all numeric columns

df.groupby(['genre']).mean()

Unnamed: 0_level_0,seasons,episodes,new_col
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Action,2.0,14.0,28.0
Action Comedy,1.0,8.0,8.0
Action Comedy Drama,1.0,8.0,8.0
Action Drama,1.5,15.0,24.0
Action Historical,3.0,47.5,189.5
...,...,...,...
Western,1.0,7.0,7.0
Workplace Comedy,3.0,30.0,90.0
Young Adult Drama,1.0,10.0,10.0
Zombie Drama,1.0,8.0,8.0


In [58]:
# Latest premiere date in each genre

df.groupby(['genre']).max()['premiere']

genre
Action                    March 29, 2019
Action Comedy          November 10, 2017
Action Comedy Drama    February 28, 2020
Action Drama                 May 4, 2018
Action Historical       October 10, 2019
                             ...        
Western                November 22, 2017
Workplace Comedy          April 20, 2018
Young Adult Drama      December 11, 2020
Zombie Drama              April 11, 2019
Zombie Horror              June 10, 2020
Name: premiere, Length: 170, dtype: object