In [1]:
# Loading a Sample Pandas DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame.from_dict({
    'Name': ['Nik', 'Kate', 'Evan', 'Kyra', np.NaN],
    'Age': [33, 32, 40, 57, np.NaN],
    'Location': ['Toronto', 'London', 'New York', np.NaN, np.NaN]
})
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York
3  Kyra  57.0       NaN
4   NaN   NaN       NaN


In [2]:
# Exploring the .isnull() method
print(df.isnull())

    Name    Age  Location
0  False  False     False
1  False  False     False
2  False  False     False
3  False  False      True
4   True   True      True


In [3]:
print(df.isnull().sum())

Name        1
Age         1
Location    2
dtype: int64


In [None]:
# Exploring the Pandas .dropna() method
df.dropna(
    axis=0,         # Whether to drop rows or columns
    how='any',      # Whether to drop records if 'all' or 'any' records are missing
    thresh=None,    # How many columns/rows must be missing to drop
    subset=None,    # Which rows/columns to consider
    inplace=False   # Whether to drop in place (i.e., without needing to re-assign)
)

In [4]:
# Dropping Values with Default Arguments
df = df.dropna()
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [5]:
# Dropping Records Only if All Records are Missing
df = df.dropna(how='all')
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [6]:
# Using .fillna() to Fill Missing Data
df = df.fillna(0)
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [7]:
# Filling Columns with Different Values
df = df.fillna({'Name': 'Someone', 'Age': 25, 'Location': 'USA'})
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [8]:
# Imputing a Missing Value
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df)


   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [9]:
# Loading a Sample Pandas DataFrame
import pandas as pd
df = pd.DataFrame.from_dict({
    'Name': ['Nik', 'Kate', 'Evan', 'Kyra', 'Nik', 'Kate'],
    'Age': [33, 32, 40, 57, 33, 32],
    'Location': ['Toronto', 'London', 'New York', 'Atlanta', 'Toronto', 'Paris'],
    'Date Modified': ['2022-01-01', '2022-02-24', '2022-08-12', '2022-09-12', '2022-01-01', '2022-12-09']
})

print(df)

   Name  Age  Location Date Modified
0   Nik   33   Toronto    2022-01-01
1  Kate   32    London    2022-02-24
2  Evan   40  New York    2022-08-12
3  Kyra   57   Atlanta    2022-09-12
4   Nik   33   Toronto    2022-01-01
5  Kate   32     Paris    2022-12-09


In [10]:
# Identifying Duplicate Records in a Pandas DataFrame
print(df.duplicated())


0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool


In [11]:
# Counting Duplicate Records in a DataFrame
print(df.duplicated().sum())

1


In [12]:
# The Pandas .drop_duplicates() method
df.drop_duplicates(
    subset=None,            # Which columns to consider 
    keep='first',           # Which duplicate record to keep
    inplace=False,          # Whether to drop in place
    ignore_index=False      # Whether to relabel the index
)

Unnamed: 0,Name,Age,Location,Date Modified
0,Nik,33,Toronto,2022-01-01
1,Kate,32,London,2022-02-24
2,Evan,40,New York,2022-08-12
3,Kyra,57,Atlanta,2022-09-12
5,Kate,32,Paris,2022-12-09


In [13]:
# Dropping Duplicates with Default Arguments
df = df.drop_duplicates()
print(df)

   Name  Age  Location Date Modified
0   Nik   33   Toronto    2022-01-01
1  Kate   32    London    2022-02-24
2  Evan   40  New York    2022-08-12
3  Kyra   57   Atlanta    2022-09-12
5  Kate   32     Paris    2022-12-09


In [14]:
# Dropping Based on a Subset of Columns
df = df.sort_values(by='Date Modified', ascending=False)
df = df.drop_duplicates(subset=['Name', 'Age'], keep='first')
print(df)

   Name  Age  Location Date Modified
5  Kate   32     Paris    2022-12-09
3  Kyra   57   Atlanta    2022-09-12
2  Evan   40  New York    2022-08-12
0   Nik   33   Toronto    2022-01-01


In [15]:
# Loading a Sample Pandas DataFrame
import pandas as pd
df = pd.DataFrame.from_dict({
    'Name': ['Tranter, Melvyn', 'Lana, Courtney', 'Abel, Shakti', 'Vasu, Imogene', 'Aravind, Shelly'],
    'Region': ['Region A', 'Region A', 'Region B', 'Region C', 'Region D'],
    'Location': ['TORONTO', 'LONDON', 'New york', 'ATLANTA', 'toronto'],
    'Favorite Color': ['   green  ', 'red', '  yellow', 'blue', 'purple  ']
})

print(df)

              Name    Region  Location Favorite Color
0  Tranter, Melvyn  Region A   TORONTO        green  
1   Lana, Courtney  Region A    LONDON            red
2     Abel, Shakti  Region B  New york         yellow
3    Vasu, Imogene  Region C   ATLANTA           blue
4  Aravind, Shelly  Region D   toronto       purple  


In [16]:
# Trimming Whitespace from a Pandas Column
df['Favorite Color'] = df['Favorite Color'].str.strip()
print(df)

              Name    Region  Location Favorite Color
0  Tranter, Melvyn  Region A   TORONTO          green
1   Lana, Courtney  Region A    LONDON            red
2     Abel, Shakti  Region B  New york         yellow
3    Vasu, Imogene  Region C   ATLANTA           blue
4  Aravind, Shelly  Region D   toronto         purple


In [17]:
# Applying .split on a column
print(df['Name'].str.split(','))

0    [Tranter,  Melvyn]
1     [Lana,  Courtney]
2       [Abel,  Shakti]
3      [Vasu,  Imogene]
4    [Aravind,  Shelly]
Name: Name, dtype: object


In [18]:
# Splitting a Column into Two Columns
df[['Last Name', 'First Name']] = df['Name'].str.split(',', expand=True)

print(df)

              Name    Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn  Region A   TORONTO          green   Tranter     Melvyn
1   Lana, Courtney  Region A    LONDON            red      Lana   Courtney
2     Abel, Shakti  Region B  New york         yellow      Abel     Shakti
3    Vasu, Imogene  Region C   ATLANTA           blue      Vasu    Imogene
4  Aravind, Shelly  Region D   toronto         purple   Aravind     Shelly


In [19]:
# Replacing a Substring in Pandas
df['Region'] = df['Region'].str.replace('Region ', '')
print(df)


              Name Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn      A   TORONTO          green   Tranter     Melvyn
1   Lana, Courtney      A    LONDON            red      Lana   Courtney
2     Abel, Shakti      B  New york         yellow      Abel     Shakti
3    Vasu, Imogene      C   ATLANTA           blue      Vasu    Imogene
4  Aravind, Shelly      D   toronto         purple   Aravind     Shelly


In [20]:
# Changing Text to Title Case in Pandas
df['Location'] = df['Location'].str.title()
print(df)

              Name Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn      A   Toronto          green   Tranter     Melvyn
1   Lana, Courtney      A    London            red      Lana   Courtney
2     Abel, Shakti      B  New York         yellow      Abel     Shakti
3    Vasu, Imogene      C   Atlanta           blue      Vasu    Imogene
4  Aravind, Shelly      D   Toronto         purple   Aravind     Shelly


**Exercise 1.3C5**

It’s time to check your learning! Try and solve the exercises below. If you want to verify your solution, simply toggle the box to see a sample solution. Load the sample DataFrame below to answer the questions:



In [21]:
# Loading a DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame.from_dict({
    'Name': ['Tranter; Melvyn', 'Lana; Courtney', 'Abel; Shakti', 'Vasu; Imogene', 'Aravind; Shelly', 'Tranter; Melvyn'],
    'Location': ['TORONTO', 'LONDON', 'New york', np.NaN, 'toronto', 'Madrid'],
    'Sales': [123, 243, 654, np.NaN, 345, np.NaN]
})

**Question 1 -  Solution**

In [22]:
# Q1 -Calculate the percentage of missing records in each column.
# Q1 - Solution - Divide the output of df.isnull().sum() by the length of the dataframe:

print(df.isnull().sum() / len(df))

Name        0.000000
Location    0.166667
Sales       0.333333
dtype: float64



**Question 2 - Solution**



In [23]:
# Q2 - Drop any duplicate records based only on the Name column, keeping the last record.
df = df.drop_duplicates(subset='Name', keep='last')

**Question 3 - Solution**

In [24]:
# Q3 - Create a First Name and a Last Name column. Note that there is a semi-colon between names.
df[['Last Name', 'First Name']] = df['Name'].str.split(';', expand=True)