# Creating pandas dataframe

In [247]:
# A plain Python dictionary can describe one record:
# each key names a field and each value holds that field's data.
# `person` stores the information of a single person only.
person = dict(
    first="Corey",
    last="Schafer",
    email="CoreyMSchafer@gmail.com",
)

In [248]:
# `people` can describe several persons:
# every key maps to a list with one entry per person

people = dict(
    first=["Corey"],
    last=["Schafer"],
    email=["CoreyMSchafer@gmail.com"],
)

In [249]:
# the same structure, now filled with three persons
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

# Accessing rows and columns



In [250]:
import pandas as pd 

In [251]:
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

# look up one field's list of values by its dictionary key
people["email"]

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [252]:
### Create a dataframe from the dictionary: keys become column names, lists become column values
df = pd.DataFrame(people)

In [253]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [254]:
# selecting a single column by name displays it as a Series
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [255]:
# selecting a single column does not return a plain list
# it returns a pandas Series
# which has a lot more functionality than a list
# a Series holds the rows of a single column

# a dataframe is a container of multiple Series objects

type(df['email'])

pandas.core.series.Series

In [256]:
# using list of column to get multiple column
# this time it will not return a series rather it will return another dataframe
df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [257]:
type(df[['last', 'email']])

pandas.core.frame.DataFrame

In [258]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [259]:
# iloc : integer location, i.e. selection by position
# general form: df.iloc[row or list of rows, column or list of columns]
# a single row position returns that row as a Series
df.iloc[0]

first                      Corey
last                     Schafer
email    CoreyMSchafer@gmail.com
Name: 0, dtype: object

In [260]:
df.iloc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [261]:
df.iloc[[0, 1], 2]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [262]:
# loc is used like iloc, but it selects by label
# so columns can be given by name instead of by integer position
df.loc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [263]:
# using column name as index
df.loc[[0, 1], 'email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [264]:
# columns are returned in the order listed here, not in the original dataframe's order
df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [265]:
# label-based slicing is also possible
# with loc the last label of the slice is inclusive
# do not wrap a slice in brackets: brackets mean an explicit list of labels
df.loc[0:1, 'first':'email']

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [266]:
# pandas does not enforce the index to be unique
# set_index without 'inplace=True' leaves the original dataframe unchanged

# in that case the default integer index stays on the original dataframe
# df.set_index('email')                   # this will not change original dataframe

# with inplace=True the default integer index is replaced by the email column
df.set_index('email', inplace=True)       # this will change original dataframe

In [267]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [268]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [269]:
df.loc['CoreyMSchafer@gmail.com']

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [270]:
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [271]:
# df.loc[0]                         # no longer works: the index labels are now emails, not integers
df.iloc[0]                          # iloc still works because it selects by position

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

# Filtering dataframe 

In [272]:
import pandas as pd 

# sample data: one list per column, one entry per person
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

df = pd.DataFrame(data=people)

In [273]:
# It is more readable to store the boolean filter (mask) in its own variable first
# and then use that filter to get the filtered df
# filt = (df['last'] == 'Doe')

# df[filt]

In [274]:
# preferred form: pass the filter to loc for the rows and name the column to return
filt = (df['last'] == 'Doe')
df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [275]:
# Wrap each comparison in parentheses: & binds more tightly than ==,
# so the parentheses are required here, not just a readability aid
filt = (df['last'] == 'Doe') & (df['first'] == 'John')
df.loc[filt, 'email']

2    JohnDoe@email.com
Name: email, dtype: object

In [276]:
# | combines the two conditions element-wise: keep rows where either one is true
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt, 'email']

0    CoreyMSchafer@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [277]:
### Negate the filter with a tilde (~) to select the rows that do NOT match
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[~filt, 'email']

1    JaneDoe@email.com
Name: email, dtype: object

# Update rows and columns 

In [278]:
import pandas as pd 

# fresh copy of the sample data for this section
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

df = pd.DataFrame(data=people)

In [279]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [280]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [281]:
df.columns = ['first_name', 'last_name', 'email']

In [282]:
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Using a list comprehension to update column names

In [283]:
df.columns = [x.upper() for x in df.columns]      # convert every column name to upper case
df.columns = [x.lower() for x in df.columns]      # then back to lower case, which is what we keep

In [284]:
df.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

#### - Replace spaces/underscores in column names using the string accessor

In [285]:
# The two replacements undo each other, so the column names end up unchanged
df.columns = df.columns.str.replace('_', ' ')      # replace '_' with ' '
df.columns = df.columns.str.replace(' ', '_')      # replace ' ' with '_'

In [286]:
df.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

#### - Changing a few column names using rename

In [287]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)

In [288]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Update a whole row, which is rarely needed

In [289]:
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']

In [290]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnSmith@email.com


#### - Update a few values in a row

In [291]:
# Update only the listed columns of row 2; values are given in the same order as the column list
# (the last name must be 'Doe' here, not the first name 'John')
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']

In [292]:
df.loc[2, 'last'] = 'Smith'

In [293]:
df.at[2, 'last'] = 'Doe'        # at sets a single cell by row and column label; same result as loc here

In [294]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Do not ignore this warning; write the code differently

In [295]:
# Chained indexing (df[filt][...]) assigns to what may be a copy, not to df itself
filt = (df['email'] == 'JohnDoe@email.com')
df[filt]['last'] = 'Smith'                    # may not update df and triggers the warning shown below

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [296]:
filt = (df['email'] == 'JohnDoe@email.com')

df.loc[filt, 'last'] = 'Smith'                 # USE THIS METHOD to update data

In [297]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


#### - Update every row value of a column

In [298]:
df['email'].str.lower()  

0    coreymschafer@email.com
1          janedoe@email.com
2          johndoe@email.com
Name: email, dtype: object

In [299]:
df['email'] = df['email'].str.lower()  

In [300]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@email.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


#### - Apply, Map, Applymap, Replace
* Apply: works on both Series objects and DataFrames
* Applymap: works on DataFrames only
* Map: works on Series objects only
* Replace: changes only the specified values of a Series

In [301]:
# apply works on a dataframe or on a Series object
# here it is used on a Series:
# len is called on each value of the column

df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [302]:
# apply also accepts a more advanced custom function
# Here is an example with a simple function 

def update_email(email):
    """Return ``email`` converted to upper case."""
    shouted = email.upper()
    return shouted

df['email'].apply(update_email)     # it will return changes but will not update in df

0    COREYMSCHAFER@EMAIL.COM
1          JANEDOE@EMAIL.COM
2          JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [303]:
df['email'] = df['email'].apply(update_email) 

In [304]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,COREYMSCHAFER@EMAIL.COM
1,Jane,Doe,JANEDOE@EMAIL.COM
2,John,Smith,JOHNDOE@EMAIL.COM


In [305]:
# small function can be replaced by lambda function 
df['email'] = df['email'].apply(lambda x: x.lower()) 


In [306]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@email.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [307]:
# Apply applied in df, it returns number of values in each column
df.apply(len)

first    3
last     3
email    3
dtype: int64

In [308]:
df.apply(len, axis='columns')      # it returns number of values in each row

0    3
1    3
2    3
dtype: int64

In [309]:
df.apply(pd.Series.min)             # it returns minimum value of each series

first                      Corey
last                         Doe
email    coreymschafer@email.com
dtype: object

In [310]:
df.apply(lambda x: x.min())         # x is getting series object

first                      Corey
last                         Doe
email    coreymschafer@email.com
dtype: object

#### - Applymap : it only works for dataframe 

In [311]:
# NOTE: pandas 2.1 deprecated DataFrame.applymap in favour of DataFrame.map
df.applymap(len)                    # len is applied to every individual value of the df

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [312]:
df.applymap(str.lower)              # raises an error if the df contains values that are not strings
# df = df.applymap(str.lower)         # the original df stays unchanged unless the result is assigned back to df

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@email.com
1,jane,doe,janedoe@email.com
2,john,smith,johndoe@email.com


In [313]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@email.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


#### - Map: it works on series only

In [314]:
# map substitutes only the values that are listed in the dictionary
# every value that is not listed becomes NaN, which is usually not desired
# use replace instead when only specific values should change
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})
# df['first'] = df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})  # to assign the result to the first column

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

#### - Replace: It works on specific values on series

In [315]:
# replace changes only the values that are listed in the dictionary
# all other values are left unchanged
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  
# df['first'] = df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  # to assign the result to the first column

0    Chris
1     Mary
2     John
Name: first, dtype: object

# Add/Remove Rows/Columns

In [316]:
import pandas as pd 

# reset the sample data before adding and removing rows/columns
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

df = pd.DataFrame(data=people)

In [317]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [318]:
# Adding two column values, but original df unchanged
df['first'] + ' ' + df['last']           

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [319]:
# Adding new column
# Assigning values in it
# df.full_name will not work for assignment

df['full_name'] = df['first'] + ' ' + df['last']           

In [320]:
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@email.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@email.com,John Doe


In [321]:
df.drop(columns=['first', 'last'], inplace=True)

In [322]:
df

Unnamed: 0,email,full_name
0,CoreyMSchafer@email.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


In [323]:
# divide full name by space 

df['full_name'].str.split(' ')

0    [Corey, Schafer]
1         [Jane, Doe]
2         [John, Doe]
Name: full_name, dtype: object

In [324]:
# splitted string divided into columns
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [325]:
# Assigning values into two different columns
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

In [326]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [327]:
# Adding a row
# DataFrame.append was removed in pandas 2.0, so wrap the new row in a one-row dataframe and concatenate
# ignore_index=True renumbers the rows 0..n-1; columns missing from the new row become NaN

pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,,,Tony,


In [328]:
# a second, smaller dataframe whose rows will be added to df
people = dict(
    first=["Tony", "Steve"],
    last=["Stark", "Rogers"],
    email=["IronMan@avenge.com", "Cap@email.com"],
)

df2 = pd.DataFrame(data=people)

In [329]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@email.com


In [330]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of both dataframes
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [331]:
# pd.concat has no inplace option (DataFrame.append, removed in pandas 2.0, had none either)
# so the combined result has to be assigned back to df
df = pd.concat([df, df2], ignore_index=True)

In [332]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [333]:
df.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark


In [334]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [335]:
# first build a filter for the rows to remove
# then pass the index labels of the matching rows to drop

# df[df['last'] == 'Doe']
# df[df['last'] == 'Doe'].index

filt = (df['last'] == 'Doe')
df.drop(index = df[filt].index)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


# Sorting data

In [336]:
import pandas as pd 

# sample data for the sorting examples
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

df = pd.DataFrame(data=people)

df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [337]:
# Sort by last name in descending order
# Several rows can share the same value; to order those ties, sort by multiple columns

df.sort_values(by='last', ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [338]:
# Sort by last name first, then by first name within equal last names (both descending)

df.sort_values(by=['last', 'first'], ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
2,John,Doe,JohnDoe@email.com
1,Jane,Doe,JaneDoe@email.com


In [339]:
# same sample data plus a fourth person who shares the last name 'Doe'
people = dict(
    first=["Corey", "Jane", "John", "Adam"],
    last=["Schafer", "Doe", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com", "A@email.com"],
)

df = pd.DataFrame(data=people)

df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@email.com


In [340]:
# Sort by last name in descending order then sort by first name in ascending order

df.sort_values(by=['last', 'first'], ascending=[False, True], inplace=True)

In [341]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
3,Adam,Doe,A@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [342]:
# sorted by index 
df.sort_index(inplace=True)

In [343]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,A@email.com


In [344]:
# sort the values of one specific column

df['last'].sort_values()

1        Doe
2        Doe
3        Doe
0    Schafer
Name: last, dtype: object

# Cleaning data

In [345]:
import pandas as pd
import numpy as np

# sample data with real missing values (np.nan / None) and custom markers ('NA', 'Missing')
people = dict(
    first=["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    last=["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com", None, np.nan, "Anonymous@email.com", "NA"],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

df = pd.DataFrame(data=people)

df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [346]:
# Replace custom missing-value markers with np.nan
# otherwise pandas does not treat them as missing

df.replace(['NA', 'Missing'], np.nan, inplace=True)     # passing a list of markers
# df.replace('NA', np.nan, inplace=True)                # replacing them one at a time
# df.replace('Missing', np.nan, inplace=True)

# CSV
# When loading data from a CSV file, this replacement can be done while loading
# (e.g. with the na_values argument of pd.read_csv)

df

In [347]:
# defaults: axis='index', how='any'
# i.e. drop every row that contains at least one missing value
# custom markers such as 'NA' or 'Missing' do not count as missing unless replaced with np.nan first
df.dropna()                     

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [348]:
# how='all' drops a row only if all of its values are missing
# axis='columns' would apply the same rule to columns instead

df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [349]:
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [350]:
# drop a row only when its value in the email column is missing

df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [351]:
# drop a row only when both the last name and the email are missing
# pass inplace=True to change the original dataframe

df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [352]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [353]:
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [354]:
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [355]:
# when it says object then it means it is string or mixed data type
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [356]:
# NaN is a float under the hood
# so converting a column that contains NaN to an integer type raises an error
type(np.nan) 

float

In [357]:
# for calculating a mean, replacing missing values with zero via fillna() is not a good idea
# it is better to keep them as missing values
# so convert the column to float, which lets NaN stay as it is

df['age'] = df['age'].astype('float')

# to convert the whole df at once if necessary
# df.astype('float')

In [358]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [359]:
df['age'].mean()

46.75