## Creating pandas dataframe

In [95]:
# In plain Python a record can be stored in a dictionary.
# `person` holds the details of exactly one person: each key maps to a single value.
person = dict(
    first="Corey",
    last="Schafer",
    email="CoreyMSchafer@gmail.com",
)

In [96]:
# `people` can describe several persons at once: every key maps to a list
# with one entry per person (only a single person so far).
people = dict(
    first=["Corey"],
    last=["Schafer"],
    email=["CoreyMSchafer@gmail.com"],
)

In [97]:
# Details for several people: position i of every list belongs to person i.
# Corey's address uses the gmail.com domain, matching the copies of this
# sample data used in the later cells.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": ["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com"],
}

## Accessing rows and columns



In [98]:
import pandas as pd 

In [99]:
# Column-oriented record set: one list per field, one list position per person.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": [
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
    ],
}

# A plain dict lookup by key returns the whole list stored under that key.
people["email"]

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [100]:
### Created dataframe from dictionary
df = pd.DataFrame(people)

In [101]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [102]:
# basically printing series 
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [103]:
# Selecting one column does not return a plain Python list;
# it returns a pandas Series, which offers far more functionality than a list.
# A Series holds the values of a single column, one per row.

# A DataFrame can be thought of as a container of several Series (one per column).

type(df['email'])

pandas.core.series.Series

In [104]:
# using list of column to get multiple column
# this time it will not return a series rather it will return another dataframe
df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [105]:
type(df[['last', 'email']])

pandas.core.frame.DataFrame

In [106]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [107]:
# iloc = "integer location": rows and columns are selected by position.
# General form: df.iloc[row or list of rows, column or list of columns]
# A single row position returns that row as a Series.
df.iloc[0]

first                      Corey
last                     Schafer
email    CoreyMSchafer@gmail.com
Name: 0, dtype: object

In [108]:
df.iloc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [109]:
df.iloc[[0, 1], 2]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [110]:
# loc is used like iloc, but it selects by label rather than by position,
# so columns can be addressed by name.
# Here the row labels 0 and 1 are the default integer index.
df.loc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [111]:
# using column name as index
df.loc[[0, 1], 'email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [112]:
# The result's columns follow the order of the requested list ('email', then 'last'),
# not the column order of the original dataframe.
df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [113]:
# Slicing is also possible with loc.
# Unlike ordinary Python slices, the end label is included (row 1 and column 'email' are returned).
# Do not wrap a slice in brackets: brackets mean an explicit list of labels.
df.loc[0:1, 'first':'email']

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [114]:
# pandas does not require index values to be unique.
# Calling set_index without inplace=True returns a new dataframe and leaves the original untouched.

# The original dataframe would keep its default integer index:
# df.set_index('email')                   # returns a new dataframe; df itself is unchanged

# With inplace=True the default integer index is replaced by the 'email' column.
# NOTE(review): this mutates df, so re-running this cell fails because 'email' is no longer a column.
df.set_index('email', inplace=True)       # modifies df itself

In [115]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [116]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [117]:
df.loc['CoreyMSchafer@gmail.com']

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [118]:
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [119]:
# df.loc[0]                         # fails now: the index labels are e-mail addresses, and 0 is not one of them
df.iloc[0]                          # iloc still works because it selects by position, not by label

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

## Filtering dataframe 

In [176]:
import pandas as pd

# Re-create the sample data so this section starts from a clean dataframe
# with the default integer index.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": [
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
    ],
}

df = pd.DataFrame(data=people)

In [121]:
# It is more readable to store the filter condition in its own variable first
# and then use that variable to select the matching rows.
# The comparison yields one True/False value per row (a boolean mask), not index positions:
# filt = (df['last'] == 'Doe')

# Passing the mask in brackets keeps only the rows where it is True:
# df[filt]

In [122]:
# use this
filt = (df['last'] == 'Doe')
df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [123]:
# Wrap each comparison in parentheses: it reads better, and it is required because
# & binds more tightly than == in Python.
# & combines the two conditions element-wise (logical AND).
filt = (df['last'] == 'Doe') & (df['first'] == 'John')
df.loc[filt, 'email']

2    JohnDoe@email.com
Name: email, dtype: object

In [124]:
# | combines the two conditions element-wise (logical OR):
# keep the rows whose last name is 'Schafer' or whose first name is 'John'.
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt, 'email']

0    CoreyMSchafer@gmail.com
2          JohnDoe@email.com
Name: email, dtype: object

In [125]:
### Negated filter with tilde (~)
# ~ inverts the mask, so the rows that do NOT match the condition are selected.
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[~filt, 'email']

1    JaneDoe@email.com
Name: email, dtype: object

## Update rows and columns 

In [126]:
import pandas as pd

# Fresh copy of the sample data, so the updates in this section
# start from the original values and the default integer index.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": [
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
    ],
}

df = pd.DataFrame(data=people)

In [127]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [128]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [129]:
df.columns = ['first_name', 'last_name', 'email']

In [130]:
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Uses of list comprehension to update columns 

In [131]:
df.columns = [x.upper() for x in df.columns]      # convert column's name in upper case
df.columns = [x.lower() for x in df.columns]      # I want to have the columns name in lower case

In [132]:
df.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

#### - Replace characters in column names using the `.str` accessor

In [133]:
# Column names are rewritten through the .str accessor of df.columns.
# The first line turns '_' into ' ' and the second turns ' ' back into '_';
# no column name contained a space to begin with, so the names end up unchanged.
df.columns = df.columns.str.replace('_', ' ')      # replace '_' with ' '
df.columns = df.columns.str.replace(' ', '_')      # replace ' ' with '_'

In [134]:
df.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

#### - Changing a few column names using rename

In [135]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)

In [136]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Update a whole row (rarely needed in practice)

In [137]:
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']

In [138]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnSmith@email.com


#### - Update a few values in a row

In [146]:
# Update only some columns of row 2: pass the column labels as a list and
# the new values in the same order (last name first, then e-mail).
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']

In [147]:
df.loc[2, 'last'] = 'Smith'

In [148]:
df.at[2, 'last'] = 'Doe'        # .at sets one single value by row/column label, like the .loc call in the previous cell

In [149]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### - Do not ignore this warning; write the code differently

In [150]:
# Anti-pattern, shown on purpose: chained indexing.
# df[filt] is evaluated first and the assignment is then applied to that intermediate result,
# so the value in df itself is not reliably updated and pandas emits the warning shown below.
filt = (df['email'] == 'JohnDoe@email.com')
df[filt]['last'] = 'Smith'                    # does not reliably update df; triggers SettingWithCopyWarning

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [151]:
filt = (df['email'] == 'JohnDoe@email.com')

df.loc[filt, 'last'] = 'Smith'                 # USE THIS METHOD to update data

In [152]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


#### - Update every value of a column

In [154]:
df['email'].str.lower()  

0    coreymschafer@gmail.com
1          janedoe@email.com
2          johndoe@email.com
Name: email, dtype: object

In [155]:
df['email'] = df['email'].str.lower()  

In [156]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


#### - Apply, Map, Applymap, Replace
* **Apply**: works on both Series objects and DataFrames
* **Applymap**: works on DataFrames only
* **Map**: works on Series objects only
* **Replace**: changes only the specified values of a Series

In [158]:
# Apply can work on dataframe or in series object
# Apply in series object 
# len will be applied in each value

df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [159]:
# apply also accepts a custom function, which may be as elaborate as needed.
# A deliberately simple example:

def update_email(email):
    """Return `email` converted to upper case."""
    shouted = email.upper()
    return shouted

df['email'].apply(update_email)     # it will return changes but will not update in df

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@EMAIL.COM
2          JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [160]:
df['email'] = df['email'].apply(update_email) 

In [161]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,COREYMSCHAFER@GMAIL.COM
1,Jane,Doe,JANEDOE@EMAIL.COM
2,John,Smith,JOHNDOE@EMAIL.COM


In [162]:
# small function can be replaced by lambda function 
df['email'] = df['email'].apply(lambda x: x.lower()) 


In [163]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [164]:
# Apply applied in df, it returns number of values in each column
df.apply(len)

first    3
last     3
email    3
dtype: int64

In [165]:
df.apply(len, axis='columns')      # it returns number of values in each row

0    3
1    3
2    3
dtype: int64

In [166]:
df.apply(pd.Series.min)             # it returns minimum value of each series

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [167]:
df.apply(lambda x: x.min())         # x is getting series object

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

#### - Applymap : it only works for dataframe 

In [169]:
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of DataFrame.map — confirm against the installed version.
df.applymap(len)                    # len is applied to every individual value of the dataframe

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [174]:
# NOTE(review): the dataframe displayed by the next cell shows lower-cased names, which this cell
# as written does not produce — it looks like the commented assignment was the line actually run.
df.applymap(str.lower)              # raises an error if the dataframe holds any non-string value
# df = df.applymap(str.lower)         # the original df only changes if the result is assigned back to it

In [175]:
df

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@email.com
2,john,smith,johndoe@email.com


#### - Map: it works on series only

In [177]:
# map substitutes only the values that appear as keys in the dictionary.
# Every other value becomes NaN (see row 2, 'John', in the output), which is usually not wanted.
# To change specific values and keep the rest, use the replace method instead.
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})
# df['first'] = df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})  # assign back to update the 'first' column

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

#### - Replace: It works on specific values on series

In [179]:
# replace changes only the values listed in the dictionary.
# All other values are left as they are ('John' stays 'John' in the output).
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  
# df['first'] = df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  # assign back to update the 'first' column

0    Chris
1     Mary
2     John
Name: first, dtype: object