# Creating a pandas DataFrame

In [None]:
# A plain Python dictionary is enough to hold one record:
# every key is a field name and every value is that person's data
person = dict(
    first="Corey",
    last="Schafer",
    email="CoreyMSchafer@gmail.com",
)

In [None]:
# To describe several people, each key maps to a list of values
# (each list holds a single entry for now)

people = dict(
    first=["Corey"],
    last=["Schafer"],
    email=["CoreyMSchafer@gmail.com"],
)

In [None]:
# Same layout with three people: position i of every list belongs to the same person
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

# Accessing rows and columns



In [None]:
import pandas as pd 

In [None]:
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'],
)

# a dictionary value is looked up by its key
people["email"]

In [None]:
# Build a DataFrame from the dictionary: keys become column names,
# and each list becomes that column's values
df = pd.DataFrame(data=people)

In [None]:
df

In [None]:
# Selecting a single column with a string key returns that column as a Series
df['email']

In [None]:
# Selecting one column does not return a plain list of values;
# it returns a pandas Series,
# which offers far more functionality than a list.
# A Series holds the row values of a single column.

# A DataFrame is a container of multiple Series objects.

type(df['email'])

In [None]:
# Pass a list of column names to select several columns at once.
# The result is no longer a Series: it is another DataFrame.
df[['last', 'email']]

In [None]:
type(df[['last', 'email']])

In [None]:
df.columns

In [None]:
# iloc: integer-location based indexing
# general form: df.iloc[row or list of rows, column or list of columns]
# a single integer selects that one row
df.iloc[0]

In [None]:
df.iloc[[0, 1]]

In [None]:
df.iloc[[0, 1], 2]

In [None]:
# loc takes the same [rows, columns] form as iloc, but it selects by label,
# so columns can be addressed by name instead of by integer position
df.loc[[0, 1]]

In [None]:
# the column is selected by its name rather than by an integer position
df.loc[[0, 1], 'email']

In [None]:
# columns are returned in the order requested ('email' before 'last'),
# not in the order they have in the original DataFrame
df.loc[[0, 1], ['email', 'last']]

In [None]:
# slicing is also possible with loc
# unlike ordinary Python slices, the end label is inclusive
# do not wrap a slice in brackets: brackets mean a list of labels
df.loc[0:1, 'first':'email']

In [None]:
# pandas does not require index values to be unique
# set_index returns a new DataFrame; without inplace=True the original is untouched

# the original DataFrame would keep its default integer index here:
# df.set_index('email')                   # returns a new DataFrame, df itself is unchanged

# with inplace=True df itself is modified: 'email' becomes the index
# and the default integer index is no longer there
df.set_index('email', inplace=True)       # mutates df in place

In [None]:
df

In [None]:
df.index

In [None]:
df.loc['CoreyMSchafer@gmail.com']

In [None]:
df.loc['CoreyMSchafer@gmail.com', 'last']

In [None]:
# df.loc[0]                         # would fail: 0 is no longer a label once 'email' is the index
df.iloc[0]                          # iloc still works because it selects by integer position

# Filtering dataframe 

In [None]:
import pandas as pd 

# Sample data for the filtering examples: one list of values per column
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

df = pd.DataFrame(data=people)

In [None]:
# It is more readable to store the boolean filter (mask) in its own variable first
# and then use that variable to select the matching rows:
# filt = (df['last'] == 'Doe')

# df[filt]

In [None]:
# Preferred form: pass the boolean filter to loc as the row selector,
# which also allows picking the column(s) to return in the same call
filt = (df['last'] == 'Doe')
df.loc[filt, 'email']

In [None]:
# Wrap each condition in its own parentheses for readability;
# & combines the two conditions element-wise (both must hold)
filt = (df['last'] == 'Doe') & (df['first'] == 'John')
df.loc[filt, 'email']

In [None]:
# | is the element-wise OR: keep rows where either condition holds
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt, 'email']

In [None]:
# Negate a filter with the tilde operator (~):
# this selects every row that does NOT match the condition
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[~filt, 'email']

# Update rows and columns 

In [None]:
import pandas as pd 

# Sample data for the update examples: one list of values per column
people = {
    'first': ['Corey', 'Jane', 'John'],
    'last': ['Schafer', 'Doe', 'Doe'],
    'email': ['CoreyMSchafer@email.com', 'JaneDoe@email.com', 'JohnDoe@email.com'],
}

df = pd.DataFrame.from_dict(people)

In [None]:
df

In [None]:
df.columns

In [None]:
df.columns = ['first_name', 'last_name', 'email']

In [None]:
df

#### - Using a list comprehension to update column names

In [None]:
# Rebuild the column labels with a list comprehension, one label at a time.
# The first line upper-cases every name; the second lower-cases them again,
# so the names end up in lower case.
df.columns = [name.upper() for name in df.columns]
df.columns = [name.lower() for name in df.columns]

In [None]:
df.columns

#### - Replace spaces in column names using the `.str` accessor

In [None]:
# The column names here contain underscores, not spaces: the first line swaps
# '_' for ' ' and the second line swaps it back, so the names end up unchanged
df.columns = df.columns.str.replace('_', ' ')      # replace '_' by ' '
df.columns = df.columns.str.replace(' ', '_')      # replace ' ' by '_'

In [None]:
df.columns

#### - Changing a few column names using rename

In [None]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)

In [None]:
df

#### - Update a whole row (rarely needed in practice)

In [None]:
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']

In [None]:
df

#### - Update a few values in a row

In [None]:
# Update only some columns of row 2: list the column labels and supply the
# new values in the same order (last name first, then email)
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']

In [None]:
df.loc[2, 'last'] = 'Smith'

In [None]:
df.at[2, 'last'] = 'Doe'        # Same as loc 

In [None]:
df

#### - Do not ignore this warning; write the assignment with `.loc` instead

In [None]:
# Chained indexing, shown only as the pattern to avoid: df[filt] produces an
# intermediate object and the assignment is applied to that object, so df
# itself may be left unchanged and pandas may emit a warning.
filt = (df['email'] == 'JohnDoe@email.com')
df[filt]['last'] = 'Smith'                    # does not reliably update df; may show a warning

In [None]:
# Select the rows and the column in a single loc call, then assign
filt = (df['email'] == 'JohnDoe@email.com')

df.loc[filt, 'last'] = 'Smith'                 # USE THIS METHOD to update data

In [None]:
df

#### - Update every value in a column

In [None]:
df['email'].str.lower()  

In [None]:
df['email'] = df['email'].str.lower()  

In [None]:
df

#### - Apply, Map, Applymap, Replace
* Apply:      &nbsp; &nbsp; &nbsp; &nbsp;works on both Series and DataFrame objects
* Applymap:   works on DataFrames only (applied to every element)
* Map:        &emsp;&emsp;&nbsp;&nbsp;works on Series only
* Replace:    &ensp;&nbsp;changes only the specified values of a Series

In [None]:
# apply works on a DataFrame or on a Series.
# Here it is used on a Series:
# len is called on each value of the column

df['email'].apply(len)

In [None]:
# apply also accepts a custom function, which can be as advanced as needed.
# A deliberately simple example:

def update_email(email):
    """Return ``email`` converted to upper case."""
    upper_cased = email.upper()
    return upper_cased

df['email'].apply(update_email)     # returns the transformed Series; df itself is not updated

In [None]:
df['email'] = df['email'].apply(update_email) 

In [None]:
df

In [None]:
# a small one-off function can be written inline as a lambda;
# assigning the result back stores the lower-cased emails in df
df['email'] = df['email'].apply(lambda x: x.lower()) 


In [None]:
df

In [None]:
# apply on a DataFrame passes each column (a Series) to the function,
# so len returns the number of values in each column
df.apply(len)

In [None]:
# axis='columns' hands each row to the function instead of each column
df.apply(len, axis='columns')      # returns the number of values in each row

In [None]:
df.apply(pd.Series.min)             # returns the minimum value of each column (each column is a Series)

In [None]:
df.apply(lambda x: x.min())         # x receives one column at a time as a Series object

#### - Applymap : it only works for dataframe 

In [None]:
# NOTE(review): newer pandas releases reportedly deprecate applymap in favour of
# DataFrame.map — confirm against the installed pandas version
df.applymap(len)                    # len is applied to every individual value of df

In [None]:
df.applymap(str.lower)              # raises an error if df holds any value that is not a string
# df = df.applymap(str.lower)         # applymap returns a new DataFrame; assign it back to df to keep the change

In [None]:
df

#### - Map: it works on series only

In [None]:
# map substitutes only the values that appear as keys in the dictionary;
# every other value becomes NaN, which is usually not desired.
# Use the replace method instead when only specific values should change.
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})
# df['first'] = df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})  # assign back to update the 'first' column

#### - Replace: changes only the specified values of a Series

In [None]:
# replace changes only the values listed in the dictionary.
# All other values are left unchanged.
df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  
# df['first'] = df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})  # assign back to update the 'first' column

# Add/Remove Rows/Columns

In [198]:
import pandas as pd 

# Sample data for the add/remove examples: one list of values per column
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['CoreyMSchafer@email.com', 'JaneDoe@email.com', 'JohnDoe@email.com'],
)

df = pd.DataFrame(data=people)

In [199]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@email.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [200]:
# Concatenate two string columns element-wise with a space in between;
# the result is a new Series and the original df is unchanged
df['first'] + ' ' + df['last']           

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [201]:
# Create a new column by assigning values to a new key.
# Bracket notation is required here:
# attribute-style assignment (df.full_name = ...) does not create a column

df['full_name'] = df['first'] + ' ' + df['last']           

In [202]:
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@email.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@email.com,John Doe


In [203]:
df.drop(columns=['first', 'last'], inplace=True)

In [204]:
df

Unnamed: 0,email,full_name
0,CoreyMSchafer@email.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


In [205]:
# split each full name on the space; every row becomes a list of its parts

df['full_name'].str.split(' ')

0    [Corey, Schafer]
1         [Jane, Doe]
2         [John, Doe]
Name: full_name, dtype: object

In [206]:
# expand=True spreads the split parts across separate columns instead of one list per row
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [207]:
# Assign the two expanded columns to 'first' and 'last' in one statement
# (this re-creates the columns that were dropped earlier)
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

In [208]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [209]:
# Adding a row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# row is wrapped in a one-row DataFrame and joined with pd.concat instead.
# ignore_index=True discards the existing labels and renumbers the rows 0..n-1;
# columns missing from the new row (everything except 'first') become NaN.

pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,,,Tony,


In [211]:
# A second, smaller data set with the same three columns
people = dict(
    first=['Tony', 'Steve'],
    last=['Stark', 'Rogers'],
    email=['IronMan@avenge.com', 'Cap@email.com'],
)

df2 = pd.DataFrame(data=people)

In [212]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@email.com


In [215]:
# pd.concat stacks df2 underneath df (DataFrame.append was removed in pandas 2.0);
# columns are aligned by name, so df2's missing 'full_name' values become NaN
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [216]:
# pd.concat has no inplace option: it always returns a new DataFrame,
# so the result is assigned back to df to keep the added rows.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
df = pd.concat([df, df2], ignore_index=True)

In [217]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [218]:
df.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark


In [219]:
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers


In [225]:
# Drop rows by condition: build a boolean filter, look up the index labels
# of the matching rows, and hand those labels to drop

# df[df['last'] == 'Doe']
# df[df['last'] == 'Doe'].index

filt = df['last'].eq('Doe')
df.drop(index=df.loc[filt].index)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@email.com,Corey Schafer,Corey,Schafer
3,IronMan@avenge.com,,Tony,Stark
4,Cap@email.com,,Steve,Rogers
