In [1]:
peson = {
    "first": "Corey", 
    "last": "Schafer", 
    "email": "CoreyMSchafer@gmail.com"
}

In [2]:
people = {
    "first": ["Corey"], 
    "last": ['Schafer'], 
    "email": ["CoreyMSchafer@gmail.com"]
}

In [3]:
people = {
    "first": ["Corey", "Jane", "John"], 
    "last": ["Schafer", "Doe", "Doe"], 
    "email": ["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com"]
}

In [4]:
people['email']

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [5]:
import pandas as pd 

In [6]:
df = pd.DataFrame(people)

In [7]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


#### Ways to refer to a column in a dataframe

In [8]:
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [9]:
type(df['email'])

pandas.core.series.Series

In [10]:
df.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [11]:
df.count()

first    3
last     3
email    3
dtype: int64

In [12]:
df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [13]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

---

#### loc & iloc for Accessing Rows 

In [14]:
# For accessing rows
# loc 
df.iloc[0] # Access by Integer location 

first                      Corey
last                     Schafer
email    CoreyMSchafer@gmail.com
Name: 0, dtype: object

In [15]:
df.iloc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [16]:
# For access select rows & Columns
df.iloc[[0, 1], 2]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [17]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


---

In [18]:
df.loc[0]

first                      Corey
last                     Schafer
email    CoreyMSchafer@gmail.com
Name: 0, dtype: object

Using loc to print all the entries with column names in the first row

In [19]:
df.loc[[0, 1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com


Selecting & Printing multiple rows using list of row numbers

---

#### Using Labels

In [20]:
df.loc[[0, 1], 'email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

Selecting email entries in 1st & 2nd row

In [21]:
df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


Accessing Email & last column in the first and second row

In [22]:
df.set_index('email', inplace=True)

In [23]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [24]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [25]:
df.loc['CoreyMSchafer@gmail.com', 'last']


'Schafer'

In [26]:
df.loc['CoreyMSchafer@gmail.com']

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

_The above code fails as the index has been set to email column_

In [27]:
df.iloc[1]

first    Jane
last      Doe
Name: JaneDoe@email.com, dtype: object

In [28]:
df.reset_index(inplace=True)
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


---

#### Filtering - Using Conditionals to Filter

In [29]:
filt = (df['last'] == 'Doe') & (df['first'] == 'John')

In [30]:
df[filt]

Unnamed: 0,email,first,last
2,JohnDoe@email.com,John,Doe


In [31]:
df.loc[filt, 'email']

2    JohnDoe@email.com
Name: email, dtype: object

In [32]:
or_filter = (df['last'] == 'Schafer') | (df['first'] == 'John')
print("Filter\n", df[or_filter])
print("Inverting Filter\n", df[~or_filter])

Filter
                      email  first     last
0  CoreyMSchafer@gmail.com  Corey  Schafer
2        JohnDoe@email.com   John      Doe
Inverting Filter
                email first last
1  JaneDoe@email.com  Jane  Doe


#### Updating Rows & Columns

In [33]:
df.columns

Index(['email', 'first', 'last'], dtype='object')

In [34]:
df.columns = ['first_name', 'last_name', 'email']

In [35]:
df.columns = [x.lower() for x in df.columns]

In [36]:
df.columns = df.columns.str.replace(' ', '_')

In [37]:
df

Unnamed: 0,first_name,last_name,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [38]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'}, inplace=True)

In [39]:
df.loc[2] = ['John', 'Smith', 'JohnSmit@Email.com']

In [40]:
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@Email.com']

In [41]:
df = pd.DataFrame(people)
df.loc[2, 'last'] = 'Smith'

In [42]:
df.at[2, 'last'] = 'Doe'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [43]:
filter_ = (df['email'] == 'JohnDoe@email.com')
df[filter_]['email']

2    JohnDoe@email.com
Name: email, dtype: object

Following doesn't work.
df[filter_]['last'] = 'Smith'

In [44]:
df.loc[filter_, 'last'] = 'Smith'

##### Assigns lower case directly over a set of rows in the column

In [45]:
df['email'] = df['email'].str.lower()

In [46]:
def to_upper(obj: str): 
    return obj.upper()

def to_lower(obj: str): 
    return obj.lower()

In [47]:
df['email'] = df['email'].apply(to_upper)

In [48]:
df['email'] = df['email'].apply(to_lower)

In [49]:
df['email'] = df['email'].apply(lambda x: x.lower())

In [50]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [51]:
df.apply(len)

first    3
last     3
email    3
dtype: int64

In [52]:
len(df['email'])

3

In [53]:
df.apply(pd.Series.min)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Smith,johndoe@email.com


In [54]:
df.apply(lambda x: x.min())

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [55]:
df.applymap(len)

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,5,17


In [56]:
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@email.com
2,john,smith,johndoe@email.com


In [57]:
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [58]:
df['first'] = df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

In [59]:
df

Unnamed: 0,first,last,email
0,Chris,Schafer,coreymschafer@gmail.com
1,Mary,Doe,janedoe@email.com
2,,Smith,johndoe@email.com


---

### Adding Or Removing Columns from Dataframe

In [60]:
df = pd.DataFrame(people)

In [61]:
df ['first'] + ' ' + df['last']

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [62]:
df['Full Name'] = df ['first'] + ' ' + df['last']

*_Without using Inplace_*

In [63]:
df.drop(columns=['first', 'last'])

Unnamed: 0,email,Full Name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


_Using Inplace in Drop_

In [64]:
df.drop(columns=['first', 'last'], inplace=True)

In [65]:
df

Unnamed: 0,email,Full Name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@email.com,John Doe


_Splitting the Column to different columns_

In [66]:
df['Full Name'].str.split(' ')

0    [Corey, Schafer]
1         [Jane, Doe]
2         [John, Doe]
Name: Full Name, dtype: object

In [67]:
df['Full Name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [68]:
df[['first', 'last']] = df['Full Name'].str.split(' ', expand=True)

In [69]:
df

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [70]:
df.append({'first': 'Tony'}, ignore_index=True)

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,,,Tony,


In [71]:
people = {
    "first": ["Tony", "Steve"], 
    "last": ["Start", "Rogers"], 
    "email": ["TS@gmail.com", "SR@email.com"]
}

In [72]:
df2 = pd.DataFrame(people)

In [73]:
df2

Unnamed: 0,first,last,email
0,Tony,Start,TS@gmail.com
1,Steve,Rogers,SR@email.com


In [74]:
df.append(df2, ignore_index=True)

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,TS@gmail.com,,Tony,Start
4,SR@email.com,,Steve,Rogers


___

#### Sorting data In Pandas

In [75]:
df.sort_values(by='last')

Unnamed: 0,email,Full Name,first,last
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer


In [76]:
df.sort_values(by='last', ascending=False)

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [77]:
df.sort_values(by=['last', 'first'], ascending=False)

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
2,JohnDoe@email.com,John Doe,John,Doe
1,JaneDoe@email.com,Jane Doe,Jane,Doe


In [78]:
df.sort_values(by=['last', 'first'], ascending=[False, True])

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [79]:
df.sort_values(by=['last', 'first'], ascending=[False, True], inplace=True)

In [80]:
df

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [81]:
df.sort_index()

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe


In [82]:
df=df.append({'email':'A@email.com', 'first': "Ash", 'last': 'Ketchup', 'Full Name': 'Ash Ketchup'}, ignore_index=True)

In [83]:
df

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,A@email.com,Ash Ketchup,Ash,Ketchup


In [84]:
df.sort_index()

Unnamed: 0,email,Full Name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@email.com,Jane Doe,Jane,Doe
2,JohnDoe@email.com,John Doe,John,Doe
3,A@email.com,Ash Ketchup,Ash,Ketchup


In [85]:
df['last'].sort_values()

1        Doe
2        Doe
3    Ketchup
0    Schafer
Name: last, dtype: object

---

##### Cleaning Data - Casting Data Types & Handling Values

In [86]:
import numpy as np 
import pandas as pd

In [87]:
people = {
    "first": ["Corey", "Jane", "John", "Chris", np.nan, None, 'NA'], 
    "last": ["Schafer", "Doe", "Doe","Schafer", np.nan, np.nan, 'Missing'], 
    "email": ["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com", None, np.nan, "A@email.com", "NA"],
    "Age": ['33', '55', '63', '36', None, None, 'Missing']
}

In [100]:
df = pd.DataFrame(people)

df.replace('NA', np.nan, inplace=True)
df.replace('Missing', np.nan, inplace=True)

In [101]:
df

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,A@email.com,
6,,,,


In [102]:
df.dropna()

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [103]:
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,A@email.com,


In [104]:
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,A@email.com,
6,,,,


In [105]:
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [106]:
df.dropna(axis='index', how='all', subset=['email'])

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,A@email.com,


In [108]:
df.dropna(axis='index', how='all', subset=['last','email'])

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,A@email.com,


In [109]:
df.isna()

Unnamed: 0,first,last,email,Age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [110]:
df.fillna('MISSING')

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,A@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [111]:
df.fillna(0)

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,A@email.com,0
6,0,0,0,0


##### Casting Data Types

In [112]:
df.dtypes

first    object
last     object
email    object
Age      object
dtype: object

In [113]:
df['age'].mean()

KeyError: 'age'

In [119]:
df.fillna(0)

Unnamed: 0,first,last,email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,A@email.com,0
6,0,0,0,0


In [123]:
df.fillna(0, inplace=True)

In [129]:
df['Age'] = df['Age'].astype(float)

In [130]:
df.dtypes

first     object
last      object
email     object
Age      float64
dtype: object

In [131]:
df['Age'].mean()

26.714285714285715