In [75]:
import numpy as np
import pandas as pd

# Display every column of a wide DataFrame instead of truncating the middle ones.
pd.set_option('display.max_columns', None)

# Series Object

## Create

In [76]:
# A Series pairs a one-dimensional array of values with an index of labels.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [77]:
# The stored values as a plain NumPy array (the labels are not included).
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [78]:
# The labels attached to the values, held in a pandas Index object.
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

## Check Index Presence

In [79]:
# `in` tests membership against the index labels, not the stored values.
'a' in data, 'z' in data

(True, False)

## Direct Index

In [80]:
# Dictionary-style lookup: fetch the value stored under the label 'a'.
data['a']

np.float64(0.25)

## Loc Index & Slice

In [81]:
# Re-display the full Series for reference alongside the label-based (.loc) examples.
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [82]:
# .loc always selects by index label (the "explicit" index).
data.loc['a'] # explicit indexing

np.float64(0.25)

In [83]:
# Label-based slices include the end label, so 'c' is part of the result.
data.loc['a' : 'c'] # explicit slicing

a    0.25
b    0.50
c    0.75
dtype: float64

## iLoc Index & Slice

In [84]:
# Re-display the full Series for reference alongside the position-based (.iloc) examples.
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [85]:
# .iloc always selects by integer position (the "implicit" index), counting from 0.
data.iloc[0] # implicit indexing

np.float64(0.25)

In [86]:
# Positional slices exclude the stop position: 0:3 yields positions 0, 1 and 2.
data.iloc[0:3] # implicit slicing

a    0.25
b    0.50
c    0.75
dtype: float64

## Mask Index 

In [87]:
# Re-display the full Series for reference alongside the masking example.
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [88]:
# Masking: each comparison produces a boolean Series, & combines them
# element-wise, and indexing with the result keeps only the True entries.
# The parentheses are required because & binds tighter than > and <.
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

## Fancy Index

In [89]:
# Fancy indexing: a list of positions selects several elements at once.
data.iloc[[1, 2]]

b    0.50
c    0.75
dtype: float64

# DataFrame Object

## Create

In [90]:
# Population figures labelled by state name, written as (label, value) pairs.
population = pd.Series(dict([
    ('California', 39_538_223),
    ('Texas', 29_145_505),
    ('Florida', 21_538_187),
    ('New York', 20_201_249),
    ('Pennsylvania', 13_002_700),
]))

population

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [91]:
# Area figures labelled by state name, written as (label, value) pairs.
area = pd.Series(dict([
    ('California', 423_967),
    ('Texas', 695_662),
    ('Florida', 170_312),
    ('New York', 141_297),
    ('Pennsylvania', 119_280),
]))

area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64

In [92]:
# Build a two-column DataFrame from the Series; each keyword becomes a column name.
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [93]:
# Row labels of the DataFrame (here, the state names).
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [94]:
# Column labels, which are also stored in an Index object.
states.columns

Index(['population', 'area'], dtype='object')

In [95]:
# The data as a two-dimensional NumPy array: one row per state, one column per field.
states.values

array([[39538223,   423967],
       [29145505,   695662],
       [21538187,   170312],
       [20201249,   141297],
       [13002700,   119280]])

## Direct Index

In [96]:
# Indexing with a single column label returns that column as a Series.
ser = states['area']
ser

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [97]:
# Indexing with a list of labels returns a DataFrame, even when the list has one entry.
df = states[['area']]
df

Unnamed: 0,area
California,423967
Texas,695662
Florida,170312
New York,141297
Pennsylvania,119280


In [98]:
# The selected columns come back in the order given in the list.
df2 = states[['area', 'population']]
df2

Unnamed: 0,area,population
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [99]:
# A single value inside [] is looked up as a *column* label. `states` has no
# column labelled 0, so this raises KeyError. The exception is the point of the
# demonstration, so catch it and show the message instead of letting it stop
# a Restart & Run All.
try:
    error = states[0]
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 0

## iLoc Index & Slice

In [100]:
# Re-display the full DataFrame for reference alongside the position-based (.iloc) examples.
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [101]:
# A single row position returns that row as a Series indexed by column name.
states.iloc[0]

population    39538223
area            423967
Name: California, dtype: int64

In [102]:
# A row position and a column position together return a single scalar.
states.iloc[0, 0], states.iloc[0, 1]

(np.int64(39538223), np.int64(423967))

In [103]:
# Fancy indexing on the columns: row position 0, column positions 0 and 1.
# One row with a list of columns comes back as a Series.
states.iloc[0, [0, 1]]

population    39538223
area            423967
Name: California, dtype: int64

In [104]:
# Positional slices on both axes: the first three rows and the first two columns.
states.iloc[:3, :2]

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312


## Loc Index & Slice

In [105]:
# Re-display the full DataFrame for reference alongside the label-based (.loc) examples.
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [106]:
# A single row label returns that row as a Series indexed by column name.
states.loc['Texas']

population    29145505
area            695662
Name: Texas, dtype: int64

In [107]:
# A row label and a column label together return a single scalar.
states.loc['Texas', 'population'], states.loc['Texas', 'area']

(np.int64(29145505), np.int64(695662))

In [108]:
# Fancy indexing on the columns: a list of column labels for the 'Texas' row.
# One row with a list of columns comes back as a Series.
states.loc['Texas', ['population', 'area']]

population    29145505
area            695662
Name: Texas, dtype: int64

In [109]:
# Label slices include their end label: rows up to and including 'Florida',
# columns up to and including 'population'.
states.loc[:'Florida', :'population']

Unnamed: 0,population
California,39538223
Texas,29145505
Florida,21538187


## Drop Rows & Columns

In [110]:
# drop() returns a new DataFrame without the listed rows; `states` itself is
# not modified (the next cell still shows Texas and Florida).
states.drop(index=['Texas', 'Florida'])

Unnamed: 0,population,area
California,39538223,423967
New York,20201249,141297
Pennsylvania,13002700,119280


In [111]:
# drop() returns a new DataFrame without the listed columns; without
# inplace=True the original `states` is left as it was.
states.drop(columns=['area'])

Unnamed: 0,population
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


# DataFrame Indexing

## DataFrame Direct Indexing

In [50]:
# Small example table, written one record (row) at a time.
df = pd.DataFrame([
    {"Name": "Tom", "Language": "Python", "Courses": 5},
    {"Name": "Mike", "Language": "Python", "Courses": 4},
    {"Name": "Tiffany", "Language": "R", "Courses": 7},
])
df

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
1,Mike,Python,4
2,Tiffany,R,7


In [51]:
df['Name']  # a single column label returns that column as a Series

0        Tom
1       Mike
2    Tiffany
Name: Name, dtype: object

In [52]:
df[['Name']]  # a list of labels returns a DataFrame, even with one column

Unnamed: 0,Name
0,Tom
1,Mike
2,Tiffany


In [53]:
# Several labels in the list select several columns, in the order given.
df[['Name', 'Language']]

Unnamed: 0,Name,Language
0,Tom,Python
1,Mike,Python
2,Tiffany,R


With direct `[]` indexing, rows can only be selected with a slice; a single value such as `df[0]` is interpreted as a column label, not a row position.

In [54]:
# A single value inside [] is looked up as a column label. `df` has no column
# labelled 0, so this raises KeyError. Catch it and show the message so the
# cells that follow still run on Restart & Run All.
try:
    df[0]  # doesn't work
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 0

In [55]:
# A slice inside [] selects rows; 0:1 is just the first row, returned as a DataFrame.
df[0:1] # does work

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5


In [56]:
# Open-ended slice: every row from position 1 onward.
df[1:] # does work

Unnamed: 0,Name,Language,Courses
1,Mike,Python,4
2,Tiffany,R,7


## DataFrame iloc

In [57]:
# Re-display the full DataFrame for reference alongside the .iloc examples.
df

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
1,Mike,Python,4
2,Tiffany,R,7


In [58]:
df.iloc[0]  # a single row position returns that row as a Series

Name           Tom
Language    Python
Courses          5
Name: 0, dtype: object

In [59]:
df.iloc[0:2]  # a positional slice (stop excluded) returns a DataFrame

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
1,Mike,Python,4


In [60]:
df.iloc[2, 1]  # row position 2, column position 1 -> the single value in that cell

'R'

In [61]:
df.iloc[[0, 1], [1, 2]]  # lists of positions on both axes return a DataFrame

Unnamed: 0,Language,Courses
0,Python,5
1,Python,4


## DataFrame loc

In [62]:
# Re-display the full DataFrame for reference alongside the .loc examples.
df

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
1,Mike,Python,4
2,Tiffany,R,7


In [64]:
# ':' keeps every row; a single column label returns that column as a Series.
x = df.loc[:, 'Name']
x # series

0        Tom
1       Mike
2    Tiffany
Name: Name, dtype: object

In [66]:
# A label slice over the columns includes its end label ('Language') and
# returns a DataFrame.
x = df.loc[:, 'Name':'Language']
x # dataframe

Unnamed: 0,Name,Language
0,Tom,Python
1,Mike,Python
2,Tiffany,R


In [67]:
# Lists on both axes return a DataFrame, even with a single column. With .loc
# the 0 and 2 are index labels; they match the positions only because this
# frame uses the default integer index.
df.loc[[0, 2], ['Language']]

Unnamed: 0,Language
0,Python
2,R


## Boolean Indexing

In [68]:
# Re-display the full DataFrame for reference alongside the boolean-indexing examples.
df

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
1,Mike,Python,4
2,Tiffany,R,7


In [74]:
# Comparing a column with a scalar gives a boolean Series with one entry per row.
df['Courses'] > 5

0    False
1    False
2     True
Name: Courses, dtype: bool

In [69]:
# Indexing with a boolean Series keeps only the rows where it is True.
df[df['Courses'] > 5]

Unnamed: 0,Name,Language,Courses
2,Tiffany,R,7


In [70]:
# The same boolean-mask pattern works for equality tests on a string column.
df[df['Name'] == "Tom"]

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5


## Query

In [71]:
# query() takes the filter as a string in which column names are used directly.
# Inside the expression, & has the same precedence as `and`, so the two
# comparisons need no parentheses here.
df.query("Courses > 4 & Language == 'Python'")

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5


In [72]:
# Boolean-mask form of the filter "Courses > 4 and Language == 'Python'".
# In plain Python the parentheses ARE required, because & binds tighter than
# > and ==.
df[(df['Courses'] > 4) & (df['Language'] == 'Python')]

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5


In [73]:
# Prefix a Python variable name with @ to use its value inside a query() expression.
course_threshold = 4
df.query("Courses > @course_threshold")

Unnamed: 0,Name,Language,Courses
0,Tom,Python,5
2,Tiffany,R,7
