# Getting Started With Pandas

# keywords: 

reindexing, overwriting the indices using lists, accessing and modifying series using indices, values and indices of a pd series, Missing data, NaN, isnull, notnull, naming of a series object, naming of series indices, inplace re_indexing using lists, dataframes and various ways of creating dataframes, loc, iloc, reindexing columns or rows, dropping entries from an axis, indexing, selection and filtering, slicing, add, subtract, divide or multiply along an axis, lambda functions, apply to row or column, sorting and ranking, order, sort_index, indirect indexing (argsort)

In [327]:
%reset -f
import pandas as pd
import numpy as np

# Pandas Series

In [328]:
# A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type)
# and an associated array of data labels, called its index.
# The simplest Series is formed from only an array of data:
obj = pd.Series([4, 7, -5, 3])

In [329]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [330]:
# get the indices
obj.index

RangeIndex(start=0, stop=4, step=1)

In [331]:
# get the values
obj.values

array([ 4,  7, -5,  3])

In [332]:
# We can specify the indices using the "index" keyword
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [333]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [334]:
# getting the indices of the series
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [335]:
# We can get the values using the index labels (like indexing in NumPy)
obj2['a']

-5

In [336]:
obj2['d']=99

In [337]:
# Indexing can also be done using a list of indices
obj2[['a','b']]

a   -5
b    7
dtype: int64

In [338]:
obj2

d    99
b     7
a    -5
c     3
dtype: int64

In [339]:
# Filtering
obj2>3

d     True
b     True
a    False
c    False
dtype: bool

In [340]:
obj2[obj2>3]

d    99
b     7
dtype: int64

In [341]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.
# It can be substituted into many functions that expect a dict:

'b' in obj2

True

In [342]:
'e' in obj2

False

In [343]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [344]:
obj3 = pd.Series(sdata)

In [345]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [346]:
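# When a dict is passed together with an explicit index, the entries are selected and ordered by that index:
# labels missing from the dict (here 'California') become NaN, and dict keys not in the index (here 'Utah') are dropped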

states = ['California', 'Ohio', 'Oregon', 'Texas']

In [347]:
obj4 = pd.Series(sdata, index=states)

In [348]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [349]:
# NaN ("not a number") is the marker pandas uses for missing values
# isnull() and notnull() detect missing and non-missing entries in a Series or DataFrame

obj4.isnull() # equivalently: pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [350]:
obj4.notnull() # equivalently: pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [351]:
# Series automatically aligns the differently indexed data in arithmetic operations

obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [352]:
# naming of a series object
obj4.name = 'population'

# naming of series indices
obj4.index.name = 'states'

obj4

states
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [353]:
#inplace reindexing

obj.index=[10,20,30,40]

obj


10    4
20    7
30   -5
40    3
dtype: int64

# Pandas DataFrames

In [354]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

# The resulting DataFrame keeps the columns in the dict's insertion order (older pandas versions sorted them alphabetically)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [355]:
# The columns can be ordered differently using a list as follows

frame = pd.DataFrame(data, columns = ['year','state','pop'])

frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [356]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], 
                   index=['one', 'two', 'three', 'four', 'five'])

In [357]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [358]:
# Retrieval can be done using square brackets (dictionary-like) or dot operator. For example:

frame2.year

# Or
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [359]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [360]:
# Row indexing
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [361]:
frame2['debt'] = np.arange(5)

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [362]:
# When assigning lists or arrays to a DataFrame column, their length must be equal to the length of the DataFrame.
# A pandas Series, however, is aligned on the index: values are placed where its labels match the DataFrame's index, and the remaining rows become NaN

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

frame2['debt'] = val

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [363]:
# Assigning a column that does not exist will create a new column.
# del can be used to delete a column

frame2['eastern'] = frame2['state'] == 'Ohio'

frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [364]:
del frame2['eastern']

# Index Objects

In [365]:
obj = pd.Series(range(3), index= ['a','b','c'])

obj

a    0
b    1
c    2
dtype: int64

In [366]:
frame2.index.name = 'index'
frame2.columns.name= 'columns'

frame2


columns,year,state,pop,debt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


# ESSENTIAL FUNCTIONALITY

In [367]:
# Reindexing
# This can be used to change the order of the indices

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [368]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [369]:
# Reindex with fill_value

obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [370]:
# Reindex using forward fill ffill

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [371]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],  
                  columns=['Ohio', 'Texas', 'California'])

frame

frame.reindex(columns=['Ohio','Utah','California','Texas'])

frame.reindex(['a','b','c','d'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [372]:
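# Both reindexing steps (rows and columns) can be done in one call, e.g.
# frame.reindex(index=['a', 'b', 'c', 'd'], columns=['Ohio', 'Utah', 'California', 'Texas'])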

# Dropping Entries from an Axis

In [373]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [374]:
new_obj = obj.drop('c')

In [375]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

# Indexing, selection and Filtering

In [376]:
# Series indexing can be done using integers and strings

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [377]:
obj['b']

1.0

In [378]:
# Positional access goes through .iloc: obj has string labels, and a plain
# obj[1] relies on a deprecated fallback that treats integer keys as positions
obj.iloc[1]

1.0

In [379]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [380]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [381]:
# Select several entries by position with .iloc: obj has string labels, and a
# plain obj[[1, 3]] relies on a deprecated fallback that treats integer keys as positions
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [382]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

# Slicing

In [383]:
# Label-based slicing of a pandas Series includes the end label (unlike positional slicing, which excludes the stop position)

obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [384]:
obj[0:3]

a    0.0
b    1.0
c    2.0
dtype: float64

In [385]:
# Setting using these methods

obj['b':'d']=5

obj['d']= 5

In [386]:
# Slicing Pandas DataFrame
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [387]:
data['two'] 

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [388]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [389]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [390]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [391]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [392]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# loc and iloc

In [393]:
data.loc['Colorado']

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [394]:
data.loc['Colorado','three']

6

In [395]:
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int64

In [396]:
data.loc[['Colorado','Utah']]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


In [397]:
data.loc[['Colorado','Utah'],['two','three']]

Unnamed: 0,two,three
Colorado,5,6
Utah,9,10


In [398]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),  
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [399]:
frame

Unnamed: 0,b,d,e
Utah,0.469577,-0.611624,-1.882754
Ohio,-1.373298,0.914311,-0.560795
Texas,1.701877,0.411729,0.240151
Oregon,1.060278,-1.62304,-0.53923


In [400]:
def f(x):
    """Return the spread of x: its largest value minus its smallest value."""
    return x.max() - x.min()

# Apply the same function to each COLUMN
frame.apply(f)

# Apply the same function for each ROW
frame.apply(f, axis = 1)

Utah      2.352331
Ohio      2.287609
Texas     1.461726
Oregon    2.683318
dtype: float64

In [401]:
def f(x):
    """Summarise x as a two-entry Series labelled 'min' and 'max'."""
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=labels)

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.373298,-1.62304,-1.882754
max,1.701877,0.914311,0.240151


In [402]:
# Element-wise operation
# The helper is named two_decimals (not "format") so that it does not shadow
# the built-in format() for the rest of the session.
def two_decimals(x):
    """Render a number as a string with two digits after the decimal point."""
    return '%.2f' % x

# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; applymap is kept here so the cell also runs on older versions.
frame.applymap(two_decimals)

Unnamed: 0,b,d,e
Utah,0.47,-0.61,-1.88
Ohio,-1.37,0.91,-0.56
Texas,1.7,0.41,0.24
Oregon,1.06,-1.62,-0.54


In [403]:
# Sorting and Ranking

# Sorting indices
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [404]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [405]:
# Sort the indices (rows)
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [406]:
# Sort the columns
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [407]:
# Sorting the columns in descending order
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [414]:
# Sorting Series data by values
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [415]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [430]:
# Sort by column

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame



# Sort by list of columns
frame=frame.sort_values(by=['a', 'b'])
frame.sort_index(axis=1)


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [432]:
to_sort = np.array([12,1,40,20,60,4])

np.argsort(to_sort)

array([1, 5, 0, 3, 2, 4])

# ============================================================