# Getting Started With Pandas

# keywords: 

reindexing, overwriting the indices using lists, accessing and modifying series using indices, values and indices of a pd series, Missing data, NaN, isnull, notnull, naming of a series object, naming of series indices, inplace re_indexing using lists, dataframes and various ways of creating dataframes, loc, iloc, reindexing columns or rows, dropping entries from an axis, indexing, selection and filtering, slicing, add, subtract, divide or multiply along an axis, lambda functions, apply to row or column, sorting and ranking, order, sort_index, indirect indexing (argsort), axis indexes with duplicate values, df.sum(), df.mean() along 0 or 1 axis, index of the maximum or minimum along any axis, cumulative sum, describe, correlations [ df.corr() ] and covariance [ df.cov() ], Unique Values, Value Counts, and Membership in 1D, df.isin(lst), isnull, notnull, dropna, fillna with ffill or bfill, or dictionary fill. Hierarchical indexing, multiindex, unstack a hierarchical dataframe

In [2]:
%reset -f
import pandas as pd
import numpy as np

# Pandas Series

In [3]:
# A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type)
# and an associated array of data labels, called its index.
# The simplest Series is formed from only an array of data:
obj = pd.Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# get the indices
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# get the values
obj.values

array([ 4,  7, -5,  3])

In [7]:
# We can specify the indices using the "index" keyword
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [8]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
# getting the indices of the series
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
# We can get the values using the indices (like in NumPy)
obj2['a']

-5

In [11]:
obj2['d']=99

In [12]:
# Indexing can also be done using a list of indices
obj2[['a','b']]

a   -5
b    7
dtype: int64

In [13]:
obj2

d    99
b     7
a    -5
c     3
dtype: int64

In [14]:
# Filtering
obj2>3

d     True
b     True
a    False
c    False
dtype: bool

In [15]:
obj2[obj2>3]

d    99
b     7
dtype: int64

In [16]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.
# It can be substituted into many functions that expect a dict:

'b' in obj2

True

In [17]:
'e' in obj2

False

In [18]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [19]:
obj3 = pd.Series(sdata)

In [20]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [21]:
# The dictionary keys of the Series object can be overwritten by a list of indices
# (keys missing from the list are dropped; list entries missing from the dict become NaN)

states = ['California', 'Ohio', 'Oregon', 'Texas']

In [22]:
obj4 = pd.Series(sdata, index=states)

In [23]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
# NaN is not a number, and can be referred to as 'missing'
# isnull() and notnull() can be used to detect null and not null in a series or dataframe

obj4.isnull() # or pd.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
obj4.notnull() # or pd.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [26]:
# Series automatically aligns the differently indexed data in arithmetic operations

obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [27]:
# naming of a series object
obj4.name = 'population'

# naming of series indices
obj4.index.name = 'states'

obj4

states
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [28]:
#inplace reindexing

obj.index=[10,20,30,40]

obj


10    4
20    7
30   -5
40    3
dtype: int64

# Pandas DataFrames

In [29]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

# The resulting DataFrame keeps the columns in the order of the dictionary keys (state, year, pop)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [30]:
# The columns can be ordered differently using a list as follows

frame = pd.DataFrame(data, columns = ['year','state','pop'])

frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [31]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], 
                   index=['one', 'two', 'three', 'four', 'five'])

In [32]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [33]:
# Retrieval can be done using square brackets (dictionary-like) or dot operator. For example:

frame2.year

# Or
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [34]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [35]:
# Row indexing
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [36]:
frame2['debt'] = np.arange(5)

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [37]:
# When assigning lists and arrays to a DataFrame column, their lengths must be equal to the length of the column.
# A pandas Series, however, is aligned on its index: values land where the Series labels match the DataFrame index, and the remaining rows become NaN

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

frame2['debt'] = val

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [38]:
# Assigning a column that does not exist will create a new column.
# del can be used to delete a column

frame2['eastern'] = frame2['state'] == 'Ohio'

frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [39]:
del frame2['eastern']

# Index Objects

In [40]:
obj = pd.Series(range(3), index= ['a','b','c'])

obj

a    0
b    1
c    2
dtype: int64

In [41]:
frame2.index.name = 'index'
frame2.columns.name= 'columns'

frame2


columns,year,state,pop,debt
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


# ESSENTIAL FUNCTIONALITY

In [42]:
# Reindexing
# This can be used to change the order of the indices

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [43]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [44]:
# Reindex with fill_value

obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [45]:
# Reindex using forward fill ffill

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [46]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],  
                  columns=['Ohio', 'Texas', 'California'])

frame

frame.reindex(columns=['Ohio','Utah','California','Texas'])

frame.reindex(['a','b','c','d'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [47]:
# Both reindexing operations (rows and columns) can be done in one shot, e.g. frame.reindex(index=[...], columns=[...])

# Dropping Entries from an Axis

In [48]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [49]:
new_obj = obj.drop('c')

In [50]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

# Indexing, selection and Filtering

In [51]:
# Series indexing can be done using integers and strings

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [52]:
obj['b']

1.0

In [53]:
# Positional access goes through .iloc; a bare integer key on a label-indexed Series is deprecated
obj.iloc[1]

1.0

In [54]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [55]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [56]:
# Select by position with .iloc; integer keys on a label-indexed Series are deprecated as positions
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [57]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

# Slicing

In [58]:
# Slicing in Pandas Series is inclusive when using strings

obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [59]:
obj[0:3]

a    0.0
b    1.0
c    2.0
dtype: float64

In [60]:
# Setting using these methods

obj['b':'d']=5

obj['d']= 5

In [61]:
# Slicing Pandas DataFrame
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [62]:
data['two'] 

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [63]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [64]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [65]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [66]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [67]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# loc and iloc

In [68]:
data.loc['Colorado']

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [69]:
data.loc['Colorado','three']

6

In [70]:
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int64

In [71]:
data.loc[['Colorado','Utah']]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


In [72]:
data.loc[['Colorado','Utah'],['two','three']]

Unnamed: 0,two,three
Colorado,5,6
Utah,9,10


In [73]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),  
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [74]:
frame

Unnamed: 0,b,d,e
Utah,0.258582,0.450076,-1.595201
Ohio,-0.353929,0.250825,-0.395308
Texas,0.248238,0.256546,-1.017486
Oregon,0.59713,-2.030361,-0.620688


In [75]:
def f(x):
    """Return the spread of x, i.e. its maximum minus its minimum."""
    return x.max() - x.min()

# Apply the same function for each COLUMN
frame.apply(f)

# Apply the same function for each ROW
frame.apply(f, axis = 1)

Utah      2.045278
Ohio      0.646134
Texas     1.274032
Oregon    2.627491
dtype: float64

In [76]:
def f(x):
    """Return a two-entry Series labelled 'min' and 'max' with the extremes of x."""
    smallest, largest = x.min(), x.max()
    return pd.Series([smallest, largest], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-0.353929,-2.030361,-1.595201
max,0.59713,0.450076,-0.395308


In [77]:
# Element-wise operation
# Named two_decimals rather than `format` so the builtin format() is not shadowed
def two_decimals(x):
    """Render x as a string with two digits after the decimal point."""
    return '%.2f' % x

frame.applymap(two_decimals)

Unnamed: 0,b,d,e
Utah,0.26,0.45,-1.6
Ohio,-0.35,0.25,-0.4
Texas,0.25,0.26,-1.02
Oregon,0.6,-2.03,-0.62


In [78]:
# Sorting and Ranking

# Sorting indices
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [79]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [80]:
# Sort the indices (rows)
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [81]:
# Sort the columns
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [82]:
# Sorting the columns in descending order
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [83]:
# Sorting Series data by values
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [84]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [85]:
# Sort by column

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame



# Sort by list of columns
frame=frame.sort_values(by=['a', 'b'])
frame.sort_index(axis=1)


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [86]:
# Argsort
to_sort = np.array([12,1,40,20,60,4])
to_sort[np.argsort(to_sort)]

array([ 1,  4, 12, 20, 40, 60])

In [87]:
# Axis indexes with duplicate values

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [88]:
# Check if the index is unique
obj.index.is_unique

False

In [89]:
# Positional lookup returns a single scalar, even though the labels repeat
obj.iloc[0]

# Label lookup on a duplicated label returns every matching entry as a Series

obj['a']

a    0
a    1
dtype: int64

In [90]:
# DataFrame

df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [91]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [92]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [93]:
# Index of the maximum number

df.idxmax()

one    b
two    d
dtype: object

In [94]:
# Index of the minimum
df.idxmin()

one    d
two    b
dtype: object

In [95]:
# Cumulative sum
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [96]:
# Describe a DataFrame

df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [97]:
df['one'].unique()

array([1.4 , 7.1 ,  nan, 0.75])

In [98]:
df['one'].value_counts(sort= False)

7.10    1
1.40    1
0.75    1
Name: one, dtype: int64

In [99]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})

data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [100]:
# Count how often each value occurs in every column; values absent from a column become 0.
# Series.value_counts is used because the top-level pd.value_counts function is deprecated.
counts = data.apply(pd.Series.value_counts).fillna(0)
counts

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [101]:
# Missing data

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [102]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [103]:
string_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [104]:
# dropna
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

data.dropna()


0    1.0
2    3.5
4    7.0
dtype: float64

In [105]:
# Equivalent to

data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [106]:
# dropna
from numpy import nan as NA

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])

data




Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [107]:
cleaned = data.dropna()

cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [108]:
# Default axis is 0 (rows)
# Drops rows that are all NaN
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


# Hierarchical Indexing

In [109]:
data = pd.Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

data

a  1    0.214764
   2   -0.896460
   3    0.398623
b  1   -1.549721
   2    2.112738
   3    0.360993
c  1   -0.485983
   2    1.407936
d  2   -0.411534
   3    0.893691
dtype: float64

In [110]:
data['b']

1   -1.549721
2    2.112738
3    0.360993
dtype: float64

In [111]:
data[['b','d']]

b  1   -1.549721
   2    2.112738
   3    0.360993
d  2   -0.411534
   3    0.893691
dtype: float64

In [112]:
data['b':'d']

b  1   -1.549721
   2    2.112738
   3    0.360993
c  1   -0.485983
   2    1.407936
d  2   -0.411534
   3    0.893691
dtype: float64

In [124]:
# Unstack a multiindex Series into a DataFrame

data.unstack()

Unnamed: 0,1,2,3
a,0.214764,-0.89646,0.398623
b,-1.549721,2.112738,0.360993
c,-0.485983,1.407936,
d,,-0.411534,0.893691


In [127]:
# Columns can be used as indices

frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})

frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [145]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [146]:
# Moving back hierarchical indices to columns

frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


# ============================================================