In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain Python list; the index defaults to 0..n-1.

s = pd.Series(data=[1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
# agg() / aggregate() - apply one or more reductions to the Series in a single call.
# NOTE(review): 'mean' is the usual pandas spelling; 'average' is kept here because
# the recorded output below is labelled with it.
s.aggregate(func=['min', 'max', 'sum', 'average'])

min         1.0
max         4.0
sum        10.0
average     2.5
dtype: float64

In [2]:
# all() - reports whether every element is True, optionally along a chosen axis
import pandas as pd
df = pd.DataFrame(
    data={
        'Col1': [True, True],
        'Col2': [True, False],
    }
)
df

Unnamed: 0,Col1,Col2
0,True,True
1,True,False


In [4]:
# Reduce down the rows (axis 0): each column collapses to a single boolean.
df.all(axis=0)

Col1     True
Col2    False
dtype: bool

In [6]:
# any() - reports whether at least one element is True, optionally along a chosen axis
df.any(axis=0)

Col1    True
Col2    True
dtype: bool

In [7]:
# pd.concat() joins two or more Series end to end. Each input keeps its own
# index labels, so the labels 0..3 appear twice in the result.
# (Series.append was deprecated in pandas 1.4 and removed in 2.0; concat is
# the supported replacement and gives the same result.)
s1 = pd.Series([1, 2, 3, 4])
s2 = pd.Series([5, 6, 7, 8])
s = pd.concat([s1, s2])
s

0    1
1    2
2    3
3    4
0    5
1    6
2    7
3    8
dtype: int64

In [8]:
# ignore_index=True discards the inputs' labels and renumbers the result 0..n-1.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
s = pd.concat([s1, s2], ignore_index=True)
s

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [9]:
# apply() - invoke a function on every value of a Series

s = pd.Series({'London': 20, 'New York': 21, 'Mumbai': 12})
s

London      20
New York    21
Mumbai      12
dtype: int64

In [10]:
def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

# apply() takes the function object that should be invoked over the Series
s.apply(func=square)


London      400
New York    441
Mumbai      144
dtype: int64

In [11]:
# Same squaring operation, written inline as a lambda
s.apply(func=lambda value: value * value)

London      400
New York    441
Mumbai      144
dtype: int64

In [12]:
# apply() can forward extra positional arguments to the function via the `args` keyword:
def subtract_custom_value(x, custom_value):
    """Return ``x`` reduced by ``custom_value``."""
    difference = x - custom_value
    return difference
# The one-element tuple (5,) is passed on as `custom_value` for every element.
s.apply(func=subtract_custom_value, args=(5,))

London      15
New York    16
Mumbai       7
dtype: int64

In [13]:
# NumPy functions can be passed to apply() as well; here the natural logarithm.
s.apply(func=np.log)

London      2.995732
New York    3.044522
Mumbai      2.484907
dtype: float64

In [14]:
# astype() - cast a pandas object to a specified dtype.
# Start from a Series that is explicitly stored as 32-bit integers.

s = pd.Series(data=[10, 20, 25, 67], dtype=np.int32)
s

0    10
1    20
2    25
3    67
dtype: int32

In [15]:
# Cast the int32 values to floating point (float64 in the output below).
s.astype(float)

0    10.0
1    20.0
2    25.0
3    67.0
dtype: float64

In [5]:
# CategoricalDtype describes a categorical type with an explicit, ordered set of
# categories; astype() then maps any value outside that set (80 and 1 here) to NaN.

s = pd.Series(data=[10, 20, 40, 30, 80, 20, 60, 1], dtype='int32')

cat_dtype = pd.CategoricalDtype(
    categories=list(range(10, 80, 10)),  # 10, 20, ..., 70
    ordered=True,
)
s.astype(cat_dtype)

0     10
1     20
2     40
3     30
4    NaN
5     20
6     60
7    NaN
dtype: category
Categories (7, int64): [10 < 20 < 30 < 40 < 50 < 60 < 70]

In [19]:
# at_time() - select the rows whose timestamp falls at a particular time of day.
# Build a small frame indexed by four timestamps spaced 12 hours apart.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in recent pandas releases.

i = pd.date_range('2018-04-09', periods=4, freq='12h')
ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
ts

Unnamed: 0,A
2018-04-09 00:00:00,1
2018-04-09 12:00:00,2
2018-04-10 00:00:00,3
2018-04-10 12:00:00,4


In [20]:
# Keep only the rows stamped exactly at noon, whatever the date.
ts.at_time(time='12:00:00')

Unnamed: 0,A
2018-04-09 12:00:00,2
2018-04-10 12:00:00,4


In [21]:
# between() - returns a boolean Series marking which values lie within the given range;
# the upper bound is included (4 -> True) and NaN yields False.

s = pd.Series(data=[2, 0, 4, 8, np.nan])
s.between(left=1, right=4)

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# count() - returns the number of non-null values (the NaN entry is not counted)
s.count()

4

In [23]:
# diff() - difference between each element and the one before it;
# the first element has no predecessor, so its result is NaN.
s.diff(periods=1)

0    NaN
1   -2.0
2    4.0
3    4.0
4    NaN
dtype: float64

In [6]:
import numpy as np
# arange(start, stop) yields the integers from start up to, but not including, stop
np.arange(0, 3)

array([0, 1, 2])

In [24]:
# drop() - remove elements of a Series by specifying their index labels
s = pd.Series(np.arange(3), index=list('ABC'))
s

A    0
B    1
C    2
dtype: int32

In [25]:
# Returns a new Series without the entries labelled 'B' and 'C'.
s.drop(['B', 'C'])

A    0
dtype: int32

In [8]:
# Introspection aid: list every attribute name available on the Series,
# private and dunder names included (the output is very long).
print(dir(s))

['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_SLICEMAP', '__abs__', '__add__', '__and__', '__array__', '__array_prepare__', '__array_priority__', '__array_wrap__', '__bool__', '__bytes__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__rdivmod_

In [26]:
# drop_duplicates() - returns a Series with duplicate values removed

s = pd.Series(data=['lama', 'cow', 'lama', 'dog'])
s

0    lama
1     cow
2    lama
3     dog
dtype: object

In [29]:
# With keep='first' (the default) the first occurrence of each duplicated value is
# retained; passing keep='last' would retain the last occurrence instead.
s.drop_duplicates(keep='first')

0    lama
1     cow
3     dog
dtype: object

In [30]:
# dropna() - returns a new Series with the missing values removed
s = pd.Series(data=[np.nan, 'cow', 'lama', 'dog'])
s.dropna()


1     cow
2    lama
3     dog
dtype: object

In [31]:
# duplicated() - returns a boolean Series flagging values that repeat an earlier one
s = pd.Series(data=['lama', 'cow', 'lama', 'dog'])
s.duplicated(keep='first')


0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# fillna() - replaces NA/NaN values in the DataFrame using the specified value or method

df = pd.DataFrame(
    data=[
        [np.nan, 1, np.nan, 0],
        [3, 4, np.nan, 0],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,,1.0,,0
1,3.0,4.0,,0
2,,,,5
3,,3.0,,4


In [34]:
# Replace every NaN with the constant 0
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,1.0,0.0,0
1,3.0,4.0,0.0,0
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [35]:
# Forward fill: each NaN takes the last valid value above it in the same column;
# leading NaNs (and all-NaN columns) have nothing to copy and stay NaN.
# ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
df.ffill()

Unnamed: 0,A,B,C,D
0,,1.0,,0
1,3.0,4.0,,0
2,3.0,4.0,,5
3,3.0,3.0,,4


In [37]:
# A dict keyed by column name supplies a different fill value for each column.
values = dict(A=10, B=20, C=30, D=40)
df.fillna(values)

Unnamed: 0,A,B,C,D
0,10.0,1.0,30.0,0
1,3.0,4.0,30.0,0
2,10.0,20.0,30.0,5
3,10.0,3.0,30.0,4


In [38]:
# filter() - subsets the rows or columns of a DataFrame according to the labels given
# (it selects by label, not by the values stored in the frame)
df.filter(items=list('AB'))

Unnamed: 0,A,B
0,,1.0
1,3.0,4.0
2,,
3,,3.0


In [39]:
# Keep the columns whose name contains the substring 'A'
df.filter(like='A', axis='columns')

Unnamed: 0,A
0,
1,3.0
2,
3,


In [40]:
# groupby() - groups rows that share the same key value so they can be summarised together

df = pd.DataFrame(
    data={
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Speed': [380, 370, 24, 26],
    }
)
df

Unnamed: 0,Animal,Speed
0,Falcon,380
1,Falcon,370
2,Parrot,24
3,Parrot,26


In [41]:
# Average Speed per Animal; the grouping key becomes the index of the result.
df.groupby(by=['Animal']).mean()

Unnamed: 0_level_0,Speed
Animal,Unnamed: 1_level_1
Falcon,375
Parrot,25


In [43]:
# idxmax() - returns the row label of the maximum value; when several entries tie
# for the maximum, the first such label is returned ('C' rather than 'E' here).

s = pd.Series(data=[1, None, 4, 3, 4], index=list('ABCDE'))
s.idxmax()  # idxmin() is the counterpart: it returns the label of the minimum value

'C'

In [44]:
# isna() - returns a boolean object of the same shape, True where a value is missing
# (None or numpy.NaN); this frame has no missing values, so every entry is False.
df.isna()

Unnamed: 0,Animal,Speed
0,False,False
1,False,False
2,False,False
3,False,False


In [45]:
# map() - substitute each value of a Series according to an input correspondence

s = pd.Series(data=['cat', 'dog', np.nan, 'rabbit'])
s

0       cat
1       dog
2       NaN
3    rabbit
dtype: object

In [46]:
# Values with no key in the mapping ('rabbit' here) come back as NaN.
s.map(dict(cat='kitten', dog='puppy'))

0    kitten
1     puppy
2       NaN
3       NaN
dtype: object

In [47]:
# A callable works too; na_action='ignore' leaves NaN entries untouched
# instead of passing them to the function.
s.map(lambda animal: 'I am a {}'.format(animal), na_action='ignore')

0       I am a cat
1       I am a dog
2              NaN
3    I am a rabbit
dtype: object