In [1]:
import numpy as np
# pandas works on top of numpy
import pandas as pd

### Object Creation

#### Using series

In [3]:
# No index is given, so pandas supplies the default integer index (0..5);
# the np.nan entry forces the values to float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# list() on a string yields its individual characters; the same trick builds the column labels further down
list('ASD')

['A', 'S', 'D']

#### Create a DataFrame with dates as the index

In [6]:
# Six consecutive calendar days starting on 2020-09-10 (daily frequency is the default).
dates = pd.date_range(start='20200910', periods=6)
dates

DatetimeIndex(['2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13',
               '2020-09-14', '2020-09-15'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# DataFrame built from a 6x4 NumPy array of standard-normal draws,
# indexed by the dates created above, with columns labelled A-D.
# NOTE: no seed is set in the visible cells, so the numbers change on every fresh run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097
2020-09-13,2.239506,-0.259594,0.17469,-0.941078
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568


In [10]:
# DataFrame from a dict: each key becomes a column. Scalars (A, B, F) are
# broadcast to the length set by the array-like values (C, D, E).
df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20200910'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2020-09-10,1.0,3,test,foo
1,1,2020-09-10,1.0,3,train,foo
2,1,2020-09-10,1.0,3,test,foo
3,1,2020-09-10,1.0,3,train,foo


In [11]:
# one dtype per column: a DataFrame can hold columns of different types
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [12]:
# each column is also exposed as an attribute of the DataFrame
df2.A # equivalent to df2['A']

0    1
1    1
2    1
3    1
Name: A, dtype: int64

To see all attributes, type `df2.` and press TAB (tab completion).

###  Viewing Data

In [13]:
# head() with no argument shows the first 5 rows
df.head()

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097
2020-09-13,2.239506,-0.259594,0.17469,-0.941078
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743


In [15]:
# tail(n) shows the last n rows
df.tail(3)

Unnamed: 0,A,B,C,D
2020-09-13,2.239506,-0.259594,0.17469,-0.941078
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568


In [16]:
# row labels: the DatetimeIndex built earlier with pd.date_range
df.index

DatetimeIndex(['2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13',
               '2020-09-14', '2020-09-15'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# convert the DataFrame to a NumPy array; every column is float64 here, so the result is a plain float array
df.to_numpy()

array([[ 1.84854571, -1.69744459,  0.30779703, -0.55248   ],
       [ 2.38311921,  0.06656116, -1.62691132,  0.27468331],
       [ 1.38764536,  1.66490918,  0.62299776, -1.11409692],
       [ 2.2395058 , -0.25959397,  0.17469005, -0.94107803],
       [-1.24353997, -0.10943415, -0.91706516,  0.65274328],
       [-0.11482864,  0.4257134 , -0.27692517, -2.067568  ]])

In [19]:
# when the columns have different dtypes this conversion is expensive: a NumPy array holds a single dtype,
# so every value has to be converted to one common type (object in the output below)
df2.to_numpy()

array([[1, Timestamp('2020-09-10 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2020-09-10 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1, Timestamp('2020-09-10 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2020-09-10 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

#### Summary Statistics of dataframe

In [21]:
# count, mean, std, min, quartiles and max for each column
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,1.083408,0.015119,-0.285903,-0.624633
std,1.452878,1.087832,0.847051,0.986655
min,-1.24354,-1.697445,-1.626911,-2.067568
25%,0.26079,-0.222054,-0.75703,-1.070842
50%,1.618096,-0.021436,-0.051118,-0.746779
75%,2.141766,0.335925,0.27452,0.067892
max,2.383119,1.664909,0.622998,0.652743


In [22]:
# on the mixed-dtype frame the summary below covers only columns A, C and D
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [24]:
# Transposing the data: the dates become the columns and A-D become the row labels
df.T

Unnamed: 0,2020-09-10,2020-09-11,2020-09-12,2020-09-13,2020-09-14,2020-09-15
A,1.848546,2.383119,1.387645,2.239506,-1.24354,-0.114829
B,-1.697445,0.066561,1.664909,-0.259594,-0.109434,0.425713
C,0.307797,-1.626911,0.622998,0.17469,-0.917065,-0.276925
D,-0.55248,0.274683,-1.114097,-0.941078,0.652743,-2.067568


#### Sorting the data

In [25]:
# sort by axis labels: axis=1 orders the columns by name, descending here (D, C, B, A)
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2020-09-10,-0.55248,0.307797,-1.697445,1.848546
2020-09-11,0.274683,-1.626911,0.066561,2.383119
2020-09-12,-1.114097,0.622998,1.664909,1.387645
2020-09-13,-0.941078,0.17469,-0.259594,2.239506
2020-09-14,0.652743,-0.917065,-0.109434,-1.24354
2020-09-15,-2.067568,-0.276925,0.425713,-0.114829


In [26]:
# sort the rows by the values in column B (smallest first)
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-13,2.239506,-0.259594,0.17469,-0.941078
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568
2020-09-12,1.387645,1.664909,0.622998,-1.114097


### Selection

In [27]:
# selecting a single column returns a Series
df['A']

2020-09-10    1.848546
2020-09-11    2.383119
2020-09-12    1.387645
2020-09-13    2.239506
2020-09-14   -1.243540
2020-09-15   -0.114829
Freq: D, Name: A, dtype: float64

In [28]:
# slicing with [] selects rows by position; the end bound is excluded (rows 0, 1 and 2)
df[0:3]

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097


#### selection by labels

To get a cross-section (a single row) using a label:

In [29]:
# select one row by its index label (the first date); the result is a Series indexed by column name
df.loc[dates[0]]

A    1.848546
B   -1.697445
C    0.307797
D   -0.552480
Name: 2020-09-10 00:00:00, dtype: float64

In [30]:
# all rows, columns A and B selected by label
df.loc[:,['A','B']]

Unnamed: 0,A,B
2020-09-10,1.848546,-1.697445
2020-09-11,2.383119,0.066561
2020-09-12,1.387645,1.664909
2020-09-13,2.239506,-0.259594
2020-09-14,-1.24354,-0.109434
2020-09-15,-0.114829,0.425713


#### Selection via position

In [31]:
# fourth row (integer position 3), whatever its label is
df.iloc[3]

A    2.239506
B   -0.259594
C    0.174690
D   -0.941078
Name: 2020-09-13 00:00:00, dtype: float64

In [32]:
# rows at positions 1 and 2 (end bound excluded), all columns
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097


In [33]:
# same result as df.iloc[1:3,:] - with no column selector after the comma, every column is kept
df.iloc[1:3,]

Unnamed: 0,A,B,C,D
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097


#### Boolean Indexing

In [34]:
# keep only the rows where column A is positive
df[df['A']>0]

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097
2020-09-13,2.239506,-0.259594,0.17469,-0.941078


In [35]:
# element-wise mask: values that fail the condition become NaN (empty cells below); the shape is unchanged
df[df>0]

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,,0.307797,
2020-09-11,2.383119,0.066561,,0.274683
2020-09-12,1.387645,1.664909,0.622998,
2020-09-13,2.239506,,0.17469,
2020-09-14,,,,0.652743
2020-09-15,,0.425713,,


### Missing Data

In [36]:
# work on a copy so that df itself is not changed by the column added below
df3=df.copy()
df3

Unnamed: 0,A,B,C,D
2020-09-10,1.848546,-1.697445,0.307797,-0.55248
2020-09-11,2.383119,0.066561,-1.626911,0.274683
2020-09-12,1.387645,1.664909,0.622998,-1.114097
2020-09-13,2.239506,-0.259594,0.17469,-0.941078
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568


In [39]:
# add column E with two missing values (np.nan) to demonstrate NaN handling
df3['E']=[1,2,np.nan,6,8,np.nan]

In [40]:
# display df3 with the new column E
df3

Unnamed: 0,A,B,C,D,E
2020-09-10,1.848546,-1.697445,0.307797,-0.55248,1.0
2020-09-11,2.383119,0.066561,-1.626911,0.274683,2.0
2020-09-12,1.387645,1.664909,0.622998,-1.114097,
2020-09-13,2.239506,-0.259594,0.17469,-0.941078,6.0
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743,8.0
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568,


#### Drop rows with missing values

In [41]:
# drop every row that contains at least one NaN; the result is a new DataFrame, df3 itself keeps all six rows
df3.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2020-09-10,1.848546,-1.697445,0.307797,-0.55248,1.0
2020-09-11,2.383119,0.066561,-1.626911,0.274683,2.0
2020-09-13,2.239506,-0.259594,0.17469,-0.941078,6.0
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743,8.0


#### fill the missing values

In [42]:
# replace every NaN with 100 in the returned DataFrame
df3.fillna(100)

Unnamed: 0,A,B,C,D,E
2020-09-10,1.848546,-1.697445,0.307797,-0.55248,1.0
2020-09-11,2.383119,0.066561,-1.626911,0.274683,2.0
2020-09-12,1.387645,1.664909,0.622998,-1.114097,100.0
2020-09-13,2.239506,-0.259594,0.17469,-0.941078,6.0
2020-09-14,-1.24354,-0.109434,-0.917065,0.652743,8.0
2020-09-15,-0.114829,0.425713,-0.276925,-2.067568,100.0


## *For more, see "10 minutes to pandas" in the pandas documentation*