In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list of values; pandas supplies the default integer index
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# First step of creating a DataFrame from a NumPy array with a datetime index and
# labeled columns: build the index itself, six consecutive days from 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Build a 6x4 frame of standard-normal draws, with `dates` as the row index and
# 'A'..'D' as the column labels.
# A seeded generator is used so that Restart & Run All reproduces the same values.
# NOTE: the outputs recorded in this notebook came from an unseeded run, so the
# displayed numbers will change once when the notebook is next re-executed.
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,2.105264,0.841475,0.852049,0.440189
2013-01-02,0.830835,-0.295906,0.409144,-0.14494
2013-01-03,-0.681008,0.208177,0.994045,0.441474
2013-01-04,-0.682114,-0.362428,-0.268454,-0.497651
2013-01-05,0.759389,-0.472352,-0.133386,0.693099
2013-01-06,1.560171,1.434012,-0.907497,-0.662624


In [7]:
# Creating a DataFrame from a dict of objects that can be converted to series-like.
# Scalar entries ('A', 'B', 'F') are repeated for every row of the four-row frame.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
# Inspect the dtype of each column of df2; the columns do not share a single dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [10]:
# View the first five rows of the frame (five is the default for head)
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,2.105264,0.841475,0.852049,0.440189
2013-01-02,0.830835,-0.295906,0.409144,-0.14494
2013-01-03,-0.681008,0.208177,0.994045,0.441474
2013-01-04,-0.682114,-0.362428,-0.268454,-0.497651
2013-01-05,0.759389,-0.472352,-0.133386,0.693099


In [12]:
# View the last three rows of the frame
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.682114,-0.362428,-0.268454,-0.497651
2013-01-05,0.759389,-0.472352,-0.133386,0.693099
2013-01-06,1.560171,1.434012,-0.907497,-0.662624


In [13]:
# Display the row labels (the index) of df
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Display the column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# Convert df to a NumPy array; the index and column labels are not part of the result
df.to_numpy()

array([[ 2.10526401,  0.84147546,  0.85204949,  0.44018875],
       [ 0.83083456, -0.29590649,  0.40914357, -0.1449395 ],
       [-0.68100755,  0.20817652,  0.99404503,  0.44147419],
       [-0.68211363, -0.36242849, -0.26845424, -0.49765058],
       [ 0.75938931, -0.47235182, -0.13338563,  0.69309917],
       [ 1.56017138,  1.4340116 , -0.90749684, -0.66262448]])

In [16]:
# df2 mixes several column dtypes, so its NumPy form is an array of dtype=object
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
# Quick statistical summary of each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.648756,0.225496,0.15765,0.044925
std,1.143413,0.767431,0.727434,0.559406
min,-0.682114,-0.472352,-0.907497,-0.662624
25%,-0.320908,-0.345798,-0.234687,-0.409473
50%,0.795112,-0.043865,0.137879,0.147625
75%,1.377837,0.683151,0.741323,0.441153
max,2.105264,1.434012,0.994045,0.693099


In [18]:
# Transposing the data: the columns A-D become rows and the dates become column labels
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,2.105264,0.830835,-0.681008,-0.682114,0.759389,1.560171
B,0.841475,-0.295906,0.208177,-0.362428,-0.472352,1.434012
C,0.852049,0.409144,0.994045,-0.268454,-0.133386,-0.907497
D,0.440189,-0.14494,0.441474,-0.497651,0.693099,-0.662624


In [19]:
# Sort along the column axis so the column labels appear in descending order (D, C, B, A)
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.440189,0.852049,0.841475,2.105264
2013-01-02,-0.14494,0.409144,-0.295906,0.830835
2013-01-03,0.441474,0.994045,0.208177,-0.681008
2013-01-04,-0.497651,-0.268454,-0.362428,-0.682114
2013-01-05,0.693099,-0.133386,-0.472352,0.759389
2013-01-06,-0.662624,-0.907497,1.434012,1.560171


In [22]:
# Order the rows by the values in column 'A', smallest first (ascending is the default)
df.sort_values('A')

Unnamed: 0,A,B,C,D
2013-01-04,-0.682114,-0.362428,-0.268454,-0.497651
2013-01-03,-0.681008,0.208177,0.994045,0.441474
2013-01-05,0.759389,-0.472352,-0.133386,0.693099
2013-01-02,0.830835,-0.295906,0.409144,-0.14494
2013-01-06,1.560171,1.434012,-0.907497,-0.662624
2013-01-01,2.105264,0.841475,0.852049,0.440189


In [23]:
# Selecting a single column by its label yields a Series named after that column
df['A']

2013-01-01    2.105264
2013-01-02    0.830835
2013-01-03   -0.681008
2013-01-04   -0.682114
2013-01-05    0.759389
2013-01-06    1.560171
Freq: D, Name: A, dtype: float64

In [24]:
# Selecting via [] with an integer slice slices the rows by position;
# the stop position is excluded, so 0:3 gives the first three rows
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,2.105264,0.841475,0.852049,0.440189
2013-01-02,0.830835,-0.295906,0.409144,-0.14494
2013-01-03,-0.681008,0.208177,0.994045,0.441474


In [26]:
# Slicing [] with date strings selects rows by index label; both endpoints are included
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.830835,-0.295906,0.409144,-0.14494
2013-01-03,-0.681008,0.208177,0.994045,0.441474
2013-01-04,-0.682114,-0.362428,-0.268454,-0.497651


In [27]:
# Selecting by label: dates[0] is the first index label, and .loc returns that
# row as a Series indexed by the column names
df.loc[dates[0]]

A    2.105264
B    0.841475
C    0.852049
D    0.440189
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
# Selecting on multi-axis by label: every row (:) and only columns 'A' and 'B'
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,2.105264,0.841475
2013-01-02,0.830835,-0.295906
2013-01-03,-0.681008,0.208177
2013-01-04,-0.682114,-0.362428
2013-01-05,0.759389,-0.472352
2013-01-06,1.560171,1.434012


In [29]:
# Label slicing with .loc: both endpoints of the row slice are included,
# combined here with a list of column labels
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.830835,-0.295906
2013-01-03,-0.681008,0.208177
2013-01-04,-0.682114,-0.362428


In [30]:
# Selection by integer position: position 3 is the fourth row (2013-01-04)
df.iloc[3]

A   -0.682114
B   -0.362428
C   -0.268454
D   -0.497651
Name: 2013-01-04 00:00:00, dtype: float64

In [31]:
# Selection by position slices, similar to numpy/python: the stop is excluded,
# so this picks row positions 3-4 and column positions 0-1
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.682114,-0.362428
2013-01-05,0.759389,-0.472352


In [32]:
# Selection by lists of integer positions, similar to numpy/python:
# row positions 1, 2 and 4 with column positions 0 and 2
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,0.830835,0.409144
2013-01-03,-0.681008,0.994045
2013-01-05,0.759389,-0.133386
