In [1]:
import numpy as np
import pandas as pd

## OBJECT Creation

In [34]:
s = pd.Series([1,3,4,5,6,np.nan,8]) # Series by passing list
print(s)

0    1.0
1    3.0
2    4.0
3    5.0
4    6.0
5    NaN
6    8.0
dtype: float64


In [35]:
dates = pd.date_range("20220101",periods=6)
print(dates)

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')


In [36]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
print(df) # data frame

                   A         B         C         D
2022-01-01 -0.352191  0.698909  0.732387 -0.972465
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-03  2.332927  0.866211 -0.539567  1.078459
2022-01-04 -0.352783  0.000415  0.320556  1.974563
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577
2022-01-06  0.702843  0.877232  0.552972 -0.335785


In [37]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


## Viewing Data

In [38]:
print(df.head()) # first few records

                   A         B         C         D
2022-01-01 -0.352191  0.698909  0.732387 -0.972465
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-03  2.332927  0.866211 -0.539567  1.078459
2022-01-04 -0.352783  0.000415  0.320556  1.974563
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577


In [39]:
print(df.tail()) # last few records

                   A         B         C         D
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-03  2.332927  0.866211 -0.539567  1.078459
2022-01-04 -0.352783  0.000415  0.320556  1.974563
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577
2022-01-06  0.702843  0.877232  0.552972 -0.335785


In [40]:
print(df.index)
print(df.columns)

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [41]:
print(df.to_numpy()) # will not include index and columns.

[[-3.52191182e-01  6.98909317e-01  7.32386805e-01 -9.72464984e-01]
 [ 6.50594743e-01  6.51617681e-01  3.57917261e-02  6.56556440e-01]
 [ 2.33292692e+00  8.66210872e-01 -5.39567213e-01  1.07845948e+00]
 [-3.52783066e-01  4.15391594e-04  3.20556314e-01  1.97456309e+00]
 [ 9.04198960e-01 -3.82315326e-02 -1.45039358e+00 -4.50577316e-01]
 [ 7.02843203e-01  8.77231574e-01  5.52971685e-01 -3.35784886e-01]]


In [42]:
print(df.describe()) # statistics of data

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.647598  0.509359 -0.058042  0.325125
std    0.990917  0.418980  0.814956  1.106369
min   -0.352783 -0.038232 -1.450394 -0.972465
25%   -0.101495  0.163216 -0.395727 -0.421879
50%    0.676719  0.675263  0.178174  0.160386
75%    0.853860  0.824385  0.494868  0.972984
max    2.332927  0.877232  0.732387  1.974563


In [43]:
print(df.T) # Transposing 

   2022-01-01  2022-01-02  2022-01-03  2022-01-04  2022-01-05  2022-01-06
A   -0.352191    0.650595    2.332927   -0.352783    0.904199    0.702843
B    0.698909    0.651618    0.866211    0.000415   -0.038232    0.877232
C    0.732387    0.035792   -0.539567    0.320556   -1.450394    0.552972
D   -0.972465    0.656556    1.078459    1.974563   -0.450577   -0.335785


## Sorting DF 

In [44]:
print(df.sort_index(axis=1, ascending=False))

                   D         C         B         A
2022-01-01 -0.972465  0.732387  0.698909 -0.352191
2022-01-02  0.656556  0.035792  0.651618  0.650595
2022-01-03  1.078459 -0.539567  0.866211  2.332927
2022-01-04  1.974563  0.320556  0.000415 -0.352783
2022-01-05 -0.450577 -1.450394 -0.038232  0.904199
2022-01-06 -0.335785  0.552972  0.877232  0.702843


In [45]:
print(df.sort_values(by="B"))

                   A         B         C         D
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577
2022-01-04 -0.352783  0.000415  0.320556  1.974563
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-01 -0.352191  0.698909  0.732387 -0.972465
2022-01-03  2.332927  0.866211 -0.539567  1.078459
2022-01-06  0.702843  0.877232  0.552972 -0.335785


## Selection

In [46]:
print(df["A"])

2022-01-01   -0.352191
2022-01-02    0.650595
2022-01-03    2.332927
2022-01-04   -0.352783
2022-01-05    0.904199
2022-01-06    0.702843
Freq: D, Name: A, dtype: float64


In [47]:
print(df[0:3])

                   A         B         C         D
2022-01-01 -0.352191  0.698909  0.732387 -0.972465
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-03  2.332927  0.866211 -0.539567  1.078459


In [48]:
print(df["20220102":"20220105"])

                   A         B         C         D
2022-01-02  0.650595  0.651618  0.035792  0.656556
2022-01-03  2.332927  0.866211 -0.539567  1.078459
2022-01-04 -0.352783  0.000415  0.320556  1.974563
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577


In [49]:
# By label
df.loc[dates[0]]

A   -0.352191
B    0.698909
C    0.732387
D   -0.972465
Name: 2022-01-01 00:00:00, dtype: float64

In [50]:
df.loc[:, ["A", "B"]] # By label

Unnamed: 0,A,B
2022-01-01,-0.352191,0.698909
2022-01-02,0.650595,0.651618
2022-01-03,2.332927,0.866211
2022-01-04,-0.352783,0.000415
2022-01-05,0.904199,-0.038232
2022-01-06,0.702843,0.877232


In [51]:
df.iloc[3] # by position

A   -0.352783
B    0.000415
C    0.320556
D    1.974563
Name: 2022-01-04 00:00:00, dtype: float64

In [52]:
df.iloc[3:5, 0:2] # by position

Unnamed: 0,A,B
2022-01-04,-0.352783,0.000415
2022-01-05,0.904199,-0.038232


## Boolean indexing

In [53]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-01-02,0.650595,0.651618,0.035792,0.656556
2022-01-03,2.332927,0.866211,-0.539567,1.078459
2022-01-05,0.904199,-0.038232,-1.450394,-0.450577
2022-01-06,0.702843,0.877232,0.552972,-0.335785


In [54]:
df[df > 0]

Unnamed: 0,A,B,C,D
2022-01-01,,0.698909,0.732387,
2022-01-02,0.650595,0.651618,0.035792,0.656556
2022-01-03,2.332927,0.866211,,1.078459
2022-01-04,,0.000415,0.320556,1.974563
2022-01-05,0.904199,,,
2022-01-06,0.702843,0.877232,0.552972,


In [55]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
print(df2)
df2[df2["E"].isin(["two", "four"])]

                   A         B         C         D      E
2022-01-01 -0.352191  0.698909  0.732387 -0.972465    one
2022-01-02  0.650595  0.651618  0.035792  0.656556    one
2022-01-03  2.332927  0.866211 -0.539567  1.078459    two
2022-01-04 -0.352783  0.000415  0.320556  1.974563  three
2022-01-05  0.904199 -0.038232 -1.450394 -0.450577   four
2022-01-06  0.702843  0.877232  0.552972 -0.335785  three


Unnamed: 0,A,B,C,D,E
2022-01-03,2.332927,0.866211,-0.539567,1.078459,two
2022-01-05,0.904199,-0.038232,-1.450394,-0.450577,four


## Setting

In [56]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [58]:
df["F"] = s1
df.at[dates[0], "A"] = 0
df.iat[0, 1] = 0
df.loc[:, "D"] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2022-01-01,0.0,0.0,0.732387,5,
2022-01-02,0.650595,0.651618,0.035792,5,
2022-01-03,2.332927,0.866211,-0.539567,5,
2022-01-04,-0.352783,0.000415,0.320556,5,
2022-01-05,0.904199,-0.038232,-1.450394,5,
2022-01-06,0.702843,0.877232,0.552972,5,


In [59]:
# Missing values
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2022-01-01,0.0,0.0,0.732387,5,,1.0
2022-01-02,0.650595,0.651618,0.035792,5,,1.0
2022-01-03,2.332927,0.866211,-0.539567,5,,
2022-01-04,-0.352783,0.000415,0.320556,5,,


In [60]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E


In [61]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2022-01-01,0.0,0.0,0.732387,5,5.0,1.0
2022-01-02,0.650595,0.651618,0.035792,5,5.0,1.0
2022-01-03,2.332927,0.866211,-0.539567,5,5.0,5.0
2022-01-04,-0.352783,0.000415,0.320556,5,5.0,5.0


In [62]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2022-01-01,False,False,False,False,True,False
2022-01-02,False,False,False,False,True,False
2022-01-03,False,False,False,False,True,True
2022-01-04,False,False,False,False,True,True


## Operations

In [65]:
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]),
        "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]),
    }
)
df

Unnamed: 0,one,two,three
a,0.286367,-1.83033,
b,-0.817257,-0.853601,0.456873
c,-0.150462,1.566086,-1.342931
d,,-1.212645,-0.246103


In [66]:
row = df.iloc[1]
row

one     -0.817257
two     -0.853601
three    0.456873
Name: b, dtype: float64

In [67]:
column = df["two"]
column

a   -1.830330
b   -0.853601
c    1.566086
d   -1.212645
Name: two, dtype: float64

In [68]:
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,1.103623,-0.97673,
b,0.0,0.0,0.0
c,0.666795,2.419687,-1.799804
d,,-0.359044,-0.702976


In [69]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,1.103623,-0.97673,
b,0.0,0.0,0.0
c,0.666795,2.419687,-1.799804
d,,-0.359044,-0.702976


In [70]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,2.116697,0.0,
b,0.036344,0.0,1.310474
c,-1.716548,0.0,-2.909017
d,,0.0,0.966541


In [71]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,2.116697,0.0,
b,0.036344,0.0,1.310474
c,-1.716548,0.0,-2.909017
d,,0.0,0.966541
