## 10 minutes to pandas¶


This is a short introduction to pandas, geared mainly for new users. You can see more complex recipes in the Cookbook.

Customarily, we import as follows:

In [2]:
import pandas as pd
import numpy as np

In [3]:
#Creating a series using a list of values
s = pd.Series([12, 23, 45, 56, np.nan, 67, 79])
s

0    12.0
1    23.0
2    45.0
3    56.0
4     NaN
5    67.0
6    79.0
dtype: float64

### Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [4]:
dates = pd.date_range("20220825", periods=10)
dates

DatetimeIndex(['2022-08-25', '2022-08-26', '2022-08-27', '2022-08-28',
               '2022-08-29', '2022-08-30', '2022-08-31', '2022-09-01',
               '2022-09-02', '2022-09-03'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,0.575011
2022-08-30,-0.115018,0.928585,-0.037071,-1.474907,0.891407
2022-08-31,-2.15802,-0.323944,-2.048508,-1.695539,-0.640472
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,-0.738966
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,0.936389
2022-09-03,-0.414598,-1.283458,-0.39965,0.68531,0.523531


### Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [6]:
df2 = pd.DataFrame(
                   {
                       "A": 13,
                       "B": pd.date_range("20220825", periods=4),
                       "C": pd.Series(32, index=list(range(4)), dtype="float32"),
                       "D": np.array([3, 45, 67, 5], dtype="int32"),
                       "E": pd.Categorical(["test", "train", "valid", "pass"]),
                       "F": "ladoo"
                   })
df2

Unnamed: 0,A,B,C,D,E,F
0,13,2022-08-25,32.0,3,test,ladoo
1,13,2022-08-26,32.0,45,train,ladoo
2,13,2022-08-27,32.0,67,valid,ladoo
3,13,2022-08-28,32.0,5,pass,ladoo


In [7]:
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
df.head()

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,0.575011


In [9]:
df.tail(3)

Unnamed: 0,A,B,C,D,E
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,-0.738966
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,0.936389
2022-09-03,-0.414598,-1.283458,-0.39965,0.68531,0.523531


In [10]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [11]:
df.index
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

### DataFrame.to_numpy() gives a NumPy representation of the underlying data.
**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.**

In [12]:
%%timeit
df.to_numpy()# this will not include index or columns 

9.73 µs ± 415 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [13]:
%%timeit
df2.to_numpy() # for a different dtype it a expensive operation as
               #take a longer time.

314 µs ± 2.39 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
df.describe() # a quick review of statistic of our data.

Unnamed: 0,A,B,C,D,E
count,10.0,10.0,10.0,10.0,10.0
mean,-0.349995,-0.027917,-0.215508,0.056156,-0.029616
std,1.043082,0.84662,1.133836,1.118165,0.821232
min,-2.15802,-1.50839,-2.048508,-1.695539,-0.913331
25%,-1.091237,-0.300431,-0.755558,-0.536927,-0.813953
50%,-0.264808,0.070336,-0.4325,0.060436,-0.05847
75%,0.268343,0.672042,0.481388,0.691323,0.715314
max,1.406171,0.928585,1.721116,1.759099,0.936389


In [15]:
df.T # transposing our data means changing rows and columns

Unnamed: 0,2022-08-25,2022-08-26,2022-08-27,2022-08-28,2022-08-29,2022-08-30,2022-08-31,2022-09-01,2022-09-02,2022-09-03
A,0.24702,1.406171,0.578421,0.275451,-1.146034,-0.115018,-2.15802,-0.926845,-1.246496,-0.414598
B,0.164244,0.781227,-0.023573,-1.50839,0.871547,0.928585,-0.323944,-0.229892,0.344487,-1.283458
C,-0.657532,-1.29336,0.654208,1.721116,-0.46535,-0.037071,-2.048508,1.159305,-0.788234,-0.39965
D,-0.025155,1.305422,0.693327,0.146027,-0.174185,-1.474907,-1.695539,-0.657841,1.759099,0.68531
E,-0.913331,-0.838948,0.762081,-0.852859,0.575011,0.891407,-0.640472,-0.738966,0.936389,0.523531


In [16]:
df.sort_index(axis=1, ascending=False)# sorting by an axis=1 means columns


Unnamed: 0,E,D,C,B,A
2022-08-25,-0.913331,-0.025155,-0.657532,0.164244,0.24702
2022-08-26,-0.838948,1.305422,-1.29336,0.781227,1.406171
2022-08-27,0.762081,0.693327,0.654208,-0.023573,0.578421
2022-08-28,-0.852859,0.146027,1.721116,-1.50839,0.275451
2022-08-29,0.575011,-0.174185,-0.46535,0.871547,-1.146034
2022-08-30,0.891407,-1.474907,-0.037071,0.928585,-0.115018
2022-08-31,-0.640472,-1.695539,-2.048508,-0.323944,-2.15802
2022-09-01,-0.738966,-0.657841,1.159305,-0.229892,-0.926845
2022-09-02,0.936389,1.759099,-0.788234,0.344487,-1.246496
2022-09-03,0.523531,0.68531,-0.39965,-1.283458,-0.414598


In [17]:
df.sort_index(axis=0, ascending=False)# sorting by an axis=0 means rows

Unnamed: 0,A,B,C,D,E
2022-09-03,-0.414598,-1.283458,-0.39965,0.68531,0.523531
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,0.936389
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,-0.738966
2022-08-31,-2.15802,-0.323944,-2.048508,-1.695539,-0.640472
2022-08-30,-0.115018,0.928585,-0.037071,-1.474907,0.891407
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,0.575011
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331


In [18]:
df.sort_values(by="B")# sorting by value found in specific columns.

Unnamed: 0,A,B,C,D,E
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859
2022-09-03,-0.414598,-1.283458,-0.39965,0.68531,0.523531
2022-08-31,-2.15802,-0.323944,-2.048508,-1.695539,-0.640472
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,-0.738966
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,0.936389
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,0.575011
2022-08-30,-0.115018,0.928585,-0.037071,-1.474907,0.891407


## Getting

In [19]:
df["A"]

2022-08-25    0.247020
2022-08-26    1.406171
2022-08-27    0.578421
2022-08-28    0.275451
2022-08-29   -1.146034
2022-08-30   -0.115018
2022-08-31   -2.158020
2022-09-01   -0.926845
2022-09-02   -1.246496
2022-09-03   -0.414598
Freq: D, Name: A, dtype: float64

In [20]:
df[0:3]

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081


In [21]:
df["20220825":"20220902"]

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,0.575011
2022-08-30,-0.115018,0.928585,-0.037071,-1.474907,0.891407
2022-08-31,-2.15802,-0.323944,-2.048508,-1.695539,-0.640472
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,-0.738966
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,0.936389


In [22]:
df.loc[dates[0]] # selecting through label 

A    0.247020
B    0.164244
C   -0.657532
D   -0.025155
E   -0.913331
Name: 2022-08-25 00:00:00, dtype: float64

In [23]:
df.loc[:, ["A", "B"]] # selecting on a multilabel label.

Unnamed: 0,A,B
2022-08-25,0.24702,0.164244
2022-08-26,1.406171,0.781227
2022-08-27,0.578421,-0.023573
2022-08-28,0.275451,-1.50839
2022-08-29,-1.146034,0.871547
2022-08-30,-0.115018,0.928585
2022-08-31,-2.15802,-0.323944
2022-09-01,-0.926845,-0.229892
2022-09-02,-1.246496,0.344487
2022-09-03,-0.414598,-1.283458


In [24]:
df.loc["20220825":"20220902", ["A","B"]]# both end points included.

Unnamed: 0,A,B
2022-08-25,0.24702,0.164244
2022-08-26,1.406171,0.781227
2022-08-27,0.578421,-0.023573
2022-08-28,0.275451,-1.50839
2022-08-29,-1.146034,0.871547
2022-08-30,-0.115018,0.928585
2022-08-31,-2.15802,-0.323944
2022-09-01,-0.926845,-0.229892
2022-09-02,-1.246496,0.344487


In [25]:
%%timeit
df.loc[dates[0], "A"]

74.1 µs ± 427 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [26]:
%%timeit    # for fast access we can use .at method
df.at[dates[0], "A"]

56.5 µs ± 220 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [27]:
df.iloc[3] # selecting via passed integers.

A    0.275451
B   -1.508390
C    1.721116
D    0.146027
E   -0.852859
Name: 2022-08-28 00:00:00, dtype: float64

In [28]:
df.iloc[3:5,0:2] # selecting via slices as in Numpy.

Unnamed: 0,A,B
2022-08-28,0.275451,-1.50839
2022-08-29,-1.146034,0.871547


In [29]:
df.iloc[[1,2,3],[0,2]]# selecting via lists of integer position locations,
                      #similar to the NumPy/Python style

Unnamed: 0,A,C
2022-08-26,1.406171,-1.29336
2022-08-27,0.578421,0.654208
2022-08-28,0.275451,1.721116


In [30]:
df.iloc[1:3, :] # selecting complete values of specific rows.

Unnamed: 0,A,B,C,D,E
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081


In [31]:
df.iloc[:, 1:3] # selecting complete values of specific columns.

Unnamed: 0,B,C
2022-08-25,0.164244,-0.657532
2022-08-26,0.781227,-1.29336
2022-08-27,-0.023573,0.654208
2022-08-28,-1.50839,1.721116
2022-08-29,0.871547,-0.46535
2022-08-30,0.928585,-0.037071
2022-08-31,-0.323944,-2.048508
2022-09-01,-0.229892,1.159305
2022-09-02,0.344487,-0.788234
2022-09-03,-1.283458,-0.39965


In [32]:
%%timeit
df.iloc[2,3] # seclecting the value in a dataframe

50.8 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [33]:
%%timeit
df.iat[2,3] # seclecting the value in a dataframe at faster pace


37.3 µs ± 132 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Boolean indexing

In [34]:
#Using a single column’s values to select data:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,-0.913331
2022-08-26,1.406171,0.781227,-1.29336,1.305422,-0.838948
2022-08-27,0.578421,-0.023573,0.654208,0.693327,0.762081
2022-08-28,0.275451,-1.50839,1.721116,0.146027,-0.852859


In [35]:
#Using a whole dataframe to select data:
df[df > 0]

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,,,
2022-08-26,1.406171,0.781227,,1.305422,
2022-08-27,0.578421,,0.654208,0.693327,0.762081
2022-08-28,0.275451,,1.721116,0.146027,
2022-08-29,,0.871547,,,0.575011
2022-08-30,,0.928585,,,0.891407
2022-08-31,,,,,
2022-09-01,,,1.159305,,
2022-09-02,,0.344487,,1.759099,0.936389
2022-09-03,,,,0.68531,0.523531


### Using the isin() method for filtering:

In [37]:
df2= df.copy()
df2["E"] = ["one", "one", "two", "three", "five", "four", "three", "ten", "four", "eleven"]
df2

Unnamed: 0,A,B,C,D,E
2022-08-25,0.24702,0.164244,-0.657532,-0.025155,one
2022-08-26,1.406171,0.781227,-1.29336,1.305422,one
2022-08-27,0.578421,-0.023573,0.654208,0.693327,two
2022-08-28,0.275451,-1.50839,1.721116,0.146027,three
2022-08-29,-1.146034,0.871547,-0.46535,-0.174185,five
2022-08-30,-0.115018,0.928585,-0.037071,-1.474907,four
2022-08-31,-2.15802,-0.323944,-2.048508,-1.695539,three
2022-09-01,-0.926845,-0.229892,1.159305,-0.657841,ten
2022-09-02,-1.246496,0.344487,-0.788234,1.759099,four
2022-09-03,-0.414598,-1.283458,-0.39965,0.68531,eleven
