## 10 minutes to pandas


This is a short introduction to pandas, geared mainly for new users. You can see more complex recipes in the Cookbook.

Customarily, we import as follows:

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Build a Series from a plain list of values. pandas supplies a default
# integer index, and the missing value (np.nan) makes the dtype float64.
s = pd.Series(data=[12, 23, 45, 56, np.nan, 67, 79])
s

0    12.0
1    23.0
2    45.0
3    56.0
4     NaN
5    67.0
6    79.0
dtype: float64

### Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [4]:
# Ten consecutive calendar days starting on 2022-08-25; used as the row
# index of `df` below.
dates = pd.date_range(start="20220825", periods=10)
dates

DatetimeIndex(['2022-08-25', '2022-08-26', '2022-08-27', '2022-08-28',
               '2022-08-29', '2022-08-30', '2022-08-31', '2022-09-01',
               '2022-09-02', '2022-09-03'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 10x5 frame of random draws from np.random.randn, indexed by `dates` and
# with columns labeled A-E.
# NOTE(review): no random seed is set in the visible cells, so the values
# shown in the outputs below will differ on every re-run.
df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=list("ABCDE"))
df

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-28,-0.360397,-0.201981,-0.279964,-0.153894,0.390967
2022-08-29,-2.136704,0.050696,1.412433,0.225615,0.095404
2022-08-30,0.617531,0.149137,-1.615549,-0.464108,-0.439571
2022-08-31,0.601921,-1.500341,0.076425,0.474716,-0.858218
2022-09-01,-0.936094,-2.876851,-0.752564,1.506076,0.254695
2022-09-02,-1.011871,-0.185457,1.176216,-0.06931,-0.160968
2022-09-03,-0.335959,0.399276,-0.882573,1.000915,0.207915


### Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [6]:
# Build a DataFrame from a dict: each value becomes one column, and scalar
# values ("A" and "F") are repeated for every row.
df2 = pd.DataFrame(
    {
        "A": 13,  # scalar integer
        "B": pd.date_range(start="20220825", periods=4),  # four consecutive days
        "C": pd.Series(32, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 45, 67, 5], dtype="int32"),
        "E": pd.Categorical(["test", "train", "valid", "pass"]),
        "F": "ladoo",  # scalar string
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,13,2022-08-25,32.0,3,test,ladoo
1,13,2022-08-26,32.0,45,train,ladoo
2,13,2022-08-27,32.0,67,valid,ladoo
3,13,2022-08-28,32.0,5,pass,ladoo


In [7]:
df2.dtypes  # each column of a DataFrame carries its own dtype

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
df.head()  # first rows of the frame (five when no count is given)

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-28,-0.360397,-0.201981,-0.279964,-0.153894,0.390967
2022-08-29,-2.136704,0.050696,1.412433,0.225615,0.095404


In [9]:
df.tail(3)  # last three rows of the frame

Unnamed: 0,A,B,C,D,E
2022-09-01,-0.936094,-2.876851,-0.752564,1.506076,0.254695
2022-09-02,-1.011871,-0.185457,1.176216,-0.06931,-0.160968
2022-09-03,-0.335959,0.399276,-0.882573,1.000915,0.207915


In [10]:
df2.index  # row labels of df2

Int64Index([0, 1, 2, 3], dtype='int64')

In [11]:
df.index  # NOTE(review): not displayed -- a cell only shows the value of its last expression
df.columns  # column labels of df

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

### DataFrame.to_numpy() gives a NumPy representation of the underlying data.
**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.**

In [12]:
%%timeit
df.to_numpy()  # the array holds only the values; index and column labels are not included

2.16 µs ± 27.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [13]:
%%timeit
df2.to_numpy()  # df2 has a different dtype per column, so the conversion is
                # more expensive and takes noticeably longer than for df

69.9 µs ± 540 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [14]:
df.describe()  # quick statistical summary of each column

Unnamed: 0,A,B,C,D,E
count,10.0,10.0,10.0,10.0,10.0
mean,-0.101292,-0.387051,-0.154874,0.390395,0.028764
std,1.107424,1.138191,0.987311,0.599086,0.970562
min,-2.136704,-2.876851,-1.615549,-0.464108,-1.901373
25%,-0.79217,-0.913825,-0.850071,-0.046799,-0.36992
50%,-0.05378,-0.067381,-0.112424,0.350166,0.151659
75%,0.56971,0.336742,0.351412,0.747165,0.356899
max,1.847174,0.724557,1.412433,1.506076,1.391358


In [15]:
df.T  # transpose: rows become columns and columns become rows

Unnamed: 0,2022-08-25,2022-08-26,2022-08-27,2022-08-28,2022-08-29,2022-08-30,2022-08-31,2022-09-01,2022-09-02,2022-09-03
A,1.847174,0.228398,0.47308,-0.360397,-2.136704,0.617531,0.601921,-0.936094,-1.011871,-0.335959
B,0.721562,0.724557,-1.151106,-0.201981,0.050696,0.149137,-1.500341,-2.876851,-0.185457,0.399276
C,0.443074,0.055115,-1.181357,-0.279964,1.412433,-1.615549,0.076425,-0.752564,1.176216,-0.882573
D,0.020732,0.812724,0.550487,-0.153894,0.225615,-0.464108,0.474716,1.506076,-0.06931,1.000915
E,1.307426,1.391358,-1.901373,0.390967,0.095404,-0.439571,-0.858218,0.254695,-0.160968,0.207915


In [16]:
df.sort_index(axis=1, ascending=False)  # axis=1 sorts by the column labels (descending here)


Unnamed: 0,E,D,C,B,A
2022-08-25,1.307426,0.020732,0.443074,0.721562,1.847174
2022-08-26,1.391358,0.812724,0.055115,0.724557,0.228398
2022-08-27,-1.901373,0.550487,-1.181357,-1.151106,0.47308
2022-08-28,0.390967,-0.153894,-0.279964,-0.201981,-0.360397
2022-08-29,0.095404,0.225615,1.412433,0.050696,-2.136704
2022-08-30,-0.439571,-0.464108,-1.615549,0.149137,0.617531
2022-08-31,-0.858218,0.474716,0.076425,-1.500341,0.601921
2022-09-01,0.254695,1.506076,-0.752564,-2.876851,-0.936094
2022-09-02,-0.160968,-0.06931,1.176216,-0.185457,-1.011871
2022-09-03,0.207915,1.000915,-0.882573,0.399276,-0.335959


In [17]:
df.sort_index(axis=0, ascending=False)  # axis=0 sorts by the row labels, i.e. the index (descending here)

Unnamed: 0,A,B,C,D,E
2022-09-03,-0.335959,0.399276,-0.882573,1.000915,0.207915
2022-09-02,-1.011871,-0.185457,1.176216,-0.06931,-0.160968
2022-09-01,-0.936094,-2.876851,-0.752564,1.506076,0.254695
2022-08-31,0.601921,-1.500341,0.076425,0.474716,-0.858218
2022-08-30,0.617531,0.149137,-1.615549,-0.464108,-0.439571
2022-08-29,-2.136704,0.050696,1.412433,0.225615,0.095404
2022-08-28,-0.360397,-0.201981,-0.279964,-0.153894,0.390967
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426


In [18]:
df.sort_values(by="B")  # sort the rows by the values in column B (ascending by default)

Unnamed: 0,A,B,C,D,E
2022-09-01,-0.936094,-2.876851,-0.752564,1.506076,0.254695
2022-08-31,0.601921,-1.500341,0.076425,0.474716,-0.858218
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-28,-0.360397,-0.201981,-0.279964,-0.153894,0.390967
2022-09-02,-1.011871,-0.185457,1.176216,-0.06931,-0.160968
2022-08-29,-2.136704,0.050696,1.412433,0.225615,0.095404
2022-08-30,0.617531,0.149137,-1.615549,-0.464108,-0.439571
2022-09-03,-0.335959,0.399276,-0.882573,1.000915,0.207915
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358


## Getting

In [19]:
df["A"]  # selecting a single column by label yields a Series

2022-08-25    1.847174
2022-08-26    0.228398
2022-08-27    0.473080
2022-08-28   -0.360397
2022-08-29   -2.136704
2022-08-30    0.617531
2022-08-31    0.601921
2022-09-01   -0.936094
2022-09-02   -1.011871
2022-09-03   -0.335959
Freq: D, Name: A, dtype: float64

In [20]:
df[0:3]  # slicing with [] selects rows by position; the stop position is excluded

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373


In [21]:
df["20220825":"20220902"]  # slicing rows by date label; both endpoints are included

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-28,-0.360397,-0.201981,-0.279964,-0.153894,0.390967
2022-08-29,-2.136704,0.050696,1.412433,0.225615,0.095404
2022-08-30,0.617531,0.149137,-1.615549,-0.464108,-0.439571
2022-08-31,0.601921,-1.500341,0.076425,0.474716,-0.858218
2022-09-01,-0.936094,-2.876851,-0.752564,1.506076,0.254695
2022-09-02,-1.011871,-0.185457,1.176216,-0.06931,-0.160968


In [22]:
df.loc[dates[0]]  # select one row by its label; the result is a Series of that row's values

A    1.847174
B    0.721562
C    0.443074
D    0.020732
E    1.307426
Name: 2022-08-25 00:00:00, dtype: float64

In [23]:
df.loc[:, ["A", "B"]]  # all rows, with columns selected by a list of labels

Unnamed: 0,A,B
2022-08-25,1.847174,0.721562
2022-08-26,0.228398,0.724557
2022-08-27,0.47308,-1.151106
2022-08-28,-0.360397,-0.201981
2022-08-29,-2.136704,0.050696
2022-08-30,0.617531,0.149137
2022-08-31,0.601921,-1.500341
2022-09-01,-0.936094,-2.876851
2022-09-02,-1.011871,-0.185457
2022-09-03,-0.335959,0.399276


In [25]:
df.loc["20220825":"20220902", ["A","B"]]  # label slices include both endpoints

Unnamed: 0,A,B
2022-08-25,1.847174,0.721562
2022-08-26,0.228398,0.724557
2022-08-27,0.47308,-1.151106
2022-08-28,-0.360397,-0.201981
2022-08-29,-2.136704,0.050696
2022-08-30,0.617531,0.149137
2022-08-31,0.601921,-1.500341
2022-09-01,-0.936094,-2.876851
2022-09-02,-1.011871,-0.185457


In [27]:
%%timeit
df.loc[dates[0], "A"]  # scalar lookup by row label and column label

16 µs ± 215 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [28]:
%%timeit
df.at[dates[0], "A"]  # for fast access to a single value, use the .at accessor

12.7 µs ± 40.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [29]:
df.iloc[3]  # select a row by integer position (the fourth row)

A   -0.360397
B   -0.201981
C   -0.279964
D   -0.153894
E    0.390967
Name: 2022-08-28 00:00:00, dtype: float64

In [30]:
df.iloc[3:5,0:2]  # positional slices behave as in NumPy/Python: the stop position is excluded

Unnamed: 0,A,B
2022-08-28,-0.360397,-0.201981
2022-08-29,-2.136704,0.050696


In [32]:
df.iloc[[1,2,3],[0,2]]  # select by lists of integer positions,
                        # similar to the NumPy/Python style

Unnamed: 0,A,C
2022-08-26,0.228398,0.055115
2022-08-27,0.47308,-1.181357
2022-08-28,-0.360397,-0.279964


In [33]:
df.iloc[1:3, :]  # slice rows by position, keeping every column

Unnamed: 0,A,B,C,D,E
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373


In [34]:
df.iloc[:, 1:3]  # slice columns by position, keeping every row

Unnamed: 0,B,C
2022-08-25,0.721562,0.443074
2022-08-26,0.724557,0.055115
2022-08-27,-1.151106,-1.181357
2022-08-28,-0.201981,-0.279964
2022-08-29,0.050696,1.412433
2022-08-30,0.149137,-1.615549
2022-08-31,-1.500341,0.076425
2022-09-01,-2.876851,-0.752564
2022-09-02,-0.185457,1.176216
2022-09-03,0.399276,-0.882573


In [37]:
%%timeit
df.iloc[2,3]  # get a single value by integer row and column position

11.1 µs ± 175 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [38]:
%%timeit
df.iat[2,3]  # same single-value lookup by position, using the faster .iat accessor


8.27 µs ± 44.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## Boolean indexing

In [40]:
# Use a single column's values to select rows: keep only rows where A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,-1.151106,-1.181357,0.550487,-1.901373
2022-08-30,0.617531,0.149137,-1.615549,-0.464108,-0.439571
2022-08-31,0.601921,-1.500341,0.076425,0.474716,-0.858218


In [41]:
# Apply a boolean condition to the whole DataFrame: the shape is kept and
# every value that fails the condition is shown as missing (NaN).
df[df > 0]

Unnamed: 0,A,B,C,D,E
2022-08-25,1.847174,0.721562,0.443074,0.020732,1.307426
2022-08-26,0.228398,0.724557,0.055115,0.812724,1.391358
2022-08-27,0.47308,,,0.550487,
2022-08-28,,,,,0.390967
2022-08-29,,0.050696,1.412433,0.225615,0.095404
2022-08-30,0.617531,0.149137,,,
2022-08-31,0.601921,,0.076425,0.474716,
2022-09-01,,,,1.506076,0.254695
2022-09-02,,,1.176216,,
2022-09-03,,0.399276,,1.000915,0.207915
