Import required Packages:

In [1]:
import numpy as np
import pandas as pd

# Object Creation
Create a `Series` by passing a list of values; pandas assigns a default integer index.

In [2]:
# A Series built from a plain list gets the default integer index (0..n-1);
# the np.nan entry means the values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Create a DataFrame by passing a NumPy array, with a datetime index and labeled columns.

In [3]:
# Six consecutive daily timestamps starting on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of random normal draws: one row per date, one column per letter.
# No seed is set, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.209066,2.130952,-1.401477,1.150026
2013-01-02,0.14554,0.189054,0.146913,0.420898
2013-01-03,-0.952307,1.528567,0.73205,-1.218387
2013-01-04,0.131811,-0.877965,-1.164874,0.197635
2013-01-05,-1.401776,-0.755934,0.271662,1.104226
2013-01-06,-0.973918,0.001412,-0.372974,0.268363


# Create a DataFrame from a dict of objects

In [5]:
# Each keyword becomes a column. The scalar values ('A', 'B', 'F') are repeated
# to match the four-element array-like columns.
df2 = pd.DataFrame(
    dict(
        A=1.,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.array([3] * 4, dtype='int32'),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


The columns of the resulting DataFrame have different dtypes.

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In Jupyter, ```tab``` completion for column names (as well as public attributes) is automatically enabled. Here’s a subset of the attributes that can be completed:

In [7]:
# Tab completion is interactive: type `df2.` in a cell and press <TAB> to list
# the column names and public attributes. The literal `df2.# PRESS <TAB>` is a
# SyntaxError, so it is kept only as this comment to let "Restart & Run All" pass.
# The first few public names can also be listed programmatically:
[name for name in dir(df2) if not name.startswith('_')][:10]

# Viewing Data
Here is how to view the top and bottom rows of the frame:

In [8]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.209066,2.130952,-1.401477,1.150026
2013-01-02,0.14554,0.189054,0.146913,0.420898
2013-01-03,-0.952307,1.528567,0.73205,-1.218387
2013-01-04,0.131811,-0.877965,-1.164874,0.197635
2013-01-05,-1.401776,-0.755934,0.271662,1.104226


In [9]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.131811,-0.877965,-1.164874,0.197635
2013-01-05,-1.401776,-0.755934,0.271662,1.104226
2013-01-06,-0.973918,0.001412,-0.372974,0.268363


In [10]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

# Convert DataFrame to numpy array.



In [12]:
df.to_numpy()

array([[ 2.09065900e-01,  2.13095199e+00, -1.40147712e+00,
         1.15002620e+00],
       [ 1.45540036e-01,  1.89053825e-01,  1.46912626e-01,
         4.20898345e-01],
       [-9.52307288e-01,  1.52856733e+00,  7.32049713e-01,
        -1.21838740e+00],
       [ 1.31811368e-01, -8.77964738e-01, -1.16487398e+00,
         1.97634945e-01],
       [-1.40177619e+00, -7.55934176e-01,  2.71662197e-01,
         1.10422564e+00],
       [-9.73917594e-01,  1.41151997e-03, -3.72974255e-01,
         2.68362607e-01]])

NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call `DataFrame.to_numpy()`, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being `object`, which requires casting every value to a Python object. For large frames, this can be a very expensive operation in both time and memory.

In [13]:
df.describe() # shows a quick statistical summary of each column (count, mean, std, min, quartiles, max)

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.473597,0.369348,-0.298117,0.32046
std,0.715106,1.219588,0.84359,0.860047
min,-1.401776,-0.877965,-1.401477,-1.218387
25%,-0.968515,-0.566598,-0.966899,0.215317
50%,-0.410248,0.095233,-0.113031,0.34463
75%,0.142108,1.193689,0.240475,0.933394
max,0.209066,2.130952,0.73205,1.150026


In [14]:
# transpose the data
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.209066,0.14554,-0.952307,0.131811,-1.401776,-0.973918
B,2.130952,0.189054,1.528567,-0.877965,-0.755934,0.001412
C,-1.401477,0.146913,0.73205,-1.164874,0.271662,-0.372974
D,1.150026,0.420898,-1.218387,0.197635,1.104226,0.268363


In [15]:
# sort the columns by their labels (axis=1) in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.150026,-1.401477,2.130952,0.209066
2013-01-02,0.420898,0.146913,0.189054,0.14554
2013-01-03,-1.218387,0.73205,1.528567,-0.952307
2013-01-04,0.197635,-1.164874,-0.877965,0.131811
2013-01-05,1.104226,0.271662,-0.755934,-1.401776
2013-01-06,0.268363,-0.372974,0.001412,-0.973918


In [16]:
# We can also sort DataFrame by values in specific column.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,0.131811,-0.877965,-1.164874,0.197635
2013-01-05,-1.401776,-0.755934,0.271662,1.104226
2013-01-06,-0.973918,0.001412,-0.372974,0.268363
2013-01-02,0.14554,0.189054,0.146913,0.420898
2013-01-03,-0.952307,1.528567,0.73205,-1.218387
2013-01-01,0.209066,2.130952,-1.401477,1.150026


# Data Filtering

In [17]:
# Rebuild the date index and draw a fresh 6x4 frame of random values.
# This rebinds both `dates` and `df`.
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [18]:
# Mixed-dtype frame built from keyword columns; the scalar values are repeated
# to match the four-element array-like columns.
df2 = pd.DataFrame(
    dict(
        A=1.,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.array([3] * 4, dtype='int32'),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F='foo',
    )
)


## Getting Values

| Operation | Syntax | Result |
| --- | --- | --- |
| Select column	| ```df[col]``` |	Series |
| Select row by label | ```df.loc[label]``` | Series |
| Select row by integer location |```df.iloc[loc]``` | Series |
| Slice rows | ```df[5:10]```	| DataFrame |
| Select rows by boolean vector	| ```df[bool_vec]```	| DataFrame |

Select a single column.

In [19]:
df['A']

2013-01-01    0.727984
2013-01-02   -1.217949
2013-01-03    0.234263
2013-01-04   -0.852261
2013-01-05    1.205890
2013-01-06    1.099223
Freq: D, Name: A, dtype: float64

In [20]:
df.A # attribute access: an equivalent way of selecting column 'A'

2013-01-01    0.727984
2013-01-02   -1.217949
2013-01-03    0.234263
2013-01-04   -0.852261
2013-01-05    1.205890
2013-01-06    1.099223
Freq: D, Name: A, dtype: float64

The first of the above two options is recommended because it avoids possible conflicts with any of the DataFrame methods.

Selecting via `[]`, which slices the rows.

In [21]:
df[0:3] # slicing the rows

Unnamed: 0,A,B,C,D
2013-01-01,0.727984,-1.61709,0.924986,-0.501922
2013-01-02,-1.217949,-0.536806,2.212739,-1.664632
2013-01-03,0.234263,-1.03811,-1.533451,1.221825


In [22]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.217949,-0.536806,2.212739,-1.664632
2013-01-03,0.234263,-1.03811,-1.533451,1.221825
2013-01-04,-0.852261,0.160874,-1.107833,1.150848


# Selection by label


In [23]:
df.loc["2013-01-01"] # select first row based on index value

A    0.727984
B   -1.617090
C    0.924986
D   -0.501922
Name: 2013-01-01 00:00:00, dtype: float64

In [24]:
df.loc[:, ['A', 'B']] # select all rows of columns 'A' and 'B' by column label

Unnamed: 0,A,B
2013-01-01,0.727984,-1.61709
2013-01-02,-1.217949,-0.536806
2013-01-03,0.234263,-1.03811
2013-01-04,-0.852261,0.160874
2013-01-05,1.20589,-0.024379
2013-01-06,1.099223,-0.238357


The `:` selects all the rows, and the list `['A', 'B']` names the columns to return.

We can also use label slicing and include both endpoints:

In [25]:
df.loc['20130102':'20130104', ['A', 'B']] # we can use label slicing and include both endpoints

Unnamed: 0,A,B
2013-01-02,-1.217949,-0.536806
2013-01-03,0.234263,-1.03811
2013-01-04,-0.852261,0.160874


The type of the returned object changes automatically with the dimensionality of the selection.

The dimension is the number of axes the object has. For example, a `DataFrame` is made of rows and columns; therefore, its dimension is 2. A `Series`, on the other hand, has dimension 1 because it has no columns.

In [26]:
df.loc['20130102', ['A', 'B']] # a single row label drops one dimension, so the result is a Series

A   -1.217949
B   -0.536806
Name: 2013-01-02 00:00:00, dtype: float64

In [27]:
df.loc[dates[0], 'A'] # one row label and one column label return a single scalar value

0.7279839234319424

# Selection by Position

We can also select based on the actual position in DataFrame

In [28]:
df.iloc[3]

A   -0.852261
B    0.160874
C   -1.107833
D    1.150848
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
df.iloc[3:5, 0:2] # slicing is available in a similar way to numpy/python style

Unnamed: 0,A,B
2013-01-04,-0.852261,0.160874
2013-01-05,1.20589,-0.024379


In [30]:
df.iloc[1:3, :] # rows at positions 1 and 2; the bare : takes all the columns.
# Similarly, a bare : in the first position takes all the rows.

Unnamed: 0,A,B,C,D
2013-01-02,-1.217949,-0.536806,2.212739,-1.664632
2013-01-03,0.234263,-1.03811,-1.533451,1.221825


# Selection by dtype

The ```select_dtypes()``` method implements subsetting of columns based on their ```dtype```. By subsetting, we mean taking only the selection of columns based on their ```dtype```.

In [31]:
# One column per dtype, each named after the dtype it holds, so that
# select_dtypes() has something to pick from. 'dates' starts at the current time.
df1 = pd.DataFrame(
    {
        'string': ['a', 'b', 'c'],
        'int64': [1, 2, 3],
        'uint8': np.arange(3, 6, dtype='u1'),
        'float64': np.arange(4.0, 7.0),
        'bool1': [True, False, True],
        'bool2': [False, True, False],
        'dates': pd.date_range('now', periods=3),
        'category': pd.Series(list("ABC")).astype('category'),
    }
)

Select only the bool columns from `df1` above.

In [32]:
df1.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


# Boolean Indexing

Use a column's values to filter data. Take the rows where column A is greater than 0.

In [33]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.727984,-1.61709,0.924986,-0.501922
2013-01-03,0.234263,-1.03811,-1.533451,1.221825
2013-01-05,1.20589,-0.024379,-0.344424,1.306614
2013-01-06,1.099223,-0.238357,-1.716726,-1.427502


We can also use the ```isin()``` method for filtering.
1. Create a copy of ```df``` and store it in variable ```df2```.
2. Create a new column E in the DataFrame df2 with values ```['one', 'one', 'two', 'three', 'four', 'three']```.

In [34]:
# assign() works on a copy of df, so `df` itself stays untouched; the returned
# frame carries the extra label column 'E'.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])

Now we can use the `isin()` method to take only rows where E is one or two.

In [35]:
df2[df2['E'].isin(['one','two'])]

Unnamed: 0,A,B,C,D,E
2013-01-01,0.727984,-1.61709,0.924986,-0.501922,one
2013-01-02,-1.217949,-0.536806,2.212739,-1.664632,one
2013-01-03,0.234263,-1.03811,-1.533451,1.221825,two


We can also set values in the DataFrame.

Setting values by position:

In [39]:
# iat sets a single value by integer position: row 0, column 1.
df1.iat[0, 1] = -1 # or

In [40]:
# iloc takes the same integer positions (row 0, column 1) and sets that cell.
df1.iloc[0, 1] = 2

Setting values by label:

In [41]:
# at sets a single value by label: row label 0, column 'float64'.
df1.at[0, 'float64'] = -10 # or

In [42]:
# loc takes the same labels (row label 0, column 'float64') and sets that cell.
df1.loc[0, 'float64'] = -20

Setting by assigning with a NumPy array:

In [43]:
# The replacement array needs one entry per row of df1 (3 rows). Sizing it
# with len(df) (6 rows) raises
# "ValueError: Must have equal len keys and value when setting with an iterable".
df1.loc[:, 'uint8'] = np.array([50] * len(df1))

The length of the array on the right side of `=` needs to be the same as the length of the object on the left.