## 10 minutes to pandas
#### Source: https://pandas.pydata.org/docs/user_guide/10min.html

## Import packages

In [1]:
import numpy as np

In [2]:
import pandas as pd

## Object creation

##### Creating a Series by passing a list of values, letting pandas create a default integer index:

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

##### Creating a DataFrame by passing a NumPy array with a datetime index and labeled columns:

In [5]:
dates = pd.date_range('20130101', periods=6)

In [6]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

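##### Build the DataFrame from a NumPy array of random values, using `dates` as the index and A–D as the column labels. (Creating a DataFrame by passing a dict of objects that can be converted to series-like is shown in In [9] below.)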

In [7]:
# 6 rows x 4 columns of standard-normal samples. No random seed is set above,
# so re-running this cell will not reproduce the exact numbers shown in the outputs.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [8]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.079919,0.951168,0.032814,-1.551252
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616
2013-01-04,1.47394,0.607995,0.388921,-0.30985
2013-01-05,-0.38573,0.705376,1.574078,0.176116
2013-01-06,1.017695,-1.54179,-0.357102,0.534296


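##### Creating a DataFrame by passing a dict of objects that can be converted to series-like; the resulting columns have different dtypes:

In [9]: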
# Each dict key becomes a column. The scalar entries ("A", "B", "F") are
# repeated for every one of the four rows, as the output below shows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [10]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

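##### In IPython, tab completion for column names (as well as public attributes) is enabled automatically. Here is a subset of the attributes that will be completed; the columns A, B, C and D appear among them, and the rest of the list is truncated for brevity:

In [12]:
df2.<TAB>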
df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.columns
df2.align              df2.copy
df2.all                df2.count
df2.any                df2.combine
df2.append             df2.D
df2.apply              df2.describe
df2.applymap           df2.diff
df2.B                  df2.duplicated

## Viewing data

##### Here is how to view the top and bottom rows of the frame:

In [13]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.079919,0.951168,0.032814,-1.551252
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616
2013-01-04,1.47394,0.607995,0.388921,-0.30985
2013-01-05,-0.38573,0.705376,1.574078,0.176116


In [14]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,1.47394,0.607995,0.388921,-0.30985
2013-01-05,-0.38573,0.705376,1.574078,0.176116
2013-01-06,1.017695,-1.54179,-0.357102,0.534296


##### Display the index, columns:

In [15]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

##### DataFrame.to_numpy() gives a NumPy representation of the underlying data. For df, whose columns are all float64, it is fast and doesn’t require copying data:

In [17]:
df.to_numpy()

array([[ 1.07991939,  0.95116824,  0.03281374, -1.5512518 ],
       [ 0.93910331, -1.5301398 , -1.45761216,  0.6737342 ],
       [-1.25854395, -0.17295594,  0.28286889, -0.07661633],
       [ 1.47393985,  0.60799516,  0.38892133, -0.3098496 ],
       [-0.38573025,  0.7053759 ,  1.57407831,  0.17611609],
       [ 1.01769484, -1.54179006, -0.35710195,  0.5342956 ]])

##### For df2, the DataFrame with multiple dtypes, DataFrame.to_numpy() is relatively expensive, because every value has to be converted to one common dtype (here `object`):

In [18]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

##### describe() shows a quick statistical summary of your data:

In [19]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.477731,-0.163391,0.077328,-0.092262
std,1.060161,1.12784,0.992608,0.803442
min,-1.258544,-1.54179,-1.457612,-1.551252
25%,-0.054522,-1.190844,-0.259623,-0.251541
50%,0.978399,0.21752,0.157841,0.04975
75%,1.064363,0.681031,0.362408,0.444751
max,1.47394,0.951168,1.574078,0.673734


##### Transposing your data:

In [20]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.079919,0.939103,-1.258544,1.47394,-0.38573,1.017695
B,0.951168,-1.53014,-0.172956,0.607995,0.705376,-1.54179
C,0.032814,-1.457612,0.282869,0.388921,1.574078,-0.357102
D,-1.551252,0.673734,-0.076616,-0.30985,0.176116,0.534296


##### Sorting by an axis:

In [21]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.551252,0.032814,0.951168,1.079919
2013-01-02,0.673734,-1.457612,-1.53014,0.939103
2013-01-03,-0.076616,0.282869,-0.172956,-1.258544
2013-01-04,-0.30985,0.388921,0.607995,1.47394
2013-01-05,0.176116,1.574078,0.705376,-0.38573
2013-01-06,0.534296,-0.357102,-1.54179,1.017695


##### Sorting by values:

In [22]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-06,1.017695,-1.54179,-0.357102,0.534296
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616
2013-01-04,1.47394,0.607995,0.388921,-0.30985
2013-01-05,-0.38573,0.705376,1.574078,0.176116
2013-01-01,1.079919,0.951168,0.032814,-1.551252


## Getting (selection with [])

##### Selecting a single column, which yields a Series, equivalent to df.A:

In [23]:
df['A']

2013-01-01    1.079919
2013-01-02    0.939103
2013-01-03   -1.258544
2013-01-04    1.473940
2013-01-05   -0.385730
2013-01-06    1.017695
Freq: D, Name: A, dtype: float64

##### Selecting via [] with a slice, which slices the rows (by position, or by index label):

In [24]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.079919,0.951168,0.032814,-1.551252
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616


In [25]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616
2013-01-04,1.47394,0.607995,0.388921,-0.30985


## Selection by label

##### For getting a cross section using a label:

In [26]:
df.loc[dates[0]]

A    1.079919
B    0.951168
C    0.032814
D   -1.551252
Name: 2013-01-01 00:00:00, dtype: float64

##### Selecting on a multi-axis by label:

In [27]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.079919,0.951168
2013-01-02,0.939103,-1.53014
2013-01-03,-1.258544,-0.172956
2013-01-04,1.47394,0.607995
2013-01-05,-0.38573,0.705376
2013-01-06,1.017695,-1.54179


##### Showing label slicing, both endpoints are included:

In [28]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.939103,-1.53014
2013-01-03,-1.258544,-0.172956
2013-01-04,1.47394,0.607995


##### Reduction in the dimensions of the returned object:

In [29]:
df.loc['20130102',['A', 'B']]

A    0.939103
B   -1.530140
Name: 2013-01-02 00:00:00, dtype: float64

##### For getting a scalar value:

In [30]:
df.loc[dates[0], 'A']

1.0799193853272295

##### For getting fast access to a scalar (equivalent to the prior method):

In [31]:
df.at[dates[0], 'A']

1.0799193853272295

## Selection by position

##### Select via the position of the passed integers:

In [32]:
df.iloc[3]

A    1.473940
B    0.607995
C    0.388921
D   -0.309850
Name: 2013-01-04 00:00:00, dtype: float64

##### By integer slices, acting similarly to NumPy/Python:

In [33]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.47394,0.607995
2013-01-05,-0.38573,0.705376


##### By lists of integer position locations, similar to the NumPy/Python style:

In [34]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.939103,-1.457612
2013-01-03,-1.258544,0.282869
2013-01-05,-0.38573,1.574078


##### For slicing rows explicitly:

In [35]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616


##### For slicing columns explicitly:

In [36]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.951168,0.032814
2013-01-02,-1.53014,-1.457612
2013-01-03,-0.172956,0.282869
2013-01-04,0.607995,0.388921
2013-01-05,0.705376,1.574078
2013-01-06,-1.54179,-0.357102


##### For getting a value explicitly:

In [37]:
df.iloc[1,1]

-1.5301398042541052

##### For getting fast access to a scalar (equivalent to the prior method):

In [38]:
df.iat[1,1]

-1.5301398042541052

## Boolean indexing

##### Using a single column’s values to select data.

In [39]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.079919,0.951168,0.032814,-1.551252
2013-01-02,0.939103,-1.53014,-1.457612,0.673734
2013-01-04,1.47394,0.607995,0.388921,-0.30985
2013-01-06,1.017695,-1.54179,-0.357102,0.534296


##### Selecting values from a DataFrame where a boolean condition is met.

In [40]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.079919,0.951168,0.032814,
2013-01-02,0.939103,,,0.673734
2013-01-03,,,0.282869,
2013-01-04,1.47394,0.607995,0.388921,
2013-01-05,,0.705376,1.574078,0.176116
2013-01-06,1.017695,,,0.534296


##### Using the isin() method for filtering (note that df2 is rebound here to a copy of df with an extra column E):

In [41]:
df2 = df.copy()

In [42]:
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [43]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.079919,0.951168,0.032814,-1.551252,one
2013-01-02,0.939103,-1.53014,-1.457612,0.673734,one
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616,two
2013-01-04,1.47394,0.607995,0.388921,-0.30985,three
2013-01-05,-0.38573,0.705376,1.574078,0.176116,four
2013-01-06,1.017695,-1.54179,-0.357102,0.534296,three


In [44]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.258544,-0.172956,0.282869,-0.076616,two
2013-01-05,-0.38573,0.705376,1.574078,0.176116,four
