In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
#sorting lexicographically by row or column label: use the sort_index method
obj = pd.Series(np.arange(4.), index=['d', 'a', 'b', 'c'])
obj #the original Series, still in its unsorted insertion order (d, a, b, c)

d    0.0
a    1.0
b    2.0
c    3.0
dtype: float64

In [3]:
#sort_index returns a new Series ordered by index label
obj.sort_index()

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64

In [4]:
#same as above
obj.sort_index(axis=0)

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64

In [5]:
#same as above
obj.sort_index(axis='rows')

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64

In [6]:
#a DataFrame can be sorted by index label along either axis (rows or columns);
#both axes are deliberately built out of order here
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

In [7]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [8]:
#sort the rows lexicographically by their index labels (the default axis)
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [9]:
#sort rows lexicographically based on rows in another way with axis=0
frame.sort_index(axis=0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [10]:
#sort rows lexicographically another way with axis='rows'
frame.sort_index(axis='rows')

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [11]:
#sort rows in descending order
frame.sort_index(axis='rows', ascending=False)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [12]:
#Although it is redundant, we can use the "ascending=True" argument
frame.sort_index(axis='rows', ascending=True)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [13]:
#sort columns lexicographically
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [14]:
#sort columns lexicographically another way with axis='columns'
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [15]:
#sort columns in reversed order with ascending=False
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [16]:
#Redundant but we can specify that ascending=True also
frame.sort_index(axis='columns', ascending=True)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [17]:
#sorting a series by its values
#use the s.sort_values() method
obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [18]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [19]:
#reverse the order
obj.sort_values(ascending=False)

1    7
0    4
3    2
2   -3
dtype: int64

In [20]:
#get the index of the values in order by using argsort
obj.argsort()

0    2
1    3
2    0
3    1
dtype: int64

In [21]:
#missing values: sorted to the end of the series by default
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [22]:
#Default: missing values at the end of the series
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [23]:
#na_position option: lets us specify the position of na_values
obj.sort_values(na_position='first')

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [24]:
obj.sort_values(na_position='last')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [25]:
#sort keys for a DataFrame: a single column name or a list of column names
#1) build a small frame to sort
frame = pd.DataFrame({
    'b': [4, 7, -3, 2],
    'a': [0, 1, 0, 1],
})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [26]:
#sort values by a single column ('b')
frame.sort_values('b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [27]:
#sort values by a single column ('a')
frame.sort_values('a')

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [28]:
#sorting by multiple columns
frame.sort_values(['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [29]:
#sort by multiple columns in the other order
frame.sort_values(['b', 'a'])

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [30]:
#a single sort key can also be passed inside a list; the result matches sort_values('b')
frame.sort_values(['b'])

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [31]:
#sort_values(ascending=True) is redundant but we will use it here
frame.sort_values('a', ascending=True)

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [32]:
#sort_values(ascending=False) #sort in descending order 
frame.sort_values('a', ascending=False)

Unnamed: 0,b,a
1,7,1
3,2,1
0,4,0
2,-3,0


In [33]:
#sort by 'b' first and then 'a', both in descending order
frame.sort_values(['b', 'a'], ascending=False)

Unnamed: 0,b,a
1,7,1
0,4,0
3,2,1
2,-3,0


In [34]:
#sort by the single column 'b' in descending order
frame.sort_values('b', ascending=False)

Unnamed: 0,b,a
1,7,1
0,4,0
3,2,1
2,-3,0


In [35]:
#ranking: assigns each value a rank from 1 (smallest) up to the number of values;
#by default tied values share the average of the ranks they occupy
obj = pd.Series([7, -5, 7, 4, 2, 0, 5])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    5
dtype: int64

In [36]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [37]:
#assign ranks based on order first observed in data
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [38]:
#rank in descending order
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    4.0
4    5.0
5    6.0
6    3.0
dtype: float64

In [39]:
#ranks can be computed along either axis of a DataFrame;
#this frame mixes float and integer columns to rank
frame = pd.DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [40]:
#default (axis=0): rank down the rows, i.e. the values within each column are ranked against each other
frame.rank()

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [41]:
#we can also specify 'rows'
frame.rank(axis='rows')

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [42]:
#or axis=0
frame.rank(axis=0)

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [43]:
#ranking over columns
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [44]:
#same thing but using axis=1
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [45]:
#tie-breaking is chosen with the method argument; 'average' is the default,
#so this output matches frame.rank() (other options include 'min', 'max', 'first' and 'dense')
frame.rank(method='average')

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,1.5,4.0
3,2.0,3.5,1.0


In [46]:
#axis indexes with duplicate labels
#Note: df.reindex() requires unique labels, but an axis can have duplicate labels
#here we have an example of a series with duplicate indices
obj = pd.Series(np.arange(5.), index=['a', 'a', 'b', 'b', 'c'])

In [47]:
obj

a    0.0
a    1.0
b    2.0
b    3.0
c    4.0
dtype: float64

In [48]:
#is_unique property tells us if an index is unique
obj.index.is_unique

False

In [49]:
obj.index.is_monotonic_increasing

True

In [50]:
#data selection with duplicates: returns either a scalar or a series, depending on the number of items
obj['a']

a    0.0
a    1.0
dtype: float64

In [51]:
obj['c']

4.0

In [52]:
#The same is true with indexing rows: a label may match one row or several
#a seeded generator keeps the random values identical on every Restart & Run All
#(re-run the notebook to refresh the displayed numbers after this change)
rng = np.random.default_rng(12345)
df = pd.DataFrame(rng.standard_normal((5, 3)),
                 index=['a', 'a', 'b', 'b', 'c'])
df

Unnamed: 0,0,1,2
a,-2.244965,0.63066,0.365876
a,0.244495,-0.98538,-0.548138
b,-0.325386,-1.057372,0.845246
b,1.40033,-0.579693,0.496189
c,1.6753,-1.331742,-1.126785


In [53]:
df.loc['b'] #DataFrame

Unnamed: 0,0,1,2
b,-0.325386,-1.057372,0.845246
b,1.40033,-0.579693,0.496189


In [54]:
df.loc['c']

0    1.675300
1   -1.331742
2   -1.126785
Name: c, dtype: float64

In [55]:
df.loc[['a', 'b']]

Unnamed: 0,0,1,2
a,-2.244965,0.63066,0.365876
a,0.244495,-0.98538,-0.548138
b,-0.325386,-1.057372,0.845246
b,1.40033,-0.579693,0.496189


In [56]:
df.loc[['a'], 0]

a   -2.244965
a    0.244495
Name: 0, dtype: float64

In [57]:
df.loc['a', [0]]

Unnamed: 0,0
a,-2.244965
a,0.244495


In [58]:
df.loc['a':'c', [0]]

Unnamed: 0,0
a,-2.244965
a,0.244495
b,-0.325386
b,1.40033
c,1.6753


In [59]:
df.iloc[:, 2]

a    0.365876
a   -0.548138
b    0.845246
b    0.496189
c   -1.126785
Name: 2, dtype: float64

In [60]:
df.iloc[:, [2]]

Unnamed: 0,2
a,0.365876
a,-0.548138
b,0.845246
b,0.496189
c,-1.126785


In [61]:
#Summarize/compute descriptive statistics
#these methods produce a single value from a Series, or a Series of values from a DataFrame
#Let's create a sample DataFrame
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                  [np.nan, np.nan], [0.75, -1.3]])
df
#no index or columns were passed, so both axes get default integer labels

Unnamed: 0,0,1
0,1.4,
1,7.1,-4.5
2,,
3,0.75,-1.3


In [62]:
#Sample DataFrame for the descriptive-statistics examples:
#four rows with some missing values, this time with explicit row labels and column names
df = pd.DataFrame(
    data=[[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
df
#rows are labelled a-d and the columns are 'one' and 'two'

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [63]:
#sum method: returns series containing column sums
df.sum()

one    9.25
two   -5.80
dtype: float64

In [64]:
#we can also specify axis=0
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [65]:
#we can also specify axis='rows' for the same result
df.sum(axis='rows')

one    9.25
two   -5.80
dtype: float64

In [66]:
#to sum across the columns instead (one total per row), pass axis='columns' or axis=1
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [67]:
#this has the same result
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [68]:
#with skipna=False, any missing value in a column makes that column's sum NaN
#(with the default skipna=True, NaNs are skipped and an all-NaN row or column sums to 0)
df.sum(axis='index', skipna=False)

one   NaN
two   NaN
dtype: float64

In [69]:
df.sum(axis='columns', skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [70]:
#some aggregations like mean require at least one non-na value
df.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [71]:
df.mean(axis='columns', skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [72]:
#idxmin and idxmax return the index value of the minimum and maximum values
df.idxmin()

one    d
two    b
dtype: object

In [73]:
df.idxmax()

one    b
two    d
dtype: object

In [74]:
#A second type of method is an accumulation
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [75]:
#A third kind of method is those methods that are not reductions or accumulations
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [76]:
#nonnumeric data: describe() on a Series of strings returns a different set of
#summary statistics (count, unique, top, freq)
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [77]:
obj.cummax() #'c' sorts after 'a' and 'b', so once it appears (position 3) it stays the cumulative maximum

0     a
1     a
2     b
3     c
4     c
5     c
6     c
7     c
8     c
9     c
10    c
11    c
12    c
13    c
14    c
15    c
dtype: object

In [78]:
#some statistics need arguments to compute
#NOTE(review): read_pickle unpickles the file, which can run arbitrary code - only load trusted files.
#The path is relative, so it resolves against the notebook's current working directory.
price = pd.read_pickle('examples/yahoo_price.pkl')

In [79]:
#NOTE(review): unpickling can run arbitrary code - only load trusted files (relative path: depends on the working directory)
volume = pd.read_pickle('examples/yahoo_volume.pkl')

In [80]:
#we now compute the percent change of prices
returns = price.pct_change()

In [81]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [82]:
#the corr method of a series computes the correlation of overlapping non-na values
returns['MSFT'].corr(returns['IBM'])

0.4997636114415116

In [83]:
#cov computes the covariance
returns['MSFT'].cov(returns['IBM'])

8.870655479703549e-05

In [84]:
#DataFrame .corr and .cov methods: return a correlation or covariance matrix as a DataFrame
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [85]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [86]:
#corrwith method: computes pairwise correlations between cols/rows of a DataFrame and 
#a second series or dataframe.
returns.corrwith(returns['IBM'])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [87]:
#df1.corrwith(df2)
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

In [88]:
#unique values, value counts and membership tests:
#these methods extract information about the values held in a Series
obj = pd.Series(list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [89]:
#unique: gives us an array of the unique values in the Series.
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [90]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [100]:
#value_counts also works on plain sequences once they are wrapped in a Series
#(the top-level pd.value_counts function is deprecated in recent pandas releases)
#list: 0-9 followed by 4-13, so the overlapping values 4-9 each occur twice
overlapping_ints = list(range(10)) + list(range(4, 14))
pd.Series(overlapping_ints).value_counts()

4     2
5     2
6     2
7     2
8     2
9     2
0     1
1     1
2     1
3     1
10    1
11    1
12    1
13    1
dtype: int64

In [101]:
#numpy array: wrap it in a Series and call the value_counts method
#(the top-level pd.value_counts function is deprecated in recent pandas releases)
pd.Series(obj.to_numpy()).value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [102]:
#numpy array with sort=False: the counts are not ordered by frequency
#(the top-level pd.value_counts function is deprecated, so use the Series method)
pd.Series(obj.to_numpy()).value_counts(sort=False)

c    3
a    3
d    1
b    2
dtype: int64