In [2]:
import pandas as pd
from pandas import DataFrame, Series

In [19]:
import pandas_datareader.data as web
# Fetch one table of quotes per ticker symbol via pandas-datareader's Yahoo
# reader (network call). Each returned table is indexed below by its
# 'Adj Close' and 'Volume' columns.
all_data = {
    symbol: web.get_data_yahoo(symbol)
    for symbol in ['AAPL', 'IBM', 'MSFT', 'GOOG']
}

# One column per ticker: adjusted closing prices and traded volume.
price = pd.DataFrame(
    {symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()}
)
volume = pd.DataFrame(
    {symbol: quotes['Volume'] for symbol, quotes in all_data.items()}
)

In [20]:
returns = price.pct_change()

In [40]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-08-24,-0.000601,0.001576,-0.006663,0.009206
2021-08-25,-0.008421,0.000143,-0.002016,0.003873
2021-08-26,-0.005527,-0.007722,-0.009669,-0.005785
2021-08-27,0.007185,0.00454,0.002106,0.01708
2021-08-30,0.030417,-0.003156,0.012912,0.006358


The corr method of Series computes the correlation of the overlapping, non-NA,
aligned-by-index values in two Series. Relatedly, cov computes the covariance:

In [43]:
returns['IBM'].corr(returns['MSFT'])

0.5167144074795372

In [24]:
returns['MSFT'].cov(returns['IBM'])

0.00014545101836918774

Since MSFT is a valid Python attribute name, we can also select these columns using
the more concise attribute syntax:

In [25]:
returns.MSFT.corr(returns.IBM)

0.5167144074795372

DataFrame’s corr and cov methods, on the other hand, return a full correlation or
covariance matrix as a DataFrame, respectively:

In [26]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.441563,0.735102,0.661473
IBM,0.441563,1.0,0.516714,0.484315
MSFT,0.735102,0.516714,1.0,0.774509
GOOG,0.661473,0.484315,0.774509,1.0


In [27]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000363,0.000138,0.000241,0.000212
IBM,0.000138,0.000268,0.000145,0.000134
MSFT,0.000241,0.000145,0.000295,0.000224
GOOG,0.000212,0.000134,0.000224,0.000283


Using DataFrame’s corrwith method, you can compute pairwise correlations
between a DataFrame’s columns or rows with another Series or DataFrame. Passing a
Series returns a Series with the correlation value computed for each column:

In [28]:
returns.corrwith(returns.IBM)

AAPL    0.441563
IBM     1.000000
MSFT    0.516714
GOOG    0.484315
dtype: float64

Passing a DataFrame computes the correlations of matching column names. Here I
compute correlations of percent changes with volume:

In [29]:
returns.corrwith(volume)

AAPL   -0.063655
IBM    -0.103892
MSFT   -0.055772
GOOG   -0.121519
dtype: float64

Passing axis='columns' does things row-by-row instead. In all cases, the data points
are aligned by label before the correlation is computed.

DataFrame.corrwith() computes pairwise correlations between the rows or columns
of two DataFrame objects. If the two DataFrames do not have the same shape, the
correlation for any row or column label that is not present in both will be NaN.

In [30]:
# Two small integer frames with identical column labels (A, B, C) and the
# default 0..3 row index, used to demonstrate corrwith.
df1 = pd.DataFrame(dict(
    A=[1, 5, 7, 8],
    B=[5, 8, 4, 3],
    C=[10, 4, 9, 3],
))
df2 = pd.DataFrame(dict(
    A=[5, 3, 6, 4],
    B=[11, 2, 4, 3],
    C=[4, 3, 8, 5],
))

In [31]:
df1

Unnamed: 0,A,B,C
0,1,5,10
1,5,8,4
2,7,4,9
3,8,3,3


In [32]:
df2

Unnamed: 0,A,B,C
0,5,11,4
1,3,2,3
2,6,4,8
3,4,3,5


In [33]:
# Column-wise correlation: each column of df1 is paired with the same-named
# column of df2, so the resulting Series has one entry per column (A, B, C).
df1.corrwith(df2, axis="index")

A   -0.041703
B   -0.151186
C    0.395437
dtype: float64

In [50]:
import pandas as pd

# Two 4x3 integer frames sharing the column labels A, B and C.
df1 = pd.DataFrame(dict(
    A=[1, 5, 7, 8],
    B=[5, 8, 4, 3],
    C=[10, 4, 9, 3],
))
df2 = pd.DataFrame(dict(
    A=[5, 3, 6, 4],
    B=[11, 2, 4, 3],
    C=[4, 3, 8, 5],
))

# Row-wise correlation: row i of df1 is paired with row i of df2, so the
# resulting Series has one entry per row label (0 through 3).
df1.corrwith(df2, axis="columns")

0   -0.195254
1   -0.970725
2    0.993399
3    0.000000
dtype: float64

# Another Example

In [51]:
import pandas as pd
import numpy as np


# Setting a seed so the example is reproducible
np.random.seed(4272018)

df = pd.DataFrame(np.random.randint(low= 0, high= 20, size= (5, 2)),
                  columns= ['Commercials Watched', 'Product Purchases'])

df


Unnamed: 0,Commercials Watched,Product Purchases
0,10,13
1,15,0
2,7,7
3,2,4
4,16,11


In [52]:
df.agg(["mean", "std"])

Unnamed: 0,Commercials Watched,Product Purchases
mean,10.0,7.0
std,5.787918,5.244044


In [57]:
df.std()

Commercials Watched    5.787918
Product Purchases      5.244044
dtype: float64

In [53]:
df.var()

Commercials Watched    33.5
Product Purchases      27.5
dtype: float64

In [54]:
df.cov()

Unnamed: 0,Commercials Watched,Product Purchases
Commercials Watched,33.5,3.25
Product Purchases,3.25,27.5


In [56]:
df.corr()

Unnamed: 0,Commercials Watched,Product Purchases
Commercials Watched,1.0,0.107077
Product Purchases,0.107077,1.0


# Unique Values, Value Counts, and Membership

Another class of related methods extracts information about the values contained in a
one-dimensional Series. To illustrate these, consider this example:

In [3]:
# Nine single-letter labels with repeats: 'a' x3, 'c' x3, 'b' x2, 'd' x1.
obj = pd.Series(list('cadaabbcc'))

The first function is unique, which gives you an array of the unique values in a Series:

In [4]:
uniques = obj.unique()

In [24]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

The unique values are not necessarily returned in sorted order, but could be sorted
after the fact if needed (uniques.sort()). Relatedly, value_counts computes a Series
containing value frequencies:

In [26]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

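The Series is sorted by value in descending order as a convenience. value_counts is
also available as a top-level pandas function, pd.value_counts, that can be used with
any array or sequence (deprecated since pandas 2.1 in favor of wrapping the values in
a Series and calling its value_counts method):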

In [27]:
# The top-level pd.value_counts is deprecated (pandas 2.1+); wrapping the raw
# array in a Series and calling its value_counts method gives the same counts.
# sort=False leaves the result unsorted by frequency.
pd.Series(obj.values).value_counts(sort=False)

c    3
d    1
a    3
b    2
dtype: int64

isin performs a vectorized set membership check and can be useful in filtering a
dataset down to a subset of values in a Series or column in a DataFrame:

In [30]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [31]:
mask = obj.isin(['b', 'c'])

In [32]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [33]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

Related to isin is the Index.get_indexer method, which gives you an index array
from an array of possibly non-distinct values into another array of distinct values:

In [5]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'd','a'])

In [6]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [7]:
# For each element of to_match, return its integer position within
# unique_vals; -1 marks an element with no match (here, 'd').
pd.Index(unique_vals).get_indexer(to_match)

array([ 0,  2,  1,  1,  0, -1,  2], dtype=int64)

In [9]:
# Five responses (rows 0..4) to three questions, each answer an integer code.
data = pd.DataFrame(dict(
    Qu1=[1, 3, 4, 3, 4],
    Qu2=[2, 3, 1, 2, 3],
    Qu3=[1, 5, 2, 4, 4],
))

In [10]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [36]:
# Count, column by column, how often each distinct value occurs. A value that
# never appears in a given column has no count there, so fillna(0) turns that
# gap into an explicit zero.
# pd.Series.value_counts is used because the top-level pd.value_counts is
# deprecated (pandas 2.1+).
result = data.apply(pd.Series.value_counts).fillna(0)

In [37]:
result 

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


Here, the row labels in the result are the distinct values occurring in all of the columns.
The values are the respective counts of these values in each column.