# Examples
## Correlation
> #### IEOR 135/290, Data-X: Applied Data Ventures
> #### Author: Joshua Rafael Sanchez | UC Berkeley, B.S. IEOR, '20
> In collaboration with Ikhlaq Sidhu.
> Email: joshuarafael@berkeley.edu

> __About This Notebook:__ <br>
> This notebook contains examples from the Data-X lecture entitled "Correlation."  

> __Copyright:__ <br>
> Feel free to use the code as you wish.

### 0. Setup

In [21]:
# Import libraries:
import numpy as np
import pandas as pd

### 1. Correlation of Rows and Columns With Numpy

In [32]:
# Sample data: a 4-row by 5-column array.
x = np.array([
    [0.10, 0.32, 0.20, 0.40, 0.80],
    [0.23, 0.18, 0.56, 0.61, 0.12],
    [0.90, 0.30, 0.60, 0.50, 0.30],
    [0.34, 0.75, 0.91, 0.19, 0.21],
])

# By default np.corrcoef treats each ROW as a variable, so np.corrcoef(x)
# would return the 4x4 row-by-row matrix. Passing rowvar=False treats each
# COLUMN as a variable instead, which yields the 5x5 column-by-column matrix.
np.corrcoef(x, rowvar=False)

array([[ 1.        , -0.03783885,  0.34905716,  0.14648975, -0.34945863],
       [-0.03783885,  1.        ,  0.67888519, -0.96102583, -0.12757741],
       [ 0.34905716,  0.67888519,  1.        , -0.45104803, -0.80429469],
       [ 0.14648975, -0.96102583, -0.45104803,  1.        , -0.15132323],
       [-0.34945863, -0.12757741, -0.80429469, -0.15132323,  1.        ]])

### 2. Correlation of Features from Different Sources

In [23]:
# Build a small table of car measurements: one row per car model,
# one column per measured feature.
index = [
    'Mazda RX4',
    'Mazda RX4 Wag',
    'Datsun 710',
    'Hornet 4 Drive',
    'Hornet Sportabout',
    'Valiant',
]

# Column-oriented data; each list lines up with the row labels in `index`.
d = dict(
    mpg=[21.0, 21.0, 22.8, 21.4, 18.7, 18.1],
    disp=[160, 160, 108, 258, 360, 225],
    hp=[110, 110, 93, 110, 175, 105],
    drat=[3.90, 3.90, 3.85, 3.08, 3.15, 2.76],
    wt=[2.620, 2.875, 2.320, 3.215, 3.440, 3.460],
    qsec=[16.46, 17.02, 18.61, 19.44, 17.02, 20.22],
)

df = pd.DataFrame(d, index=index)
df

Unnamed: 0,mpg,disp,hp,drat,wt,qsec
Mazda RX4,21.0,160,110,3.9,2.62,16.46
Mazda RX4 Wag,21.0,160,110,3.9,2.875,17.02
Datsun 710,22.8,108,93,3.85,2.32,18.61
Hornet 4 Drive,21.4,258,110,3.08,3.215,19.44
Hornet Sportabout,18.7,360,175,3.15,3.44,17.02
Valiant,18.1,225,105,2.76,3.46,20.22


In [24]:
# Pairwise Pearson correlation between every pair of columns in df
df.corr(method='pearson')

Unnamed: 0,mpg,disp,hp,drat,wt,qsec
mpg,1.0,-0.689418,-0.560904,0.727087,-0.860103,-0.158188
disp,-0.689418,1.0,0.86649,-0.701672,0.854579,0.026017
hp,-0.560904,0.86649,1.0,-0.306172,0.569348,-0.409935
drat,0.727087,-0.701672,-0.306172,1.0,-0.882287,-0.692853
wt,-0.860103,0.854579,0.569348,-0.882287,1.0,0.341026
qsec,-0.158188,0.026017,-0.409935,-0.692853,0.341026,1.0


### 3. Correlation Matrix w/ Columns

In [25]:
# Create DataFrame of random numbers.
# Seed the generator first so that a fresh "Restart & Run All" draws the same
# numbers every time; outputs saved from an earlier, unseeded run will differ
# until the notebook is re-executed.
np.random.seed(42)
frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])

# Blank out every other row (positions 0, 2, 4, ...) so that half of the
# observations are missing.
frame.iloc[::2] = np.nan

# Correlation between columns 'a' and 'b' (Pearson is the default method);
# missing values are excluded from the calculation.
frame['a'].corr(frame['b'])

0.046502829755131904

In [26]:
# Same pair of columns, but using the rank-based Spearman method
frame['a'].corr(other=frame['b'], method='spearman')

0.050304873219492875

In [27]:
# Full correlation matrix: Pearson correlation for every pair of columns
frame.corr(method='pearson')

Unnamed: 0,a,b,c,d,e
a,1.0,0.046503,-0.009185,0.038818,-0.054403
b,0.046503,1.0,0.025726,0.052386,-0.060421
c,-0.009185,0.025726,1.0,0.021969,0.027944
d,0.038818,0.052386,0.021969,1.0,-0.015641
e,-0.054403,-0.060421,0.027944,-0.015641,1.0
