# DATA CORRELATION


In [1]:
import pandas as pd

# 1. CREATE

Create from a CSV

In [3]:
df = pd.read_csv('mydataset.csv')

# 2. READ

In [10]:
df.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,169.0,169.0,164.0
mean,63.846154,107.461538,134.047337,375.790244
std,42.299949,14.510259,16.450434,266.379919
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,124.0,250.925
50%,60.0,105.0,131.0,318.6
75%,60.0,111.0,141.0,387.6
max,300.0,159.0,184.0,1860.4


In [14]:
df.columns

Index(['Duration', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')

In [15]:
df.dtypes

Duration      int64
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

# 3. Finding Relationships

### A. To show relationship between columns

In [5]:
# For showing relationship of all the columns

df.corr()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


In [6]:
# For showing relationship between two specific columns
df["Duration"].corr(df["Calories"])

0.9227166783472454

### B. Add a parameter method in corr() function

In [7]:
# pearson: standard correlation coefficient 
df.corr('pearson')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


In [8]:
# spearman: Spearman rank correlation
df.corr('spearman')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.158436,-0.068517,0.815127
Pulse,-0.158436,1.0,0.78852,0.186022
Maxpulse,-0.068517,0.78852,1.0,0.271928
Calories,0.815127,0.186022,0.271928,1.0


In [9]:
# kendall: Kendall Tau correlation coefficient 
df.corr('kendall')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.124808,-0.046464,0.687394
Pulse,-0.124808,1.0,0.640451,0.158295
Maxpulse,-0.046464,0.640451,1.0,0.198774
Calories,0.687394,0.158295,0.198774,1.0


# Questions and Answers

# 1. Find and identify data with good correlation.

Using Pearson correlation as default

In [12]:
# Find and identify data with good correlation.
df = pd.read_csv('mydataset.csv')

# Use any parameter method to find good correlation
df.corr(method='pearson')


Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


**What are the data with good correlation?**

In [17]:
# Show relationship between two or more specific columns that show good correlation
df["Duration"].corr(df["Calories"])

0.9227166783472454

**Duration and Calories Burned have a good correlation because the value is close to 1.**

# 2. Find and identify data with bad correlation.


Using Pearson correlation as default

In [16]:
df = pd.read_csv('mydataset.csv')

df.corr(method='pearson')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


**What are the data with bad correlation?**

In [22]:
# Show relationship between two or more specific columns that show bad correlation
df["Duration"].corr(df["Maxpulse"])

0.009402912085577995

**Duration and Maxpulse have a bad correlation because the value is close to 0.**

# 3. What do you notice when you use different methods in the corr() function?

df = pd.read_csv('mydataset.csv')

### Pearson correlation

In [23]:
df.corr(method='pearson')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


### Spearman correlation

In [24]:
df.corr(method='spearman')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.158436,-0.068517,0.815127
Pulse,-0.158436,1.0,0.78852,0.186022
Maxpulse,-0.068517,0.78852,1.0,0.271928
Calories,0.815127,0.186022,0.271928,1.0


### Kendall correlation

In [25]:
df.corr(method='kendall')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.124808,-0.046464,0.687394
Pulse,-0.124808,1.0,0.640451,0.158295
Maxpulse,-0.046464,0.640451,1.0,0.198774
Calories,0.687394,0.158295,0.198774,1.0


I notice that the **some** values of the correlation are different when I use different methods in the corr() function.
I think this is because the methods are different due to their mathematical formulas.

Below, in this case, I've run correlation between Duration and Calories through the three methods and the values are just the same.

In [29]:
# Perform more data analysis coding to differentiate the three methods of correlation.
df = pd.read_csv('mydataset.csv')

df.corr(method='pearson')

df["Duration"].corr(df["Calories"])

0.9227166783472454

In [30]:
df.corr(method='spearman')

df["Duration"].corr(df["Calories"])

0.9227166783472454

In [31]:
df.corr(method='kendall')

df["Duration"].corr(df["Calories"])

0.9227166783472454

# END