In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import r_regression

# Correlation coefficient
# Only for linear dependencies
The correlation coefficient is a measure of linear correlation between
two sets of data. It is the ratio between the covariance of two variables and
the product of their standard deviations; thus it is essentially a normalised
measurement of the covariance, such that the result always has a value between
−1 and 1. As with covariance itself, the measure can only reflect a linear
correlation of variables, and ignores many other types of relationship or
correlation.

In [8]:
# Columns used as model inputs, and the target they are compared against.
# Declared once so the selection below and the X/y split cannot drift apart.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'SibSp']
TARGET_COLUMN = 'Survived'

# Keep the raw frame under its own name so this cell is idempotent on re-run.
titanic_raw = pd.read_csv('../data/titanic.csv')

# Keep only the columns of interest and drop rows with any missing value.
# dropna() returns a new frame instead of mutating a column subset in place,
# which avoids the SettingWithCopyWarning that inplace=True can trigger here.
titanic = titanic_raw[FEATURE_COLUMNS + [TARGET_COLUMN]].dropna()

# Work on a copy so that encoding 'Sex' leaves `titanic` untouched.
X = titanic[FEATURE_COLUMNS].copy()
# Replace each distinct 'Sex' label with an integer code.
X['Sex'] = LabelEncoder().fit_transform(X['Sex'])
y = titanic[TARGET_COLUMN]

## Check correlations

In [31]:
# Render floats with five decimals so small coefficients stay readable.
pd.set_option('display.float_format', '{:.5f}'.format)

# Correlation coefficient of every feature column against the target,
# labelled by feature name.
feature_target_r = r_regression(X, y)
p_series = pd.Series(data=feature_target_r, index=X.columns, dtype='float')
p_series

Pclass   -0.35965
Sex      -0.53883
Age      -0.07722
Fare      0.26819
Parch     0.09332
SibSp    -0.01736
dtype: float64

In [20]:
# Features whose absolute correlation with the target falls below this
# threshold are treated as redundant.
min_correlation = 0.25
is_weak = p_series.abs().lt(min_correlation)
redundant = p_series.loc[is_weak]
redundant

Age     -0.07722
Parch    0.09332
SibSp   -0.01736
dtype: float64

# Calculate by hand for 'Sex' feature

In [61]:
# Sample covariance of 'Sex' and the target: the sum of the products of the
# deviations from each mean, divided by n - 1.
sex_deviation = X['Sex'] - X['Sex'].mean()
target_deviation = y - y.mean()
covariance = (sex_deviation * target_deviation).sum() / (len(X) - 1)
covariance

-0.12761794759979728

In [62]:
# Normalise the covariance by the product of the two standard deviations
# to obtain the correlation coefficient.
sex_std = X['Sex'].std()
target_std = y.std()
correlation = covariance / (sex_std * target_std)
correlation

-0.5388255930146355