In [4]:
import math
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as ss

In [2]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical variables.

    The contingency table of ``x`` against ``y`` is built with ``pd.crosstab``
    and its chi-square statistic is taken from ``scipy.stats.chi2_contingency``
    using that function's defaults (SciPy therefore applies Yates' continuity
    correction when the table has one degree of freedom, i.e. 2x2 tables).
    The statistic is then shrunk with the usual bias correction on phi^2 and
    on the row/column counts.

    Parameters
    ----------
    x, y : array-like
        Equal-length categorical observations accepted by ``pd.crosstab``.

    Returns
    -------
    float
        Value in [0, 1], or NaN when the measure is undefined: a variable
        with a single category, a single observation, or a table in which the
        corrected dimension collapses to zero (e.g. every observation falls
        in its own category).

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` (e.g. for empty input).
    """
    confusion_matrix = pd.crosstab(x, y)
    # Kept ahead of the guards so invalid input still raises exactly as before.
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # Degenerate tables: the formula below would evaluate 0/0, which produced
    # NaN only as a side effect of a numpy RuntimeWarning. Return it explicitly.
    if n <= 1 or min(r, k) < 2:
        return float("nan")

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Happens when a variable has as many categories as there are samples.
        return float("nan")
    return np.sqrt(phi2corr / denominator)

In [3]:
def theils_u(x, y):
    """Uncertainty coefficient (Theil's U) of ``x`` given ``y``.

    Evaluates ``(S(x) - S(x|y)) / S(x)``, where ``S(x|y)`` is whatever
    ``conditional_entropy(x, y)`` returns with its default arguments and
    ``S(x)`` is ``scipy.stats.entropy`` of the empirical frequencies of ``x``.

    Returns 1 when ``S(x)`` is zero, i.e. when ``x`` carries no uncertainty.

    NOTE(review): ``x`` is counted here as given, while ``conditional_entropy``
    may pre-process NaNs under its default strategy -- confirm both entropies
    see the same values when ``x`` contains missing data.
    """
    cond_entropy = conditional_entropy(x, y)

    category_counts = Counter(x)
    n_observations = sum(category_counts.values())
    frequencies = [count / n_observations for count in category_counts.values()]
    marginal_entropy = ss.entropy(frequencies)

    # Guard clause: nothing left to explain when x has zero entropy.
    if marginal_entropy == 0:
        return 1
    return (marginal_entropy - cond_entropy) / marginal_entropy

In [6]:
# String identifiers for missing-value handling. In the code visible in this
# notebook only _REPLACE and _DROP are consulted (by conditional_entropy's
# `nan_strategy` argument); the other names are defined but not referenced
# in the cells shown here.
_REPLACE = 'replace'
_DROP = 'drop'
_DROP_SAMPLES = 'drop_samples'
_DROP_FEATURES = 'drop_features'
_SKIP = 'skip'
# Default for conditional_entropy's `nan_replace_value` argument.
_DEFAULT_REPLACE_VALUE = 0.0

In [7]:
def conditional_entropy(
    x,
    y,
    nan_strategy=_REPLACE,
    nan_replace_value=_DEFAULT_REPLACE_VALUE,
    log_base: float = math.e,
):
    """Conditional entropy S(x|y) of paired observations ``x`` and ``y``.

    Parameters
    ----------
    x, y : iterable
        Paired observations; pairs are formed with ``zip(x, y)``.
    nan_strategy : str
        ``_REPLACE`` routes the inputs through ``replace_nan_with_value``,
        ``_DROP`` through ``remove_incomplete_samples``; any other value
        leaves them untouched.
        NOTE(review): neither helper is defined in the visible part of this
        notebook -- confirm they are available before relying on the default.
    nan_replace_value
        Forwarded to ``replace_nan_with_value`` under the ``_REPLACE`` strategy.
    log_base : float
        Base of the logarithm (natural log by default).

    Returns
    -------
    float
        Sum over observed (x, y) pairs of ``p(x, y) * log(p(y) / p(x, y))``.
    """
    if nan_strategy == _REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == _DROP:
        x, y = remove_incomplete_samples(x, y)

    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_samples = sum(marginal_counts.values())

    result = 0.0
    for (_, y_value), joint_count in joint_counts.items():
        joint_prob = joint_count / n_samples
        marginal_prob = marginal_counts[y_value] / n_samples
        result += joint_prob * math.log(marginal_prob / joint_prob, log_base)
    return result



In [6]:
def cramerv_corr(data):
    """Pairwise Cramér's V matrix for the columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared pairwise with ``cramers_v``.

    Returns
    -------
    pandas.DataFrame
        Square float frame with ``data.columns`` as both index and columns.
        The diagonal is fixed at 1 without calling ``cramers_v``.

    Notes
    -----
    Cramér's V is symmetric in its arguments, so each unordered pair of
    columns is evaluated once and the value is mirrored across the diagonal.
    This halves the number of chi-square computations and makes the result
    exactly symmetric.
    """
    labels = data.columns
    K = len(labels)
    correl = np.empty((K, K), dtype=float)
    for i in range(K):
        correl[i, i] = 1
        for j in range(i + 1, K):
            # Positional access keeps each operand a single Series even when
            # column labels are duplicated.
            c = cramers_v(data.iloc[:, i], data.iloc[:, j])
            correl[i, j] = c
            correl[j, i] = c
    return pd.DataFrame(correl, index=labels, columns=labels)

In [5]:
# Exploratory cell: display the source lines of pandas' DataFrame.corr
# (presumably as a reference for the pairwise matrix built in cramerv_corr).
import inspect
inspect.getsourcelines(pd.DataFrame.corr)

(['    def corr(self, method="pearson", min_periods=1) -> "DataFrame":\n',
  '        """\n',
  '        Compute pairwise correlation of columns, excluding NA/null values.\n',
  '\n',
  '        Parameters\n',
  '        ----------\n',
  "        method : {'pearson', 'kendall', 'spearman'} or callable\n",
  '            Method of correlation:\n',
  '\n',
  '            * pearson : standard correlation coefficient\n',
  '            * kendall : Kendall Tau correlation coefficient\n',
  '            * spearman : Spearman rank correlation\n',
  '            * callable: callable with input two 1d ndarrays\n',
  '                and returning a float. Note that the returned matrix from corr\n',
  '                will have 1 along the diagonals and will be symmetric\n',
  "                regardless of the callable's behavior.\n",
  '\n',
  '                .. versionadded:: 0.24.0\n',
  '\n',
  '        min_periods : int, optional\n',
  '            Minimum number of observations required 