-
Notifications
You must be signed in to change notification settings - Fork 0
/
eda.py
136 lines (106 loc) · 4.43 KB
/
eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# AUTOGENERATED! DO NOT EDIT! File to edit: ../01_eda.ipynb.
# %% auto 0
# Names exported by a star-import of this module; `rr_corr` is defined below but is not listed here.
__all__ = ['plot_univariate_continuous', 'strength_of_assoc', 'soa_graph']
# %% ../01_eda.ipynb 3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import sklearn.feature_selection as skfl
# %% ../01_eda.ipynb 7
def plot_univariate_continuous(df:pd.DataFrame, # Data
                               var:str, # Variable to plot
                               var_name:str, # Variable name
                               ax): # Axes on which to draw the plot
    """
    Stacked percent histogram of `var` with the interquartile range highlighted

    Observations are split with `pd.qcut` at the 25% and 75% quantiles; the
    middle band is drawn in gold and both tails in silver. A vertical line
    marks the mean of `var`. The axes passed in are returned.
    """
    # Both tails share one colour so that only the middle band stands out
    tail_color, iqr_color = "silver", "gold"
    band_colors = {"1st": tail_color, "iqr": iqr_color, "4th": tail_color}

    # Tag every observation with the quantile band it falls in
    plot_data = df[[var]].assign(
        qcut=lambda frame: pd.qcut(frame[var],
                                   [0, 0.25, 0.75, 1],
                                   labels=['1st', 'iqr', '4th']))

    # Histogram (in percent), stacked by quantile band
    sns.histplot(data=plot_data,
                 x=var,
                 hue='qcut',
                 palette=band_colors,
                 multiple='stack',
                 stat='percent',
                 ax=ax)

    # The colours are self-explanatory here, so the hue legend is dropped
    ax.get_legend().remove()

    # Mark the mean of the variable
    ax.axvline(df[var].mean())

    # Axis titles
    title_font = {'fontfamily': 'Century Gothic', 'fontsize': 16}
    ax.set_xlabel(var_name, **title_font)
    ax.set_ylabel('Percent', **title_font)

    # Tick labels use the same typeface at a smaller size
    for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        tick_label.set_name('Century Gothic')
        tick_label.set_size(12)

    return ax
# %% ../01_eda.ipynb 22
def rr_corr(df:pd.DataFrame, # Data
            ratio_vars:list): # Columns in `df` with ratio variables
    """
    Correlation between all pairs of ratio variables in `df`

    Uses the `corr` method of `pandas.DataFrame` on the `ratio_vars` columns
    only. Returns a dataframe with one row per unordered pair of variables and
    the columns `feat_1`, `feat_2`, `value` (signed coefficient), `metric` and
    `assoc_strength` ('weak' / 'moderate' / 'strong', based on the absolute
    value of the coefficient), sorted from strongest to weakest.
    """
    # Restrict to the ratio variables so that no other column of `df`
    # (possibly non-numeric) enters the correlation matrix
    corr_matrix = df[list(ratio_vars)].corr()

    # Keep only the strict upper triangle: drops the diagonal and mirrored pairs
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

    # Masked cells are NaN; drop them explicitly so the result does not depend
    # on whether `stack` discards NaN on its own
    df_corr = corr_matrix.where(upper_mask).stack().dropna().reset_index()
    df_corr.columns = ['feat_1', 'feat_2', 'value']
    df_corr['metric'] = "Pearson correlation coefficient"

    # Absolute value of correlation is used since we only search for association.
    # Clip guards against round-off slightly above 1 falling outside the bins.
    strength = df_corr['value'].abs().clip(upper=1)

    # `include_lowest` so that a coefficient of exactly 0 is labelled 'weak'
    # instead of being left unlabelled
    df_corr['assoc_strength'] = pd.cut(strength,
                                       bins=[0, 0.2, 0.5, 1],
                                       labels=['weak', 'moderate', 'strong'],
                                       include_lowest=True)

    # The labels form an ordered categorical, so descending order puts 'strong' first
    return df_corr.sort_values('assoc_strength', ascending=False)
# %% ../01_eda.ipynb 23
def strength_of_assoc(df:pd.DataFrame, # Data
                      ratio_vars:list=None, # Columns in `df` with ratio variables
                      ordinal_vars:list=None, # Columns in `df` with ordinal variables
                      nominal_vars:list=None, # Columns in `df` with nominal variables
                      binary_vars:list=None): # Columns in `df` with binary variables
    """
    Strength of association between pairs of variables in `df`

    Only the ratio - ratio pairing is currently computed (via `rr_corr`);
    `ordinal_vars`, `nominal_vars` and `binary_vars` are accepted but not used yet.
    Returns the concatenated per-pairing results, or an empty dataframe with the
    columns `feat_1`, `feat_2`, `value`, `metric`, `assoc_strength` when nothing
    was computed.
    """
    # One result dataframe per pairing type
    soa_frames = []

    ## Calculate strength of association between different variables
    # Ratio - Ratio
    if ratio_vars:
        soa_frames.append(rr_corr(df, ratio_vars))

    # Pairings not implemented yet:
    # Ratio - Ordinal
    # Ratio - Nominal
    # Ratio - Binary
    # Ordinal - Ordinal
    # Ordinal - Nominal
    # Ordinal - Binary
    # Nominal - Nominal
    # Nominal - Binary
    # Binary - Binary

    # `pd.concat` raises on an empty list, so return an empty, correctly
    # shaped result when no pairing produced anything
    if not soa_frames:
        return pd.DataFrame(columns=['feat_1', 'feat_2', 'value',
                                     'metric', 'assoc_strength'])

    return pd.concat(soa_frames)
# %% ../01_eda.ipynb 31
def soa_graph(cdf:pd.DataFrame, # A dataframe with `feat_1`, `feat_2` and `assoc_strength` columns
              min_strength:str='strong'): # Threshold for high correlation
    """
    Build a graph of the variable pairs whose association is at least `min_strength`

    Levels are ordered weak < moderate < strong. Any `min_strength` other than
    'strong' or 'moderate' keeps all three levels. Returns the filtered, sorted
    dataframe together with a graph that has one edge per remaining row and a
    `label` attribute on every node equal to the node itself.
    """
    # Levels from weakest to strongest; everything from the threshold upwards is kept
    levels = ['weak', 'moderate', 'strong']
    first_kept = 2 if min_strength == 'strong' else 1 if min_strength == 'moderate' else 0
    kept_levels = levels[first_kept:]

    # Filter, then sort for visualization
    strong_enough = cdf['assoc_strength'].isin(kept_levels)
    high_soa = (cdf[strong_enough]
                .copy()
                .sort_values(['assoc_strength', 'feat_1', 'feat_2'], ascending=False))

    # Generate the graph: one edge per pair, each node labelled with itself
    graph = nx.from_pandas_edgelist(high_soa, 'feat_1', 'feat_2')
    nx.set_node_attributes(graph, {name: name for name in graph.nodes()}, 'label')

    return high_soa, graph