In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Use plotly's offline notebook mode so figures render inside the notebook.
# NOTE(review): connected=True is understood to load plotly.js from a CDN, which
# still needs an internet connection when rendering; confirm whether
# connected=False (library embedded in the notebook) was intended.
init_notebook_mode(connected=True)

In [None]:
# Relative path of the dataset to analyse.
# Sample data taken from
# https://github.com/matplotlib/matplotlib/blob/master/lib/matplotlib/mpl-data/sample_data/percent_bachelors_degrees_women_usa.csv
file_name = "../data/percent_bachelors_degrees_women_usa.csv"

In [None]:
# extract the lower-cased extension (empty string when the name contains no dot,
# instead of an IndexError from indexing the rsplit result)
extension = file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else ''

# process into dataframe
if extension == 'csv':
    try:
        # pandas decodes the file as UTF-8 by default
        df = pd.read_csv(file_name)
    except UnicodeDecodeError:
        # retry with latin-1 only when decoding failed, so that unrelated errors
        # (missing file, malformed CSV, interrupt) surface instead of triggering a retry
        df = pd.read_csv(file_name, encoding='latin-1')
else:
    # fail here with a clear message rather than with a NameError on `df` in a later cell
    raise ValueError(
        f"Unsupported file extension {extension!r} for {file_name!r}; expected a .csv file"
    )

In [None]:
# preview the last five rows to confirm the file was parsed as expected
df.tail(n=5)

For example, with demographic data, we generally consider correlations above 0.75 to be relatively strong; correlations between 0.45 and 0.75 are moderate, and those below 0.45 are considered weak.

In [None]:
# Start from a known state so later cells never pick up stale values left in the
# kernel by an earlier run when no correlation can be computed.
highest_corr = highest_corr_x = highest_corr_y = correlation_strength = None

# find correlations between numeric (and boolean) columns only; on pandas >= 2.0
# DataFrame.corr() raises if the frame still contains e.g. string columns
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().abs()

if corr_matrix.shape[1] > 1:
    # flatten to one absolute correlation per (column, row) label pair, ascending
    corr_pairs = corr_matrix.unstack().sort_values(kind="quicksort")

    # remove each column's correlation with itself by comparing labels, not values:
    # a value test (< 1) would also discard genuinely perfect correlations between
    # two different columns, and would keep a self-pair whose floating-point
    # result lands just below 1
    first_label = corr_pairs.index.get_level_values(0)
    second_label = corr_pairs.index.get_level_values(1)
    corr_pairs = corr_pairs[first_label != second_label]

    # undefined correlations (e.g. involving a constant column) are NaN; drop them
    corr_pairs = corr_pairs.dropna()
else:
    # fewer than two usable columns: there are no pairs to compare
    corr_pairs = pd.Series(dtype=float)

# single-column frame (column label 0) indexed by the (column, row) label pair
corrs = pd.DataFrame(corr_pairs)

# only compute if there are correlations to work with
if len(corrs.index) > 0:
    # extract highest correlation (last row, because values are sorted ascending)
    highest_corr = round(corrs[0].iloc[-1], 2)
    highest_corr_y, highest_corr_x = corrs.index[-1]

    # compute whether it's a strong correlation
    if highest_corr > 0.75:
        correlation_strength = 'high'
    elif highest_corr > 0.45:
        correlation_strength = 'moderate'
    else:
        correlation_strength = 'weak'

In [None]:
# Static scatter plot of the most strongly correlated pair of columns,
# drawn on the current axes through the axes-object API.
scatter_ax = plt.gca()
scatter_ax.scatter(df[highest_corr_x], df[highest_corr_y])
scatter_ax.set_title(f'Correlation between {highest_corr_x} and {highest_corr_y}')
scatter_ax.set_xlabel(highest_corr_x)
scatter_ax.set_ylabel(highest_corr_y)
plt.show()

In [None]:
# one-sentence, human-readable summary of the strongest correlation found
summary_sentence = f'There is a {correlation_strength} correlation between {highest_corr_x} and {highest_corr_y}.'
print(summary_sentence)

In [None]:
# Interactive scatter plot of the most strongly correlated pair of columns.
fig = go.Figure(data=go.Scatter(x=df[highest_corr_x], y=df[highest_corr_y],
                                mode='markers', marker_color='rgb(227, 0, 6)'))

fig.update_layout(
    title=f'Correlation between {highest_corr_x} and {highest_corr_y}',
    xaxis_title=highest_corr_x,
    yaxis_title=highest_corr_y,
    margin=dict(
        pad=10
    ),
    font=dict(
        # comma-separated font stack; the value must not end with a CSS ';',
        # which would make the font-family declaration invalid
        family="-apple-system, BlinkMacSystemFont, 'Segoe UI', 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', 'Helvetica Neue', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'",
        size=12,
        color="#7f7f7f"
    ),
    # transparent plotting area so only the light grid lines show
    plot_bgcolor='rgba(0,0,0,0)'
)

# light grey grid on both axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#EEEEEE')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#EEEEEE')

fig.show()

In [None]:
# Serialise the plotly figure to a JSON string and keep it in `output_json`.
output_json = fig.to_json()

# Print the JSON text (plain print rather than rich display, so the raw string is shown).
print(output_json)