# Correlations for Anscombe's Quartet

## References

* [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet)
* [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)
* [Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) 

## Imports

In [None]:
import io
import numpy as np # type: ignore
import pandas as pd # type: ignore
import plotly.express as px # type: ignore

from IPython.display import display, Markdown # type: ignore

## Create DataFrame with Anscombe's Quartet as Data

In [None]:
# Raw table copied from https://en.wikipedia.org/wiki/Anscombe%27s_quartet:
# one header line, then one line per observation holding the four (x_i, y_i)
# pairs side by side.
anscombes_quartet_str = """
    x_1	y_1	x_2	y_2	x_3	y_3	x_4	y_4
    10.0	8.04	10.0	9.14	10.0	7.46	8.0	6.58
    8.0	6.95	8.0	8.14	8.0	6.77	8.0	5.76
    13.0	7.58	13.0	8.74	13.0	12.74	8.0	7.71
    9.0	8.81	9.0	8.77	9.0	7.11	8.0	8.84
    11.0	8.33	11.0	9.26	11.0	7.81	8.0	8.47
    14.0	9.96	14.0	8.10	14.0	8.84	8.0	7.04
    6.0	7.24	6.0	6.13	6.0	6.08	8.0	5.25
    4.0	4.26	4.0	3.10	4.0	5.39	19.0	12.50
    12.0	10.84	12.0	9.13	12.0	8.15	8.0	5.56
    7.0	4.82	7.0	7.26	7.0	6.42	8.0	7.91
    5.0	5.68	5.0	4.74	5.0	5.73	8.0	6.89
"""

# Split on any run of whitespace rather than on a literal tab, so the table
# still parses if an editor or a copy/paste step turned the tabs into spaces.
# The Python engine is requested explicitly because the separator is a regex.
anscombes_quartet_wide = pd.read_csv(
    io.StringIO(anscombes_quartet_str), sep=r"\s+", engine="python"
)

# Stack the four datasets end to end (long format): all of dataset 1, then
# dataset 2, and so on. np.concatenate is used instead of np.concat because
# the latter only exists from NumPy 2.0 onwards.
dataset_ids = range(1, 5)
all_x_values = np.concatenate(
    [anscombes_quartet_wide[f"x_{i}"].to_numpy() for i in dataset_ids]
)
all_y_values = np.concatenate(
    [anscombes_quartet_wide[f"y_{i}"].to_numpy() for i in dataset_ids]
)

# Place the datasets on a 2x2 grid in reading order:
# 1 -> (row 1, column 1), 2 -> (1, 2), 3 -> (2, 1), 4 -> (2, 2).
# The block size comes from the parsed table instead of a hard-coded 11.
points_per_dataset = len(anscombes_quartet_wide)
rows = [
    i // (2 * points_per_dataset) + 1 for i in range(len(all_x_values))
]
columns = [
    (i // points_per_dataset) % 2 + 1 for i in range(len(all_x_values))
]

# Long-format frame used by every later cell: one line per (x, y) point, its
# grid position, and a "<row>-<column>" label identifying the dataset.
ANSCOMBES_QUARTET = pd.DataFrame(
    data={
        "x": all_x_values,
        "y": all_y_values,
        "row": rows,
        "column": columns,
        "label": [f"{i}-{j}" for i, j in zip(rows, columns)],
    }
)

# Keep the notebook namespace clean: only ANSCOMBES_QUARTET is needed later.
del anscombes_quartet_str, anscombes_quartet_wide
del dataset_ids, points_per_dataset
del all_x_values, all_y_values, rows, columns

## Plot Data

In [None]:
# Draw one scatter panel per (row, column) pair, each with its own
# ordinary-least-squares trendline in red.
fig = px.scatter(
    ANSCOMBES_QUARTET,
    x="x",
    y="y",
    facet_row="row",
    facet_col="column",
    trendline="ols",
    trendline_color_override="red",
    width=1000,
    height=600,
)
fig.show()

# One fitted-model entry per panel: facet row, facet column, fit results.
trendline_results = px.get_trendline_results(fig)

# Summarise each fit as R^2 plus the fitted line "intercept + slope * x";
# params[0] is printed as the constant term and params[1] as the x factor.
summary = "<br>".join(
    f"row: {facet_row}, column: {facet_col}, $R^2: {fit.rsquared:.2f}$, "
    f"equation: ${fit.params[0]:.2f} + {fit.params[1]:.3f} \\cdot x$"
    for facet_row, facet_col, fit in trendline_results.itertuples(index=False)
)
display(Markdown(summary))

# Drop the temporaries so they do not linger in the notebook namespace.
del fig, trendline_results, summary

## Determine Mean and Standard Deviation of Each Series

In [None]:
# Summary statistics of x and y for each dataset label, shown as the cell
# output.
ANSCOMBES_QUARTET.groupby("label")[["x", "y"]].describe()

## Determine Pearson Correlation Coefficient and Spearman's Rank Correlation Coefficient

In [None]:
# For every dataset label, report the linear (Pearson) and the rank-based
# (Spearman) correlation between its x and y values.
# sort=False keeps the labels in order of first appearance.
output = [
    f"{label}: Pearson: {group['x'].corr(group['y'], method='pearson'):.3f}, "
    f"Spearman: {group['x'].corr(group['y'], method='spearman'):.3f}"
    for label, group in ANSCOMBES_QUARTET.groupby("label", sort=False)
]

display(Markdown("<br>".join(output)))

# Drop the temporary so it does not linger in the notebook namespace.
del output