In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [None]:
import tomllib

# Read the project configuration that lives one directory above the notebook.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

# The data file location is assembled from the `processed` and `current` config entries.
datafile = (Path("..") / config["processed"] / config["current"]).resolve()
if not datafile.exists():
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")
    # Fail fast with the resolved path instead of letting the parquet reader
    # produce a less helpful traceback a line later.
    raise FileNotFoundError(datafile)
df = pd.read_parquet(datafile)
df.head()

In [None]:
def convert_to_decimal_hours(timestamp):
    """Return the time of day of `timestamp` as a fractional hour.

    The value is built from the `hour`, `minute` and `second` attributes of
    `timestamp`, so 12:30:00 becomes 12.5. Anything finer than whole seconds
    is not taken into account.
    """
    whole_hours = timestamp.hour
    minute_fraction = timestamp.minute / 60
    second_fraction = timestamp.second / 3600
    return whole_hours + minute_fraction + second_fraction

# Time of day of every message as a fractional hour (e.g. 12:30 -> 12.5).
df["hour"] = df["timestamp"].apply(convert_to_decimal_hours)
# Natural log of the message length, computed on the whole column at once
# rather than element by element. Note that a length of 0 maps to -inf.
df["log_len"] = np.log(df["message_length"])
df.head()

In [None]:
# Log message length against time of day; low alpha keeps dense regions readable.
sns.scatterplot(
    x="hour",
    y="log_len",
    data=df,
    alpha=0.2,
)

In [None]:
# One row per author: mean message length, mean of the has_emoji flag,
# and the number of rows that author contributed.
p = (
    df.groupby(["author"])
    .agg({"message_length": "mean", "has_emoji": "mean", "author": "count"})
    .rename(columns={"author": "count"})
)

# Drop authors with 10 rows or fewer before plotting.
p = p.loc[p["count"] > 10]
sns.scatterplot(x="message_length", y="has_emoji", data=p, alpha=0.5)
p

In [None]:
# Same per-author scatter, with the marker area scaled by the author's row count.
sns.scatterplot(
    x="message_length",
    y="has_emoji",
    size="count",
    sizes=(10, 500),
    data=p,
    alpha=0.3,
)

In [None]:
# `p` is a boolean-filtered slice, so assigning columns on it in place triggers
# pandas' chained-assignment warning. `assign` builds a fresh frame instead.
# The lambdas run in order, so `color` compares the already rescaled columns.
p = p.assign(
    # Divide each metric by its maximum so both share one axis (largest value becomes 1).
    message_length=lambda d: d["message_length"] / d["message_length"].max(),
    has_emoji=lambda d: d["has_emoji"] / d["has_emoji"].max(),
    # Grey when the rescaled length exceeds the rescaled emoji share, red otherwise.
    color=lambda d: np.where(d["message_length"] > d["has_emoji"], "grey", "red"),
)

# Slope chart: message length at x=0, emoji share at x=1, one connecting line per author.
sns.scatterplot(data=p, y="message_length", x=0, color="grey")
sns.scatterplot(data=p, y="has_emoji", x=1, color="grey")
for _author, row in p.iterrows():
    sns.lineplot(x=[0, 1], y=[row["message_length"], row["has_emoji"]], color=row["color"])

# The map is not the terrain

A model is always an inaccurate representation of reality. That is not a problem, but a useful feature: the simplification of reality allows us to spot patterns and trends that might otherwise be lost in the details.

The problem is that simplifying reality always carries the risk of bending the truth to fit your story, and it isn't always obvious which model is the "best".

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType

# Create the penguins dataset factory and run its download step.
penguinsdataset = DatasetFactoryProvider.create_factory(DatasetType.PENGUINS)
penguinsdataset.download_data()

# NOTE: this rebinds `df`; from here on it holds the penguins table,
# not the message data loaded at the top of the notebook.
df = pd.read_parquet(penguinsdataset.filepath)

# Columns kept for the regression examples below.
select = [
    "Species", "Island",
    "Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)",
    "Delta 15 N (o/oo)", "Delta 13 C (o/oo)",
    "Sex", "Body Mass (g)",
]
# Rows with a missing value in any selected column are dropped.
subset = df.loc[:, select].dropna()

In [None]:
# Display the NaN-free selection of penguin columns built above.
subset

In [None]:
# Scatter of body mass against culmen length with a straight-line fit.
sns.lmplot(
    x="Culmen Length (mm)",
    y="Body Mass (g)",
    data=subset,
    fit_reg=True,
)

In [None]:
from scipy import stats

# Least-squares line of body mass on culmen length; x and y are reused by later cells.
x, y = subset["Culmen Length (mm)"], subset["Body Mass (g)"]
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(f"The model is y = {slope:.2f}x + {intercept:.2f}, with R^2 = {r_value**2:.2f}")



In [None]:
# Same scatter, now with a second-order polynomial fit.
sns.lmplot(
    x="Culmen Length (mm)",
    y="Body Mass (g)",
    data=subset,
    order=2,
)

In [None]:
# Second-order polynomial fit; coefficients come back highest power first.
model = np.polyfit(x, y, 2)

# R^2 = 1 - SS_res / SS_tot, reported so this fit can be compared with the
# linear and log-linear fits, which print their R^2 as well.
residuals = y - np.polyval(model, x)
r_squared = 1 - (residuals**2).sum() / ((y - y.mean()) ** 2).sum()
print(
    f"The model is y = {model[0]:.2f}x^2 + {model[1]:.2f}x + {model[2]:.2f}, "
    f"with R^2 = {r_squared:.2f}"
)

In [None]:
# Same scatter, with the regression estimated on log(x).
sns.lmplot(
    x="Culmen Length (mm)",
    y="Body Mass (g)",
    data=subset,
    logx=True,
)

In [None]:
# Linear regression of y on log(x). This overwrites slope, intercept, r_value,
# p_value and std_err from the plain linear fit.
log_x = np.log(x)
slope, intercept, r_value, p_value, std_err = stats.linregress(log_x, y)
print(f"The model is y = {slope:.2f} log(x) + {intercept:.2f}, with R^2 = {r_value**2:.2f}")

In [None]:
# Same scatter, with a LOWESS smoother instead of a parametric fit.
sns.lmplot(
    x="Culmen Length (mm)",
    y="Body Mass (g)",
    data=subset,
    lowess=True,
)

LOWESS stands for Locally Weighted Scatterplot Smoothing. It is a non-parametric regression method that fits a smooth curve to the data. It is a useful tool to explore the relationship between two variables, but it is not a model in the traditional sense: it does not produce a single equation with coefficients we can report or interpret. It is a tool to help us understand the data, not to make predictions.