# Do Female Fish have Pre-Existing Preferences for a Male Trait?

In [None]:
# standard library imports
import warnings

# Silence FutureWarning messages. The filter is installed before the
# third-party imports below so it is already active when they run.
warnings.simplefilter('ignore', category=FutureWarning)

# 3rd party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Apply seaborn's default plot styling to the figures drawn in this notebook.
sns.set()

In [None]:
# Read the case study data; the CSV is expected in the working directory.
df = pd.read_csv(filepath_or_buffer='case0602.csv')

# Robustness of Assumptions

In [None]:
# Summary statistics of Proportion within each Pair, used to compare the
# group centres and spreads.
proportion_by_pair = df.groupby('Pair')['Proportion']
proportion_by_pair.describe()

In [None]:
# Side-by-side view of Proportion by Pair: box plots on the left,
# kernel density estimates (one curve per Pair) on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6.4))
left_ax, right_ax = axes
_ = sns.boxplot(x='Pair', y='Proportion', data=df, ax=left_ax)
_ = sns.kdeplot(x='Proportion', hue='Pair', data=df, ax=right_ax)

There is strong evidence against the assumption of equal variance across pairs.  ANOVA relies on that assumption, so the results that follow should be read with caution; we proceed with the analysis anyway.

# Is there any evidence of percentage differences between groups?

$$
\begin{aligned}
H_0:  & \: \text{The pair mean proportions are all the same} \\
H_a:  & \: \text{At least one pair mean is different}
\end{aligned}
$$

In [None]:
# OLS regression of Proportion on Pair; the fitted results feed the
# ANOVA table computed from `results`.
model = smf.ols(formula='Proportion ~ Pair', data=df)
results = model.fit()
results.summary()

In [None]:
# Sequential (type 1) ANOVA table for the fitted model.
sm.stats.anova_lm(results, typ=1)

There is no evidence that the group means are different ($F_{5,78}$ = 0.77, $p$-value = 0.57).

In [None]:
# One-sample t-test of all Proportion values (pairs pooled) against a
# hypothesised mean of 0.50.
res = scipy.stats.ttest_1samp(df['Proportion'], popmean=0.50)
print(res)
# 95% confidence interval for the mean Proportion.
print(res.confidence_interval(confidence_level=0.95))

There is strong evidence that females spend a higher proportion of their time with yellow-sword males than with transparent-sword males ($t_{83}$ = 7.2276, $p$-value < 0.0001).  We are 95% confident that females spend, on average, between 58.5% and 65.4% of their time with yellow-sword males.

# Is there evidence of any linear trend?

In [None]:
# Contrast coefficients: each pair's mean Length, centred on the average of
# the pair means so that the coefficients sum to zero.
mean_length_by_pair = df.groupby('Pair')['Length'].mean()
ci = mean_length_by_pair - mean_length_by_pair.mean()
ci

In [None]:
# Fitted OLS results for Proportion ~ Pair; from here on `model` refers to
# the fitted results rather than the unfitted model object.
model = smf.ols(formula='Proportion ~ Pair', data=df).fit()

In [None]:
# t-test for a linear contrast of the pair mean proportions, using the
# coefficients `ci` (centred pair means of Length).
proportion_groups = df.groupby('Pair')['Proportion']
n = proportion_groups.count()

# Contrast estimate: g = sum_i c_i * ybar_i.
g = ci @ proportion_groups.mean()

# Pooled standard deviation: square root of the ANOVA residual mean square.
anova_table = sm.stats.anova_lm(model)
sp = np.sqrt(anova_table.loc['Residual', 'mean_sq'])

# SE(g) = sp * sqrt(sum_i c_i^2 / n_i).
se = sp * np.sqrt((ci ** 2) @  (1 / n))
t = g / se

# Residual degrees of freedom: observations minus number of groups.
dof = n.sum() - len(n)

# Lower-tail (one-sided) p-value. It is only informative when t is negative;
# for a positive t it exceeds 0.5.
pvalue = scipy.stats.t.cdf(t, dof)

# Two-sided p-value: the question is whether there is *any* linear trend, so
# a large |t| in either direction counts as evidence.
pvalue_two_sided = 2 * scipy.stats.t.sf(np.abs(t), dof)

print(t, pvalue, dof)
print('two-sided p-value:', pvalue_two_sided)

There is no evidence of a linear trend ($t_{78}$ = -0.4528, one-sided $p$-value = 0.3260; two-sided $p$-value ≈ 0.65).