# Analysis of Soybean Data

## Setup

In [None]:
# 3rd party library imports
import matplotlib.pyplot as plt                                                    
import numpy as np                                                                 
import pandas as pd                                                                
from scipy.stats import t as tdist                                                 
import seaborn as sns                                                              
import statsmodels.formula.api as smf                                              
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Apply seaborn's default theme. set_theme() is the preferred spelling;
# sns.set() is only kept by seaborn as a legacy alias for it.
sns.set_theme()

# Load the data and strip surrounding whitespace from the header names in one
# expression, so the formulas below can refer to columns by their bare names.
df = pd.read_csv('case1402.csv').rename(columns=str.strip)

## Exploratory Analysis
### Display 14.12: Scatterplots of soybean yield (log scale) versus ozone and sulphur dioxide, with different plotting symbols to indicate water stress

In [None]:
# 2x2 grid of strip plots: one row per yield column (Forrest, William) and one
# column per pollutant (O3, SO2), with points coloured by water stress.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=[12.8, 6.4])

for row, yield_col in enumerate(['Forrest', 'William']):
    for col, pollutant in enumerate(['O3', 'SO2']):
        # only the top-left panel carries the hue legend
        is_legend_panel = (row == 0 and col == 0)
        sns.stripplot(
            data=df, x=pollutant, y=yield_col, hue='Stress',
            log_scale=(False, True),  # logarithmic y axis only, for every panel
            legend='auto' if is_legend_panel else False,
            ax=axes[row][col],
        )

# move_legend repositions the legend seaborn already drew and keeps its title;
# calling ax.legend(loc=...) instead would build a fresh, untitled legend.
sns.move_legend(axes[0][0], 'lower left')

# remove x labels from top row (same x variable as the panel underneath)
for ax in axes[0]:
    ax.get_xaxis().set_visible(False)

# remove y labels from right column (same y variable as the panel to the left)
for ax in axes[:, 1]:
    ax.get_yaxis().set_visible(False)

fig.tight_layout()

The effect of ozone looks to be linear on the log scale, while the effect of sulphur dioxide is unclear.

### Assessment of Linearity in Ozone

In [None]:
# Forrest yield on the log scale, with linear and quadratic ozone terms plus
# categorical SO2 and water-stress effects (treatment coding).
# NOTE(review): Treatment(reference=0.00) presumably requires 0.00 to be one of
# the SO2 levels present in the data -- confirm against the CSV.
fformula = (
    'np.log(Forrest) ~ O3 + I(O3**2)'
    ' + C(SO2, Treatment(reference=0.00))'
    ' + C(Stress, Treatment(reference="Well-watered"))'
)
fmodel = smf.ols(data=df, formula=fformula).fit()
fmodel.summary()

In [None]:
# William yield on the log scale, with linear and quadratic ozone terms plus
# categorical SO2 and water-stress effects (treatment coding).
# NOTE(review): Treatment(reference=0.00) presumably requires 0.00 to be one of
# the SO2 levels present in the data -- confirm against the CSV.
wformula = (
    'np.log(William) ~ O3 + I(O3**2)'
    ' + C(SO2, Treatment(reference=0.00))'
    ' + C(Stress, Treatment(reference="Well-watered"))'
)
wmodel = smf.ols(data=df, formula=wformula).fit()
wmodel.summary()

In [None]:
# Residuals-versus-fitted plots for the two fitted models, side by side on
# shared x and y axes so the panels can be compared directly.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=[12.8, 6.4], sharex=True, sharey=True)

panels = [(fmodel, 'Forrest Residuals'), (wmodel, 'William Residuals')]
for ax, (model, ylabel) in zip(axes, panels):
    sns.scatterplot(x=model.fittedvalues, y=model.resid, ax=ax)
    # zero reference line: residuals should scatter evenly around it
    ax.axhline(0, color='0.5', linestyle='--', linewidth=1)
    # both responses are modelled as np.log(...), so fitted values are log yields
    ax.set_xlabel('Fitted values (log yield)')
    ax.set_ylabel(ylabel)