# Breakdown Times for Insulating Fluid Under Different Voltages

How does the distribution of breakdown time depend on voltage?

In [None]:
# standard library imports
import warnings

warnings.simplefilter('ignore', category=FutureWarning)

# 3rd party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Apply seaborn's default theme to every figure drawn below.
sns.set()

# Load the data set; later cells read its Time, Voltage and Group columns.
df = pd.read_csv('case0802.csv')

## Summary Statistics and Graphical Display

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

In [None]:
fig, axes = plt.subplots()

# Aggregate once and name the columns explicitly: a bare
# df.groupby('Group').mean() averages every remaining column, which
# raises on pandas >= 2.0 if the frame holds any non-numeric column.
by_group = df.groupby('Group')
group_means = by_group[['Voltage', 'Time']].mean()
voltage_means = group_means['Voltage']
time_means = group_means['Time']

# Solid line through the per-group means.
sns.lineplot(data=group_means, x='Voltage', y='Time', ax=axes)

# Dashed black overlay: the same means with +/- 1 sample-SD error bars.
axes.errorbar(
    voltage_means, time_means,
    yerr=by_group['Time'].std(),
    color='black', linewidth=0.5, linestyle='--', capsize=5.0,
)

# Raw observations, coloured by group.
sns.scatterplot(data=df, x='Voltage', y='Time', hue='Group', ax=axes)
fig.set_figwidth(12)

Because the group means follow a curve rather than a straight line and the SDs are decreasing, a transformation of the response is clearly needed.

In [None]:
# Store the natural log of the breakdown time as a new column for the plots below.
df['logtime'] = df['Time'].pipe(np.log)

In [None]:
fig, axes = plt.subplots()

# Per-group mean of log(Time) against mean voltage, with +/- 1 sample-SD bars.
by_group = df.groupby('Group')
time_means = by_group['logtime'].mean()
voltage_means = by_group['Voltage'].mean()
logtime_sd = by_group['logtime'].std()
axes.errorbar(
    voltage_means,
    time_means,
    yerr=logtime_sd,
    color='black',
    linewidth=0.5,
    linestyle='--',
    capsize=5.0,
)

# Individual observations on the log scale, coloured by group.
sns.scatterplot(data=df, x='Voltage', y='logtime', hue='Group', ax=axes)

fig.set_figwidth(12)

Equal variance of subpopulations still seems like a problem.

## Linear Model

In [None]:
# Ordinary least squares: log breakdown time regressed on voltage.
model = smf.ols('np.log(Time) ~ Voltage', data=df)
res = model.fit()

# Coefficient table, standard errors and fit diagnostics as plain text.
print(res.summary())

$\mu\{\log(\text{Time}) \mid \text{Voltage}\} = 18.9555 - 0.5074 \cdot \text{Voltage}$

We estimate that each one-unit increase in voltage is associated with a 40% decrease in median breakdown time ($1 - e^{-0.5074} \approx 0.40$). We are 95% confident that the true percentage decrease is between 32% and 46%.

In [None]:
# Fitted regression line drawn over the log-scale observations.
fig, ax = plt.subplots()
voltage = df['Voltage']
sns.lineplot(x=voltage, y=res.fittedvalues, ax=ax)
sns.scatterplot(data=df, x='Voltage', y='logtime', ax=ax)
ax.set_ylabel('log time');

## Robustness of Assumptions

### Normality

In [None]:
# The regression normality assumption is about the errors, not the marginal
# distribution of the response, so the Q-Q plot is drawn on the model
# residuals. fit=True standardises them before comparison with the
# 45-degree reference line.
fig, ax = plt.subplots()
sm.graphics.qqplot(res.resid, line='45', fit=True, ax=ax)
ax.set_title('Normal Q-Q plot of residuals')
plt.show()

Normality looks ok.

### Equal Variance

In [None]:
# Residuals against fitted values; a roughly constant vertical spread
# across the x-axis is what the equal-variance assumption predicts.
fitted = res.fittedvalues
fig, ax = plt.subplots()
sns.scatterplot(x=fitted, y=res.resid, ax=ax)
ax.set(xlabel='Fitted Values', ylabel='Residuals');

Equal variance looks ok.