In [None]:
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold
import numpy as np
from scipy import stats as st
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from scipy.stats import f_oneway
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Measuring Feature Relevance with ... Linear Models

- $n$, the number of instances, rows
- $p$, the number of features, columns

### The Model

$$y=\beta_0+\beta_1x_1+\dots+\beta_px_p$$

### The Mean

$$\mu = \text{E}[y]=\frac1n\sum y$$

### The Variance

How far off will a measurement be from the mean?

$$\text{Var}(y)=\text{E}\left[(y-\mu)^2\right]$$

### TSS, the Total Sum of Squares
TSS measures the total variance in $y$ i.e. the variance  before the regression is performed.

$$\text{TSS}=\sum (y_i-\mu)^2= n\,\text{Var}(y)$$

### RSS, the Residual Sum of Squares

RSS measures the variance left unexplained after performing the regression. 

$$\text{RSS}=\sum (y_i-\hat f(x_i))^2$$


### $R^2$, the Coefficient of Determination

$$R^2=\frac{\text{TSS}-\text{RSS}}{\text{TSS}} = 1 - \frac{\text{RSS}}{\text{TSS}}$$

$\text{TSS} - \text{RSS}$ measures the amount of variability in the response that is explained by performing the regression.

$R^2$ measures the proportion of variance in $y$ that can be explained using $x$.

## Test Statistics

A test statistic is a statistic (a quantity derived from the sample) used in statistical hypothesis testing.

A hypothesis test is typically specified in terms of a test statistic, considered as a numerical summary of a data-set that reduces the data to one value that can be used to perform the hypothesis test. 

In general, a test statistic is selected or defined in such a way as to quantify, within observed data, behaviours that would distinguish the null from the alternative hypothesis.

### F-Statistic

$$F = \frac{(\text{TSS}-\text{RSS})/p}{\text{RSS}/(n-p-1)} \propto \frac{\text{TSS}-\text{RSS}}{\text{RSS}} = \frac{\text{explained variance}}{\text{unexplained variance}}$$

### Assessing Relevance
1. Is at least one of the predictors $x_1 , x_2 , \dots , x_p$ useful in predicting the response?
2. Do all the predictors help to explain $y$, or is only a subset of the predictors useful?
3. How well does the model fit the data?
4. Given a set of predictor values, what response value should we predict, and how accurate is our prediction?

### One: Is There a Relationship Between the Response and Predictors?

In [None]:
# Column labels applied to the diabetes feature matrix when it is wrapped in a DataFrame.
cols_diab = [
    'Age',
    'Sex',
    'Body_mass_index',
    'Average_blood_pressure',
    'S1', 'S2', 'S3', 'S4', 'S5', 'S6',
]


    ==============      ==================
    Samples total       442
    Dimensionality      10
    Features            real, -.2 < x < .2
    Targets             integer 25 - 346
    ==============      ==================

In [None]:
# Load the diabetes data as a (features, target) pair, then label the feature columns.
diab_features, y_diab = load_diabetes(return_X_y=True)
X_diab = pd.DataFrame(data=diab_features, columns=cols_diab)

In [None]:
# Transposed display: one row per feature, one column per sample.
X_diab.transpose()

In [None]:
# Append the response as a trailing 'target' column so that model formulas
# can refer to the features and the target by name in a single frame.
full_data_diab = X_diab.assign(target=y_diab)

In [None]:
# Scatter-plot matrix of every feature and the target against each other.
sns.pairplot(data=full_data_diab)

## Classical Hypothesis Testing

Given a sample and an apparent effect, what is the probability of seeing such an effect by chance?

1. Choose a **test statistic**
1. Define a **null hypothesis**
1. Compute a **p-value**
1. Interpret the result:

If the p-value is low, the effect is said to be **statistically significant**, which means that it is unlikely to have occurred by chance. 

1. test stat: t-statistic of a feature
2. **H**$_0$: the feature is not important in predicting the target
3. Compute p-values:

### For Simple Linear Regression, our Test Statistic is typically the $t$-statistic

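$$t_{\widehat{\beta}} = \frac{\widehat\beta - \beta_{\text{null}}}{\mathrm{s.e.}(\widehat\beta)} $$

Here $\beta_{\text{null}}$ is the value of the coefficient under the null hypothesis (usually $0$); it is not the intercept $\beta_0$ of the model.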

R notation:

    "target ~ Body_mass_index"
    
`target` can be predicted by `Body_mass_index`.

$$y = \beta_0 + \beta_1x_1$$

where $y$ is the target and $x_1$ is the `Body_mass_index`.

$$\text{H}_0: \beta_1=0$$

In [None]:
# Simple linear regression of the target on Body_mass_index alone.
bmi_formula = "target ~ Body_mass_index"
model = ols(formula=bmi_formula, data=full_data_diab).fit()
model.summary()

#### Can we reject the null hypothesis?

In [None]:
# Show the p-values of the most recently fitted `model`, one entry per model term.
model.pvalues

### Individual Linear Regression against all Features

In [None]:
# Regress the target on each feature separately and print the p-value of
# that feature's coefficient.
for feat in cols_diab:
    model = ols("target ~ {}".format(feat), full_data_diab).fit()
    # `pvalues` is a Series labelled by term name ('Intercept', then the
    # feature). Look the value up by label: using an integer key as a
    # position on a label-indexed Series is deprecated in pandas.
    print("{:25} {}".format(feat, model.pvalues[feat]))

In [None]:

# "+".join(cols_diab) glues all feature names together with '+', giving a
# formula that regresses the target on every feature at once.
all_features_formula = "target ~ {}".format("+".join(cols_diab))

# Note: a feature's p-value changes depending on whether it is regressed
# individually or together with the other features.
model = ols(formula=all_features_formula, data=full_data_diab).fit()
model.summary()

### Two: Deciding on Important Variables

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

In [None]:
iris = load_iris()
iris_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# One frame holding the four measurements plus the `species` label column.
iris_features = pd.DataFrame(iris.data, columns=iris_cols)
full_data_iris = iris_features.assign(species=iris.target)

In [None]:
# Transposed display: one row per column of the iris frame, one column per sample.
full_data_iris.transpose()

In [None]:
# Regress the species label on the two sepal measurements only.
sepal_formula = "species ~ sepal_width + sepal_length"
model = ols(formula=sepal_formula, data=full_data_iris).fit()
model.summary()

In [None]:
# Regress the species label on all four measurements.
four_feature_formula = "species ~ sepal_width + sepal_length + petal_length + petal_width"
model = ols(formula=four_feature_formula, data=full_data_iris).fit()
model.summary()

In [None]:
# Same regression without sepal_width; the trailing "- 1" is the formula
# syntax for omitting the intercept term.
reduced_formula = "species ~ sepal_length + petal_length + petal_width - 1"
model = ols(formula=reduced_formula, data=full_data_iris).fit()
model.summary()

### Analysis of Variance

In [None]:
# One box plot per iris measurement, grouped by species, laid out side by side.
fig, axes = plt.subplots(1, len(iris_cols), figsize=(20, 6))
for ax, feat in zip(axes, iris_cols):
    # x and y must be passed by keyword: recent seaborn releases accept only
    # `data` positionally. Drawing on an explicit `ax` avoids relying on
    # matplotlib's "current axes" state.
    sns.boxplot(x='species', y=feat, data=full_data_iris, ax=ax)

### One-Way Anova

Use one-way ANOVA when you have one nominal variable and one measurement variable; the nominal variable divides the measurements into two or more groups. It tests whether the means of the measurement variable are the same for the different groups.

#### Null hypothesis
The statistical null hypothesis is that the means of the measurement variable are the same for the different categories of data; the alternative hypothesis is that they are not all the same. 

$$H_0:\mu_1=\mu_2=\dots=\mu_k$$

where $k$ is the number of groups (here, the three species).

In [None]:
# Group the iris rows by their species label for per-class summaries.
species_groups = full_data_iris.groupby(by='species')

In [None]:
# Per-species variance and mean of every measurement.
summary_stats = ['var', 'mean']
species_groups.agg(summary_stats)

In [None]:
# Split the measurements into one sample per species label (0, 1, 2) and
# run a one-way ANOVA across the three samples; `results.pvalue` holds one
# entry per measurement column.
samples_by_species = [
    full_data_iris.loc[full_data_iris['species'] == label, iris_cols]
    for label in range(3)
]
results = f_oneway(*samples_by_species)
list(zip(iris_cols, results.pvalue))

### Univariate Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from sklearn import svm

In [None]:
# Twenty columns of uniform noise in [0, 0.1), one row per iris sample.
# Drawn from a seeded generator so that Restart & Run All reproduces the
# same noise (and therefore the same scores and plots) every time.
rng = np.random.default_rng(0)
E = rng.uniform(0, 0.1, size=(len(iris.data), 20))

In [None]:
# Feature matrix: the iris measurements followed by the noise columns.
X = np.concatenate([iris.data, E], axis=1)
y = iris.target

In [None]:
# One bar position per feature column.
feature_count = X.shape[-1]
X_indices = np.arange(feature_count)

# Univariate F-test per feature, keeping the top 10 percent.
selector = SelectPercentile(score_func=f_classif, percentile=10)
selector.fit(X, y)

# Turn p-values into scores (-log10 p) and scale so the largest score is 1.
scores = -np.log10(selector.pvalues_)
scores = scores / scores.max()

plt.bar(X_indices - .45, scores, width=.2, color='darkorange',
        label=r'Univariate score ($-Log(p_{value})$)')

In [None]:
def _squared_weight_profile(fitted_clf):
    """Sum the squared linear-SVM coefficients per feature and scale so the maximum is 1."""
    profile = (fitted_clf.coef_ ** 2).sum(axis=0)
    return profile / profile.max()


# Linear SVM trained on every feature column.
clf = svm.SVC(kernel='linear').fit(X, y)
svm_weights = _squared_weight_profile(clf)

plt.bar(X_indices - .25, svm_weights, width=.2, color='navy',
        label='SVM weight')

# Linear SVM trained only on the columns kept by the univariate selector.
clf_selected = svm.SVC(kernel='linear').fit(selector.transform(X), y)
svm_weights_selected = _squared_weight_profile(clf_selected)

kept_positions = X_indices[selector.get_support()]
plt.bar(kept_positions - .05, svm_weights_selected, width=.2, color='c',
        label='SVM weights after selection')
