In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

# Improving Regression

- Review Multivariate Linear Regression
- Coding Qualitative Variables
- Polynomial Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
ads = pd.read_csv('data/ads.csv', index_col = 0)

In [None]:
ads.head()

In [None]:
# The previous cell already shows ads.head(); show summary statistics here
# instead of repeating the same preview.
ads.describe()

In [None]:
corr_mat = ads.corr()

In [None]:
# Draw the correlation matrix as a heatmap on a fresh figure.
fig, ax = plt.subplots()
sns.heatmap(corr_mat, cmap='magma', ax=ax)

In [None]:
model2 = smf.ols('sales ~ TV', data = ads).fit()
model2.summary2()

In [None]:
# Look the coefficients up by their term names. `params` is a Series indexed by
# 'Intercept' and 'TV'; integer keys on such a Series rely on a positional
# fallback that pandas has deprecated.
m = model2.params['TV']         # slope: change in sales per unit of TV spend
b = model2.params['Intercept']  # intercept of the fitted line

In [None]:
def l(x):
    """Evaluate the fitted line m*x + b at x, using the notebook-level m and b."""
    fitted = m*x + b
    return fitted

In [None]:
# Evaluate the fitted line on an even grid spanning the observed TV range.
# Series.min()/max() are vectorised and skip missing values, unlike the
# Python builtins.
x = np.linspace(ads['TV'].min(), ads['TV'].max(), 1000)
plt.figure()
plt.scatter(ads['TV'], ads['sales'], alpha = 0.3)
plt.plot(x, l(x), '--r')
plt.xlabel("Television")
plt.ylabel("Sales")
plt.title("Sales vs. Television with fitted line")

In [None]:
# One single-predictor regression of sales per remaining advertising channel.
model3, model4 = (
    smf.ols(f'sales ~ {channel}', data=ads).fit()
    for channel in ('radio', 'newspaper')
)

In [None]:
model3.summary2()

In [None]:
model4.summary2()

In [None]:
# Side-by-side scatterplots of sales against radio and newspaper spend.
# Axis labels are needed because the two panels are otherwise indistinguishable.
plt.figure(figsize = (9, 3))
plt.subplot(121)
plt.scatter(ads['radio'], ads['sales'])
plt.xlabel("Radio")
plt.ylabel("Sales")

plt.subplot(122)
plt.scatter(ads['newspaper'], ads['sales'])
plt.xlabel("Newspaper")
plt.ylabel("Sales")
plt.tight_layout()

In [None]:
model_all = smf.ols('sales ~ TV + radio + newspaper', data = ads).fit()

In [None]:
model_all.summary()

In [None]:
model_TVradio = smf.ols('sales ~ TV + radio', data = ads).fit()
model_TVradio.summary()

In [None]:
model_TVradio.params

In [None]:
# Select the intercept by label; integer keys on this string-indexed Series
# rely on a deprecated positional fallback.
model_TVradio.params['Intercept']

In [None]:
from mpl_toolkits import mplot3d

In [None]:
# 3D view of the two-predictor model: observed sales (default colour) and the
# model's fitted values (red) over the radio/TV plane.
fig = plt.figure()
ax = plt.axes(projection='3d')
x = ads['radio']
y = ads['TV']
z = ads['sales']

ax.scatter3D(x, y, z, label = 'Data')

def pred(x, y):
    """Predicted sales from model_TVradio for radio spend x and TV spend y."""
    coefs = model_TVradio.params
    # Coefficients are looked up by term name so the radio/TV pairing cannot
    # silently flip if the formula's term order changes.
    return coefs['Intercept'] + coefs['radio']*x + coefs['TV']*y

ax.scatter3D(x, y, pred(x,y), color = 'red', label = 'Predictions')

ax.set_xlabel('radio')
ax.set_ylabel('TV')
ax.set_zlabel('sales')
ax.set_title("3D Linear Model")
# The label= arguments above are only shown once a legend is drawn.
ax.legend()

### Qualitative Features

To this point, we've only examined quantitative features.  Here, we follow an example where we can incorporate some qualitative features into our analysis.  In our dataset below, we have four variables that are qualitative:

    Gender, Student, Married, Ethnicity
    
We begin by considering the relationship between `Gender` and `Balance`.

In [None]:
credit = pd.read_csv('data/credit.csv', index_col = 'Unnamed: 0')

In [None]:
credit.info()

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
scatter_matrix(credit);

In [None]:
lm = smf.ols('Balance ~ Gender', data = credit).fit()

In [None]:
lm.summary2()

In [None]:
credit.head()

In [None]:
credit.info()

In [None]:
credit['Gender'].value_counts()

### Interpretation and Dummy Variables

The idea above is that the intercept can be understood as the average for the 0 category, and the coefficient as the difference between the two categories.  Further, the sum of the intercept and the coefficient is the average value for the 1 category.  

As we've discussed, we want to introduce quantitative data to many machine learning algorithms, so we should consider adding a dummy variable for this column.  We can follow our earlier example.

In [None]:
# One indicator column per Gender level. dtype=int forces 0/1 integers: recent
# pandas versions return bool columns by default, which the formula interface
# would treat as categorical rather than numeric.
gender_dummies = pd.get_dummies(credit.Gender, prefix='Gender', dtype=int)

In [None]:
gender_dummies.head()

In [None]:
credit['Gender_Female'] = gender_dummies['Gender_Female']

In [None]:
credit.head()

In [None]:
gender_model = smf.ols('Balance ~ Gender_Female', data = credit).fit()
gender_model.summary2()

#### Problem

Using the `Credit` dataset above, add encoding to the other binary categorical variables.  Fit a basic Linear Model to one or two of these new columns against the `Balance` column.  Interpret your findings in terms of the categories.

### More than two Categories

Here, we need more than one dummy variable and will subsequently run a linear regression on both of these columns and interpret the results accordingly.  In our credit dataset, we have a three-valued column, `Ethnicity`.  From this, we will create a model where:

$$y_i = \beta_0 + \beta_1x_{i1} + \beta_2x_{i2} + \epsilon_i $$

where $x_{i1} = 1$ if the $i$th person is Asian and 0 otherwise, and similarly $x_{i2}$ for Caucasian.  Again, this assignment is arbitrary.  However, we can interpret the model as:

- $\beta_0 + \beta_1 + \epsilon_i$: if $i$th person is Asian
- $\beta_0 + \beta_2 + \epsilon_i$: if $i$th person is Caucasian
- $\beta_0 +\epsilon_i$: if $i$th person is African American

In [None]:
credit['Ethnicity'].value_counts()

In [None]:
# One indicator column per Ethnicity level, as 0/1 integers. Recent pandas
# versions default to bool, which the formula interface treats as categorical.
ethn_dummies = pd.get_dummies(credit.Ethnicity, dtype=int)

In [None]:
ethn_dummies.head()

In [None]:
# Copy only the Asian and Caucasian indicators onto `credit`. The remaining
# level gets no column of its own, so it is the case where both indicators are 0.
credit['ethn_asian'] = ethn_dummies['Asian']
credit['ethn_cauc'] = ethn_dummies['Caucasian']

In [None]:
lin_tre = smf.ols('Balance ~ ethn_asian + ethn_cauc', data = credit).fit()

In [None]:
lin_tre.summary2()

We interpret these results as saying that the average balance for African Americans is \$531.00, the Asian category carries \$18.69 less than this on average, and the Caucasian category carries \$12.50 less than the African American category.

### Problem

Examine a multiple regression model on the `Credit` dataset provided after appropriately coding all categorical variables and dealing with any missing values.  Make a single markdown cell containing a scatterplot and the fitted line and the RMSE. (to save a plot you can type `plt.savefig()` and pass a filename for saving the image, subsequently displaying it in a markdown cell with `![](path/to/image.png)`)

### Polynomial Regression

We have seen what the relationship between these variables looks like when modeled as a straight line, but could a polynomial shape do better?  Let's first consider the simple polynomial case.  

In [None]:
mpg = pd.read_csv('data/mtcars.csv')

In [None]:
mpg.info()

In [None]:
plt.figure()
plt.scatter(mpg['hp'], mpg['mpg'])

In [None]:
# Degree-1 least-squares fit of mpg on hp, drawn over the sorted hp values.
lin = np.polyfit(x=mpg['hp'], y=mpg['mpg'], deg=1)
lin_p = np.poly1d(lin)

x = mpg['hp'].sort_values()
plt.plot(x, np.polyval(lin, x), label = 'Linear')

In [None]:
# Degree-2 least-squares fit of mpg on hp, drawn over the same sorted x.
quad = np.polyfit(x=mpg['hp'], y=mpg['mpg'], deg=2)
quad_p = np.poly1d(quad)

plt.plot(x, np.polyval(quad, x), label = 'Quadratic')

In [None]:
# Degree-14 least-squares fit of mpg on hp, then a frameless legend for all curves.
many = np.polyfit(x=mpg['hp'], y=mpg['mpg'], deg=14)
big_p = np.poly1d(many)

plt.plot(x, np.polyval(many, x), label = 'Degree 14')
plt.legend(frameon = False)

**Determining Shape**


One way to look at whether there is a quadratic relationship between variables is to examine the graph of the residuals.  Below, we construct residual plots for the linear and quadratic case that include a fitted line.  Note the lack of pattern in the quadratic fit.

In [None]:
# Residual plots for mpg regressed on hp: linear fit (left), quadratic (right).
# hp is the predictor (x) and mpg the response (y), matching the polyfit calls
# above. x/y are passed by keyword because current seaborn rejects them as
# positional arguments.
plt.figure(figsize = (10, 5))
plt.subplot(1, 2, 1)
sns.residplot(x = 'hp', y = 'mpg', data = mpg, lowess = True)
plt.title("Linear fit residuals")

plt.subplot(1, 2, 2)
sns.residplot(x = 'hp', y = 'mpg', data = mpg, order = 2, lowess = True)
plt.title("Quadratic fit residuals")
plt.tight_layout()

### More than One Polynomial Feature

While a polynomial in 2-Dimensions looks like

$$ y = a_0 + a_1x + a_2x^2 + ... + a_nx^n $$

A quadratic polynomial in 3-Dimensions could look something like:

$$ f(x, y) = ax^2 + bx + cy^2 + dy + exy  + f$$

Note the existence of the $exy$ term, where the variables $x$ and $y$ interact.  We can see something like this in our advertising data.  Let's first create a new column that combines the TV and radio columns through multiplication.  We can consider this in a 2D plot against sales.

In [None]:
ads['TVradio'] = ads.TV * ads.radio

In [None]:
ads.head()

In [None]:
plt.figure()
plt.scatter(ads['TVradio'], ads['sales'])

In [None]:
quad = np.polyfit(ads.TVradio, ads.sales, 2)

In [None]:
quad_p = np.poly1d(quad)

In [None]:
x = ads.TVradio.sort_values()

In [None]:
plt.plot(x, quad_p(x), color = 'red', linewidth = 5)

We want to include the individual terms that make up the interaction term in our original model.  Thus, we will need a 3D quadratic polynomial for our model in the advertising data.  The smoothest way I know to accomplish this is to use the `PolynomialFeatures` class from scikit-learn.  Below, we create an instance of `PolynomialFeatures`, create a single object containing the input variables, and transform these values with the `.fit_transform()` method.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias = False)

In [None]:
X = ads[['TV', 'radio']]

In [None]:
X_poly = poly_features.fit_transform(X)

In [None]:
X_poly[0]

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()

In [None]:
lin_reg.fit(X_poly, ads.sales)

In [None]:
lin_reg.intercept_

In [None]:
lin_reg.predict(X_poly)[:10]

In [None]:
lin_reg.score(X_poly, ads.sales)

### Pipelines and Higher Degree Fits

We could use a higher order polynomial also, examining a degree 3 polynomial with the `Pipeline` approach, combining the two operations together.  We will see much more from pipelines moving forward.

In [None]:
# Two-step pipeline: degree-3 polynomial expansion, then a linear fit with
# fit_intercept=False.
model = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=3)),
        ('linear', LinearRegression(fit_intercept=False)),
    ]
)

In [None]:
X = ads[['TV', 'radio']]
y = ads['sales']

In [None]:
model = model.fit(X, y)

In [None]:
model.score(X, y)

In [None]:
# Observed sales against TV, overlaid with the pipeline's predictions in red.
ads.plot.scatter(x='TV', y='sales')
plt.scatter(x=ads['TV'], y=model.predict(X), alpha=0.2, color='red')

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mse = mean_squared_error(y, model.predict(X))

In [None]:
rmse = np.sqrt(mse)

In [None]:
rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit an unconstrained regression tree on the same features for comparison.
# random_state is pinned because scikit-learn permutes features at every split,
# so repeated runs could otherwise differ.
tree_reg = DecisionTreeRegressor(random_state=0)
tree_reg.fit(X, y)
# NOTE: predictions are made on the same rows the tree was fitted on, so the
# error below is training error, not an estimate of out-of-sample error.
tree_predictions = tree_reg.predict(X)
# Rebinds the notebook-level `mse` and `rmse` names with the tree's values.
mse = mean_squared_error(y, tree_predictions)
rmse = np.sqrt(mse)

In [None]:
mse

In [None]:
rmse

### Problem

Investigate the use of `PolynomialFeatures` on the `Credit` dataset.  Does a cubic polynomial significantly improve performance?