In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

# Improving Regression

- Review Multivariate Linear Regression
- Coding Qualitative Variables
- Polynomial Regression

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [3]:
ads = pd.read_csv('data/ads.csv', index_col = 0)

In [4]:
ads.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [5]:
ads.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [6]:
corr_mat = ads.corr()

In [7]:
plt.figure()
sns.heatmap(corr_mat, cmap = 'magma')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x118184fd0>

In [11]:
model2 = smf.ols('sales ~ TV', data = ads).fit()
model2.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.61
Dependent Variable:,sales,AIC:,1042.0913
Date:,2018-04-03 18:38,BIC:,1048.688
No. Observations:,200,Log-Likelihood:,-519.05
Df Model:,1,F-statistic:,312.1
Df Residuals:,198,Prob (F-statistic):,1.47e-42
R-squared:,0.612,Scale:,10.619

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,7.0326,0.4578,15.3603,0.0000,6.1297,7.9355
TV,0.0475,0.0027,17.6676,0.0000,0.0422,0.0528

0,1,2,3
Omnibus:,0.531,Durbin-Watson:,1.935
Prob(Omnibus):,0.767,Jarque-Bera (JB):,0.669
Skew:,-0.089,Prob(JB):,0.716
Kurtosis:,2.779,Condition No.:,338.0


In [12]:
# `params` is a pandas Series indexed by term name ('Intercept', 'TV').
# Look the coefficients up by label: integer keys on a labelled Series are a
# deprecated positional fallback and silently depend on term ordering.
m = model2.params['TV']         # slope of sales with respect to TV spend
b = model2.params['Intercept']  # predicted sales when TV spend is zero

In [13]:
def l(x):
    """Evaluate the fitted line at ``x`` using the notebook-level slope ``m`` and intercept ``b``."""
    slope_term = m*x
    return slope_term + b

In [14]:
# Evenly spaced TV budgets spanning the observed range, used to draw the fitted line.
# Series.min()/.max() are vectorised, unlike the builtins which iterate in Python.
x = np.linspace(ads['TV'].min(), ads['TV'].max(), 1000)
plt.figure()
plt.scatter(ads['TV'], ads['sales'], alpha = 0.3)
plt.plot(x, l(x), '--r')  # dashed red line: simple regression of sales on TV
plt.xlabel("Television")
plt.ylabel("Sales")

<IPython.core.display.Javascript object>

Text(0,0.5,'Sales')

In [15]:
model3 = smf.ols('sales ~ radio', data = ads).fit()
model4 = smf.ols('sales ~ newspaper', data = ads).fit()

In [16]:
model3.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.329
Dependent Variable:,sales,AIC:,1150.6738
Date:,2018-04-03 18:39,BIC:,1157.2704
No. Observations:,200,Log-Likelihood:,-573.34
Df Model:,1,F-statistic:,98.42
Df Residuals:,198,Prob (F-statistic):,4.35e-19
R-squared:,0.332,Scale:,18.275

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,9.3116,0.5629,16.5422,0.0000,8.2016,10.4217
radio,0.2025,0.0204,9.9208,0.0000,0.1622,0.2427

0,1,2,3
Omnibus:,19.358,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.91
Skew:,-0.764,Prob(JB):,0.0
Kurtosis:,3.544,Condition No.:,51.0


In [17]:
model4.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.047
Dependent Variable:,sales,AIC:,1220.6714
Date:,2018-04-03 18:39,BIC:,1227.268
No. Observations:,200,Log-Likelihood:,-608.34
Df Model:,1,F-statistic:,10.89
Df Residuals:,198,Prob (F-statistic):,0.00115
R-squared:,0.052,Scale:,25.933

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,12.3514,0.6214,19.8761,0.0000,11.1260,13.5769
newspaper,0.0547,0.0166,3.2996,0.0011,0.0220,0.0874

0,1,2,3
Omnibus:,6.231,Durbin-Watson:,1.983
Prob(Omnibus):,0.044,Jarque-Bera (JB):,5.483
Skew:,0.33,Prob(JB):,0.064
Kurtosis:,2.527,Condition No.:,65.0


In [18]:
plt.figure(figsize = (9, 3))
plt.subplot(121)
plt.scatter(ads['radio'], ads['sales'])


plt.subplot(122)
plt.scatter(ads['newspaper'], ads['sales'])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x11e761978>

In [19]:
model_all = smf.ols('sales ~ TV + radio + newspaper', data = ads).fit()

In [20]:
model_all.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Tue, 03 Apr 2018",Prob (F-statistic):,1.58e-96
Time:,18:40:24,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [21]:
model_TVradio = smf.ols('sales ~ TV + radio', data = ads).fit()
model_TVradio.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,859.6
Date:,"Tue, 03 Apr 2018",Prob (F-statistic):,4.83e-98
Time:,18:40:28,Log-Likelihood:,-386.2
No. Observations:,200,AIC:,778.4
Df Residuals:,197,BIC:,788.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9211,0.294,9.919,0.000,2.340,3.502
TV,0.0458,0.001,32.909,0.000,0.043,0.048
radio,0.1880,0.008,23.382,0.000,0.172,0.204

0,1,2,3
Omnibus:,60.022,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.679
Skew:,-1.323,Prob(JB):,5.19e-33
Kurtosis:,6.292,Cond. No.,425.0


In [22]:
model_TVradio.params

Intercept    2.921100
TV           0.045755
radio        0.187994
dtype: float64

In [23]:
model_TVradio.params[0]

2.921099912405143

In [24]:
from mpl_toolkits import mplot3d

In [25]:
fig = plt.figure()
ax = plt.axes(projection='3d')
x = ads['radio']
y = ads['TV']
z = ads['sales']

ax.scatter3D(x, y, z, label = 'Data')

def pred(x, y):
    """Predict sales from radio spend ``x`` and TV spend ``y`` using ``model_TVradio``.

    Coefficients are looked up by term name, so each one is paired with its
    own variable regardless of the order of terms in the fitted formula.
    """
    coef = model_TVradio.params
    return coef['Intercept'] + coef['radio']*x + coef['TV']*y

ax.scatter3D(x, y, pred(x, y), color = 'red', label = 'Predictions')

# Label the axes and draw the legend so the two point clouds can be told apart.
ax.set_xlabel("radio")
ax.set_ylabel("TV")
ax.set_zlabel("sales")
ax.legend()
ax.set_title("3D Linear Model")

<IPython.core.display.Javascript object>

Text(0.5,0.92,'3D Linear Model')

### Qualitative Features

To this point, we've only examined quantitative features.  Here, we follow an example where we can incorporate some qualitative features into our analysis.  In our dataset below, we have four variables that are qualitative:

    Gender, Student, Married, Ethnicity
    
We begin by considering the relationship between `Gender` and `Balance`.

In [26]:
credit = pd.read_csv('data/credit.csv', index_col = 'Unnamed: 0')

In [27]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 1 to 400
Data columns (total 11 columns):
Income       400 non-null float64
Limit        400 non-null int64
Rating       400 non-null int64
Cards        400 non-null int64
Age          400 non-null int64
Education    400 non-null int64
Gender       400 non-null object
Student      400 non-null object
Married      400 non-null object
Ethnicity    400 non-null object
Balance      400 non-null int64
dtypes: float64(1), int64(6), object(4)
memory usage: 37.5+ KB


In [30]:
credit.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [28]:
from pandas.plotting import scatter_matrix

In [29]:
scatter_matrix(credit);

<IPython.core.display.Javascript object>

In [31]:
lm = smf.ols('Balance ~ Gender', data = credit).fit()

In [32]:
lm.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,-0.002
Dependent Variable:,Balance,AIC:,6042.5268
Date:,2018-04-03 18:45,BIC:,6050.5097
No. Observations:,400,Log-Likelihood:,-3019.3
Df Model:,1,F-statistic:,0.1836
Df Residuals:,398,Prob (F-statistic):,0.669
R-squared:,0.000,Scale:,211810.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,509.8031,33.1281,15.3889,0.0000,444.6752,574.9310
Gender[T.Female],19.7331,46.0512,0.4285,0.6685,-70.8009,110.2671

0,1,2,3
Omnibus:,28.438,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.346
Skew:,0.583,Prob(JB):,0.0
Kurtosis:,2.471,Condition No.:,3.0


In [33]:
credit.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [34]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 1 to 400
Data columns (total 11 columns):
Income       400 non-null float64
Limit        400 non-null int64
Rating       400 non-null int64
Cards        400 non-null int64
Age          400 non-null int64
Education    400 non-null int64
Gender       400 non-null object
Student      400 non-null object
Married      400 non-null object
Ethnicity    400 non-null object
Balance      400 non-null int64
dtypes: float64(1), int64(6), object(4)
memory usage: 57.5+ KB


In [35]:
credit['Gender'].value_counts()

Female    207
 Male     193
Name: Gender, dtype: int64

### Interpretation and Dummy Variables

The idea above is that the equation can be understood as follows: the intercept is the average for the 0 category, and the coefficient is the difference between the two categories.  Further, the sum of the intercept and the coefficient is the average value for the 1 category.

As we've discussed, we want to introduce quantitative data to many machine learning algorithms, so we should consider adding a dummy variable for this column.  We can follow our earlier example.

In [36]:
# Build one indicator column per Gender level, named 'Gender_<level>'.
# NOTE(review): the value_counts output above lists ' Male' with a leading space,
# which is why the resulting column is 'Gender_ Male'; consider stripping
# whitespace from this column when the data is loaded.
gender_dummies = pd.get_dummies(credit.Gender, prefix='Gender')

In [37]:
gender_dummies.head()

Unnamed: 0,Gender_ Male,Gender_Female
1,1,0
2,0,1
3,1,0
4,0,1
5,1,0


In [38]:
credit['Gender_Female'] = gender_dummies['Gender_Female']

In [39]:
credit.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Gender_Female
1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0
2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1
3,104.593,7075,514,4,71,11,Male,No,No,Asian,580,0
4,148.924,9504,681,3,36,11,Female,No,No,Asian,964,1
5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,0


In [40]:
gender_model = smf.ols('Balance ~ Gender_Female', data = credit).fit()
gender_model.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,-0.002
Dependent Variable:,Balance,AIC:,6042.5268
Date:,2018-04-03 18:47,BIC:,6050.5097
No. Observations:,400,Log-Likelihood:,-3019.3
Df Model:,1,F-statistic:,0.1836
Df Residuals:,398,Prob (F-statistic):,0.669
R-squared:,0.000,Scale:,211810.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,509.8031,33.1281,15.3889,0.0000,444.6752,574.9310
Gender_Female,19.7331,46.0512,0.4285,0.6685,-70.8009,110.2671

0,1,2,3
Omnibus:,28.438,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.346
Skew:,0.583,Prob(JB):,0.0
Kurtosis:,2.471,Condition No.:,3.0


#### Problem

Using the `Credit` dataset above, add encoding to the other binary categorical variables.  Fit a basic Linear Model to one or two of these new columns against the `Balance` column.  Interpret your findings in terms of the categories.

In [41]:
student_dummies = pd.get_dummies(credit.Student)

In [42]:
student_dummies.head()

Unnamed: 0,No,Yes
1,1,0
2,0,1
3,1,0
4,1,0
5,1,0


In [43]:
married_dummies = pd.get_dummies(credit.Married)

In [44]:
married_dummies.head()

Unnamed: 0,No,Yes
1,0,1
2,0,1
3,1,0
4,1,0
5,0,1


### More than two Categories

Here, we need more than one dummy variable and will subsequently run a linear regression on both of these columns and interpret the results accordingly.  In our credit dataset, we have a three-valued column, `Ethnicity`.  From this, we will create a model where:

$$y_i = \beta_0 + \beta_1x_{i1} + \beta_2x_{i2} + \epsilon_i $$

where $x_{i1} = 1$ if the $i$th person is Asian and 0 otherwise, and similarly $x_{i2}$ for Caucasian.  Again, this assignment is arbitrary.  However, we can interpret the model as:

- $\beta_0 + \beta_1 + \epsilon_i$: if $i$th person is Asian
- $\beta_0 + \beta_2 + \epsilon_i$: if $i$th person is Caucasian
- $\beta_0 +\epsilon_i$: if $i$th person is African American

In [45]:
credit['Ethnicity'].value_counts()

Caucasian           199
Asian               102
African American     99
Name: Ethnicity, dtype: int64

In [46]:
ethn_dummies = pd.get_dummies(credit.Ethnicity)

In [47]:
ethn_dummies.head()

Unnamed: 0,African American,Asian,Caucasian
1,0,0,1
2,0,1,0
3,0,1,0
4,0,1,0
5,0,0,1


In [48]:
credit['ethn_asian'] = ethn_dummies['Asian']
credit['ethn_cauc'] = ethn_dummies['Caucasian']

In [49]:
lin_tre = smf.ols('Balance ~ ethn_asian + ethn_cauc', data = credit).fit()

In [50]:
lin_tre.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,-0.005
Dependent Variable:,Balance,AIC:,6044.6238
Date:,2018-04-03 18:57,BIC:,6056.5982
No. Observations:,400,Log-Likelihood:,-3019.3
Df Model:,2,F-statistic:,0.04344
Df Residuals:,397,Prob (F-statistic):,0.957
R-squared:,0.000,Scale:,212400.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,531.0000,46.3187,11.4641,0.0000,439.9394,622.0606
ethn_asian,-18.6863,65.0211,-0.2874,0.7740,-146.5149,109.1424
ethn_cauc,-12.5025,56.6810,-0.2206,0.8255,-123.9350,98.9300

0,1,2,3
Omnibus:,28.829,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.395
Skew:,0.581,Prob(JB):,0.0
Kurtosis:,2.46,Condition No.:,4.0


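We interpret these results as saying that the average balance for African Americans is \$531.00, the Asian category averages \$18.69 less than this, and the Caucasian category averages \$12.50 less than the African American category.  Note, however, that the large p-values on both coefficients (0.77 and 0.83) mean these differences are not statistically significant.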

### Problem

Examine a multiple regression model on the `Credit` dataset provided after appropriately coding all categorical variables and dealing with any missing values.  Make a single markdown cell containing a scatterplot and the fitted line and the RMSE. (to save a plot you can type `plt.savefig()` and pass a filename for saving the image, subsequently displaying it in a markdown cell with `![](path/to/image.png)`)

### Polynomial Regression

We have seen what the relationship between these variables looks like when modeled as a straight line, but could a polynomial shape do better?  Let's first consider the simple polynomial case.

In [52]:
mpg = pd.read_csv('data/mtcars.csv')

In [53]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
Unnamed: 0    32 non-null object
mpg           32 non-null float64
cyl           32 non-null int64
disp          32 non-null float64
hp            32 non-null int64
drat          32 non-null float64
wt            32 non-null float64
qsec          32 non-null float64
vs            32 non-null int64
am            32 non-null int64
gear          32 non-null int64
carb          32 non-null int64
dtypes: float64(5), int64(6), object(1)
memory usage: 3.1+ KB


In [54]:
plt.figure()
plt.scatter(mpg['hp'], mpg['mpg'])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1221cca90>

In [55]:
lin = np.polyfit(mpg['hp'], mpg['mpg'], 1)
lin_p = np.poly1d(lin)

x = mpg['hp'].sort_values()
plt.plot(x, lin_p(x), label = 'Linear')

[<matplotlib.lines.Line2D at 0x120f2e550>]

In [56]:
quad = np.polyfit(mpg['hp'], mpg['mpg'], 2)
quad_p = np.poly1d(quad)

plt.plot(x, quad_p(x), label = 'Quadratic')

[<matplotlib.lines.Line2D at 0x121fee2b0>]

In [57]:
# Fit a degree-14 polynomial of mpg on hp and overlay it on the current figure.
# NOTE(review): the data has only 32 rows (see mpg.info() above), so this is
# presumably a deliberate overfitting illustration; confirm that is the intent.
many = np.polyfit(mpg['hp'], mpg['mpg'], 14)
big_p = np.poly1d(many)

# `x` holds the sorted hp values from the linear-fit cell, so the curve is drawn left to right.
plt.plot(x, big_p(x), label = 'Degree 14')
plt.legend(frameon = False)

<matplotlib.legend.Legend at 0x1220045c0>

**Determining Shape**


One way to look at whether there is a quadratic relationship between variables is to examine the graph of the residuals.  Below, we construct residual plots for the linear and quadratic case that include a fitted line.  Note the lack of pattern in the quadratic fit.

In [58]:
# Residual plots for mpg regressed on hp, matching the polynomial fits above
# (hp is the predictor, mpg the response). The x/y roles are passed by keyword:
# current seaborn releases reject positional data arguments, and naming them
# prevents the predictor and response from being swapped.
plt.figure(figsize = (10, 5))
plt.subplot(1, 2, 1)
plt.title("Residuals: linear fit")
sns.residplot(x = mpg['hp'], y = mpg['mpg'], lowess = True)

plt.subplot(1, 2, 2)
plt.title("Residuals: quadratic fit")
sns.residplot(x = mpg['hp'], y = mpg['mpg'], order = 2, lowess = True)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x121c11390>

### More than One Polynomial Feature

While a polynomial in 2-Dimensions looks like

$$ y = a_0 + a_1x + a_2x^2 + ... + a_nx^n $$

A quadratic polynomial in 3-Dimensions could look something like:

$$ f(x, y) = ax^2 + bx + cy^2 + dy + exy  + f$$

Note the existence of the $exy$ term, where the variables $x$ and $y$ interact.  We can see something like this in our advertising data.  Let's first create a new column that combines the TV and radio columns through multiplication.  We can consider this in a 2D plot against sales.

In [59]:
ads['TVradio'] = ads.TV * ads.radio

In [60]:
ads.head()

Unnamed: 0,TV,radio,newspaper,sales,TVradio
1,230.1,37.8,69.2,22.1,8697.78
2,44.5,39.3,45.1,10.4,1748.85
3,17.2,45.9,69.3,9.3,789.48
4,151.5,41.3,58.5,18.5,6256.95
5,180.8,10.8,58.4,12.9,1952.64


In [61]:
plt.figure()
plt.scatter(ads['TVradio'], ads['sales'])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1212a4a90>

In [62]:
quad = np.polyfit(ads.TVradio, ads.sales, 2)

In [63]:
quad_p = np.poly1d(quad)

In [64]:
x = ads.TVradio.sort_values()

In [65]:
plt.plot(x, quad_p(x), color = 'red', linewidth = 5)

[<matplotlib.lines.Line2D at 0x1213ea550>]

We want to include the individual terms that make up the interaction term in our original model.  Thus, we will need a 3D quadratic polynomial for our model in the advertising data.  The smoothest way I know to accomplish this is to use the `PolynomialFeatures` class from scikit-learn.  Below, we create an instance of `PolynomialFeatures`, create a single object containing the input variables, and transform these values with the `.fit_transform()` method.

In [68]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias = False)

In [69]:
X = ads[['TV', 'radio']]

In [70]:
X_poly = poly_features.fit_transform(X)

In [71]:
X_poly[0]

array([2.301000e+02, 3.780000e+01, 5.294601e+04, 8.697780e+03,
       1.428840e+03])

In [72]:
from sklearn.linear_model import LinearRegression

In [73]:
lin_reg = LinearRegression()

In [74]:
lin_reg.fit(X_poly, ads.sales)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [75]:
lin_reg.intercept_

5.194441866943253

In [76]:
lin_reg.predict(X_poly)[:10]

array([21.72951482, 10.45593622,  8.49764066, 18.5367012 , 13.2309278 ,
        7.82970007, 10.8608992 , 12.860567  ,  5.70082255, 11.62749287])

In [77]:
lin_reg.score(X_poly, ads.sales)

0.986039101078374

### Pipelines and Higher Degree Fits

We could also use a higher-order polynomial.  Here we examine a degree 3 polynomial with the `Pipeline` approach, combining the two operations together.  We will see much more of pipelines moving forward.

In [78]:
# Chain a degree-3 polynomial expansion into a linear regression as one estimator.
# PolynomialFeatures is left at its default, which includes a constant (bias)
# column; the regression's own intercept is switched off, presumably so the
# constant term is not fitted twice.
model = Pipeline([('poly', PolynomialFeatures(3)),
                 ('linear', LinearRegression(fit_intercept= False))])

In [79]:
X = ads[['TV', 'radio']]
y = ads['sales']

In [80]:
model = model.fit(X, y)

In [81]:
model.score(X, y)

0.9911667563818458

In [82]:
ads.plot(x = 'TV', y = 'sales', kind = 'scatter')
plt.scatter(ads['TV'], y = model.predict(X), color = 'red', alpha = 0.2 )

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1222a9128>

In [83]:
from sklearn.metrics import mean_squared_error

In [85]:
mse = mean_squared_error(y, model.predict(X))

In [86]:
rmse = np.sqrt(mse)

In [87]:
rmse

0.48913696765082887

In [88]:
from sklearn.tree import DecisionTreeRegressor

In [89]:
# Fit a regression tree with default settings to the current X and y.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X, y)
# Predictions are made on the same rows used for fitting, so the error computed
# here is training error; the 0.0 shown in the following cells is not an
# estimate of out-of-sample performance.
tree_predictions = tree_reg.predict(X)
mse = mean_squared_error(y, tree_predictions)
rmse = np.sqrt(mse)

In [90]:
mse

0.0

In [91]:
rmse

0.0

### Problem

Investigate the use of `PolynomialFeatures` on the `Credit` dataset.  Does a cubic polynomial significantly improve performance?

In [92]:
credit.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Gender_Female,ethn_asian,ethn_cauc
1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0,0,1
2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1,1,0
3,104.593,7075,514,4,71,11,Male,No,No,Asian,580,0,1,0
4,148.924,9504,681,3,36,11,Female,No,No,Asian,964,1,1,0
5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,0,0,1


In [93]:
X = credit[['Limit', 'Rating', 'Education']]

In [96]:
y = credit['Balance']
lm = LinearRegression()

In [97]:
lm.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [98]:
lm.score(X, y)

0.7462397184698877

In [100]:
# scikit-learn metrics take (y_true, y_pred); pass the arguments in that order,
# as the earlier RMSE cells do. The value is unchanged because squared error is symmetric.
mse = mean_squared_error(y, lm.predict(X))

In [101]:
np.sqrt(mse)

231.31212565766234